HIP: Heterogeneous-computing Interface for Portability
amd_hip_unsafe_atomics.h
/*
Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#pragma once

#ifdef __cplusplus

#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
#endif

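// Unsafe FP32 atomic add: returns the value previously stored at addr. On
// gfx90a this maps to the hardware LDS/global FP32 fadd instructions (private
// addresses are updated non-atomically, which is safe because they are
// per-thread); elsewhere it falls back to a relaxed, agent-scope atomic add.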
__device__ inline float unsafeAtomicAdd(float* addr, float value) {
#if defined(__gfx90a__) && \
    __has_builtin(__builtin_amdgcn_is_shared) && \
    __has_builtin(__builtin_amdgcn_is_private) && \
    __has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) && \
    __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32)
  if (__builtin_amdgcn_is_shared(
          (const __attribute__((address_space(0))) void*)addr))
    return __builtin_amdgcn_ds_atomic_fadd_f32(addr, value);
  else if (__builtin_amdgcn_is_private(
               (const __attribute__((address_space(0))) void*)addr)) {
    float temp = *addr;
    *addr = temp + value;
    return temp;
  }
  else
    return __builtin_amdgcn_global_atomic_fadd_f32(addr, value);
#elif __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}

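// Unsafe FP32 atomic max/min: a relaxed, agent-scope CAS loop that retries
// only while the stored value would actually change, returning the last
// value it observed at addr.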
__device__ inline float unsafeAtomicMax(float* addr, float val) {
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
}

__device__ inline float unsafeAtomicMin(float* addr, float val) {
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
}

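// Unsafe FP64 atomic add: on gfx90a this maps to the hardware flat FP64 fadd
// instruction; elsewhere it falls back to a relaxed, agent-scope atomic add.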
__device__ inline double unsafeAtomicAdd(double* addr, double value) {
#if defined(__gfx90a__) && __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64)
  return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value);
#elif __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}

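// Unsafe FP64 atomic max/min: on gfx90a and gfx940/941/942 these map to the
// hardware flat FP64 fmax/fmin instructions; elsewhere they fall back to a
// relaxed CAS loop.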
__device__ inline double unsafeAtomicMax(double* addr, double val) {
#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64)
  return __builtin_amdgcn_flat_atomic_fmax_f64(addr, val);
#else
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
#endif
}

__device__ inline double unsafeAtomicMin(double* addr, double val) {
#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64)
  return __builtin_amdgcn_flat_atomic_fmin_f64(addr, val);
#else
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
#endif
}

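// Safe FP32 atomic add: forces a CAS-loop emulation on architectures whose
// hardware FP32 atomic add is not IEEE-compliant or not guaranteed to be
// atomic (see the per-architecture notes below); otherwise it is a relaxed
// atomic add.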
__device__ inline float safeAtomicAdd(float* addr, float value) {
#if defined(__gfx908__) || defined(__gfx941__) \
    || ((defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx942__)) \
        && !__has_builtin(__hip_atomic_fetch_add))
  // On gfx908, we can generate unsafe FP32 atomic add that does not follow all
  // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
  // On gfx941, we can generate unsafe FP32 atomic add that may not always happen atomically,
  // so we need to force a CAS loop emulation to ensure safety.
  // On gfx90a, gfx940 and gfx942, if we do not have the __hip_atomic_fetch_add builtin, we
  // need to force a CAS loop here.
  float old_val;
#if __has_builtin(__hip_atomic_load)
  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_load)
  old_val = __uint_as_float(__atomic_load_n(reinterpret_cast<unsigned int*>(addr), __ATOMIC_RELAXED));
#endif // __has_builtin(__hip_atomic_load)
  float expected, temp;
  do {
    temp = expected = old_val;
#if __has_builtin(__hip_atomic_compare_exchange_strong)
    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
    old_val = expected;
  } while (__float_as_uint(temp) != __float_as_uint(old_val));
  return old_val;
#elif defined(__gfx90a__)
  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
  // atomics will produce safe CAS loops, but are otherwise not different than
  // agent-scope atomics. This logic is only applicable for gfx90a, and should
  // not be assumed on other architectures.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#elif __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}

__device__ inline float safeAtomicMax(float* addr, float val) {
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
}

__device__ inline float safeAtomicMin(float* addr, float val) {
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
}

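// Safe FP64 atomic add: on gfx90a a relaxed system-scope atomic (or, when the
// builtins are unavailable, an explicit CAS loop) is used so that a safe CAS
// sequence is emitted instead of the unsafe hardware FP64 add; elsewhere it is
// a relaxed, agent-scope atomic add.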
__device__ inline double safeAtomicAdd(double* addr, double value) {
#if defined(__gfx90a__) && __has_builtin(__hip_atomic_fetch_add)
  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
  // atomics will produce safe CAS loops, but are otherwise not different than
  // agent-scope atomics. This logic is only applicable for gfx90a, and should
  // not be assumed on other architectures.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#elif defined(__gfx90a__)
  // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to
  // force a CAS loop here.
  double old_val;
#if __has_builtin(__hip_atomic_load)
  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_load)
  old_val = __longlong_as_double(__atomic_load_n(reinterpret_cast<unsigned long long*>(addr), __ATOMIC_RELAXED));
#endif // __has_builtin(__hip_atomic_load)
  double expected, temp;
  do {
    temp = expected = old_val;
#if __has_builtin(__hip_atomic_compare_exchange_strong)
    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
    old_val = expected;
  } while (__double_as_longlong(temp) != __double_as_longlong(old_val));
  return old_val;
#else // !defined(__gfx90a__)
#if __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_fetch_add)
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_fetch_add)
#endif
}

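// Safe FP64 atomic max/min: private (per-thread) addresses are updated
// directly, since no other thread can observe them; all other addresses use
// a relaxed, agent-scope CAS loop.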
__device__ inline double safeAtomicMax(double* addr, double val) {
  #if __has_builtin(__builtin_amdgcn_is_private)
  if (__builtin_amdgcn_is_private(
          (const __attribute__((address_space(0))) void*)addr)) {
    double old = *addr;
    *addr = __builtin_fmax(old, val);
    return old;
  } else {
  #endif
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
  #if __has_builtin(__builtin_amdgcn_is_private)
  }
  #endif
}

__device__ inline double safeAtomicMin(double* addr, double val) {
  #if __has_builtin(__builtin_amdgcn_is_private)
  if (__builtin_amdgcn_is_private(
          (const __attribute__((address_space(0))) void*)addr)) {
    double old = *addr;
    *addr = __builtin_fmin(old, val);
    return old;
  } else {
  #endif
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
  #if __has_builtin(__builtin_amdgcn_is_private)
  }
  #endif
}

#if defined(__clang__)
#pragma clang diagnostic pop
#endif

#endif
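
A minimal usage sketch (not part of the header): the kernel below is
illustrative only and assumes these functions are available through the HIP
runtime headers on AMD targets; the kernel and buffer names are hypothetical.

#include <hip/hip_runtime.h>

// Each thread folds its input element into a shared running sum and maximum,
// using the IEEE-conforming safe variants.
__global__ void reduce_sum_max(float* sum, float* maxVal, const float* in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    safeAtomicAdd(sum, in[i]);     // CAS-emulated or hardware atomic FP32 add
    safeAtomicMax(maxVal, in[i]);  // relaxed CAS-loop atomic max
  }
}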