23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
27#pragma clang diagnostic push
28#pragma clang diagnostic ignored "-Wreserved-identifier"
29#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
30#pragma clang diagnostic ignored "-Wsign-conversion"
31#pragma clang diagnostic ignored "-Wold-style-cast"
32#pragma clang diagnostic ignored "-Wc++98-compat"
33#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
36__device__
static inline unsigned __hip_ds_bpermute(
int index,
unsigned src) {
37 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
38 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
42__device__
static inline float __hip_ds_bpermutef(
int index,
float src) {
43 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
44 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
48__device__
static inline unsigned __hip_ds_permute(
int index,
unsigned src) {
49 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
50 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
54__device__
static inline float __hip_ds_permutef(
int index,
float src) {
55 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
56 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
60#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
61#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
64__device__
static inline unsigned __hip_ds_swizzle_N(
unsigned int src) {
65 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
66 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
71__device__
static inline float __hip_ds_swizzlef_N(
float src) {
72 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
73 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
77#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
78 __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
80template <
int dpp_ctrl,
int row_mask,
int bank_mask,
bool bound_ctrl>
81__device__
static inline int __hip_move_dpp_N(
int src) {
82 return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
86static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
90int __shfl(
int var,
int src_lane,
int width = warpSize) {
91 int self = __lane_id();
92 int index = (src_lane & (width - 1)) + (self & ~(width-1));
93 return __builtin_amdgcn_ds_bpermute(index<<2, var);
97unsigned int __shfl(
unsigned int var,
int src_lane,
int width = warpSize) {
98 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
99 tmp.i = __shfl(tmp.i, src_lane, width);
104float __shfl(
float var,
int src_lane,
int width = warpSize) {
105 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
106 tmp.i = __shfl(tmp.i, src_lane, width);
111double __shfl(
double var,
int src_lane,
int width = warpSize) {
112 static_assert(
sizeof(double) == 2 *
sizeof(
int),
"");
113 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
115 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
116 tmp[0] = __shfl(tmp[0], src_lane, width);
117 tmp[1] = __shfl(tmp[1], src_lane, width);
119 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
120 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
125long __shfl(
long var,
int src_lane,
int width = warpSize)
128 static_assert(
sizeof(long) == 2 *
sizeof(
int),
"");
129 static_assert(
sizeof(long) ==
sizeof(uint64_t),
"");
131 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
132 tmp[0] = __shfl(tmp[0], src_lane, width);
133 tmp[1] = __shfl(tmp[1], src_lane, width);
135 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
136 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
139 static_assert(
sizeof(long) ==
sizeof(
int),
"");
140 return static_cast<long>(__shfl(
static_cast<int>(var), src_lane, width));
145unsigned long __shfl(
unsigned long var,
int src_lane,
int width = warpSize) {
147 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
148 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
150 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
151 tmp[0] = __shfl(tmp[0], src_lane, width);
152 tmp[1] = __shfl(tmp[1], src_lane, width);
154 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
155 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
158 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
159 return static_cast<unsigned long>(__shfl(
static_cast<unsigned int>(var), src_lane, width));
164long long __shfl(
long long var,
int src_lane,
int width = warpSize)
166 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
167 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
169 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
170 tmp[0] = __shfl(tmp[0], src_lane, width);
171 tmp[1] = __shfl(tmp[1], src_lane, width);
173 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
174 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
179unsigned long long __shfl(
unsigned long long var,
int src_lane,
int width = warpSize) {
180 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
181 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
183 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
184 tmp[0] = __shfl(tmp[0], src_lane, width);
185 tmp[1] = __shfl(tmp[1], src_lane, width);
187 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
188 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
194int __shfl_up(
int var,
unsigned int lane_delta,
int width = warpSize) {
195 int self = __lane_id();
196 int index = self - lane_delta;
197 index = (index < (self & ~(width-1)))?self:index;
198 return __builtin_amdgcn_ds_bpermute(index<<2, var);
202unsigned int __shfl_up(
unsigned int var,
unsigned int lane_delta,
int width = warpSize) {
203 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
204 tmp.i = __shfl_up(tmp.i, lane_delta, width);
209float __shfl_up(
float var,
unsigned int lane_delta,
int width = warpSize) {
210 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
211 tmp.i = __shfl_up(tmp.i, lane_delta, width);
216double __shfl_up(
double var,
unsigned int lane_delta,
int width = warpSize) {
217 static_assert(
sizeof(double) == 2 *
sizeof(
int),
"");
218 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
220 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
221 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
222 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
224 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
225 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
230long __shfl_up(
long var,
unsigned int lane_delta,
int width = warpSize)
233 static_assert(
sizeof(long) == 2 *
sizeof(
int),
"");
234 static_assert(
sizeof(long) ==
sizeof(uint64_t),
"");
236 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
237 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
238 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
240 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
241 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
244 static_assert(
sizeof(long) ==
sizeof(
int),
"");
245 return static_cast<long>(__shfl_up(
static_cast<int>(var), lane_delta, width));
251unsigned long __shfl_up(
unsigned long var,
unsigned int lane_delta,
int width = warpSize)
254 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
255 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
257 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
258 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
259 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
261 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
262 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
265 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
266 return static_cast<unsigned long>(__shfl_up(
static_cast<unsigned int>(var), lane_delta, width));
272long long __shfl_up(
long long var,
unsigned int lane_delta,
int width = warpSize)
274 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
275 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
276 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
277 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
278 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
279 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
280 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
286unsigned long long __shfl_up(
unsigned long long var,
unsigned int lane_delta,
int width = warpSize)
288 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
289 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
290 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
291 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
292 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
293 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
294 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
300int __shfl_down(
int var,
unsigned int lane_delta,
int width = warpSize) {
301 int self = __lane_id();
302 int index = self + lane_delta;
303 index = (int)((self&(width-1))+lane_delta) >= width?self:index;
304 return __builtin_amdgcn_ds_bpermute(index<<2, var);
308unsigned int __shfl_down(
unsigned int var,
unsigned int lane_delta,
int width = warpSize) {
309 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
310 tmp.i = __shfl_down(tmp.i, lane_delta, width);
315float __shfl_down(
float var,
unsigned int lane_delta,
int width = warpSize) {
316 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
317 tmp.i = __shfl_down(tmp.i, lane_delta, width);
322double __shfl_down(
double var,
unsigned int lane_delta,
int width = warpSize) {
323 static_assert(
sizeof(double) == 2 *
sizeof(
int),
"");
324 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
326 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
327 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
328 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
330 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
331 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
336long __shfl_down(
long var,
unsigned int lane_delta,
int width = warpSize)
339 static_assert(
sizeof(long) == 2 *
sizeof(
int),
"");
340 static_assert(
sizeof(long) ==
sizeof(uint64_t),
"");
342 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
343 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
344 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
346 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
347 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
350 static_assert(
sizeof(long) ==
sizeof(
int),
"");
351 return static_cast<long>(__shfl_down(
static_cast<int>(var), lane_delta, width));
356unsigned long __shfl_down(
unsigned long var,
unsigned int lane_delta,
int width = warpSize)
359 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
360 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
362 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
363 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
364 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
366 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
367 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
370 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
371 return static_cast<unsigned long>(__shfl_down(
static_cast<unsigned int>(var), lane_delta, width));
376long long __shfl_down(
long long var,
unsigned int lane_delta,
int width = warpSize)
378 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
379 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
380 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
381 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
382 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
383 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
384 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
389unsigned long long __shfl_down(
unsigned long long var,
unsigned int lane_delta,
int width = warpSize)
391 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
392 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
393 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
394 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
395 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
396 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
397 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
403int __shfl_xor(
int var,
int lane_mask,
int width = warpSize) {
404 int self = __lane_id();
405 int index = self^lane_mask;
406 index = index >= ((self+width)&~(width-1))?self:index;
407 return __builtin_amdgcn_ds_bpermute(index<<2, var);
411unsigned int __shfl_xor(
unsigned int var,
int lane_mask,
int width = warpSize) {
412 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
413 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
418float __shfl_xor(
float var,
int lane_mask,
int width = warpSize) {
419 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
420 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
425double __shfl_xor(
double var,
int lane_mask,
int width = warpSize) {
426 static_assert(
sizeof(double) == 2 *
sizeof(
int),
"");
427 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
429 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
430 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
431 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
433 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
434 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
439long __shfl_xor(
long var,
int lane_mask,
int width = warpSize)
442 static_assert(
sizeof(long) == 2 *
sizeof(
int),
"");
443 static_assert(
sizeof(long) ==
sizeof(uint64_t),
"");
445 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
446 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
447 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
449 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
450 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
453 static_assert(
sizeof(long) ==
sizeof(
int),
"");
454 return static_cast<long>(__shfl_xor(
static_cast<int>(var), lane_mask, width));
459unsigned long __shfl_xor(
unsigned long var,
int lane_mask,
int width = warpSize)
462 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
463 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
465 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
466 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
467 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
469 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
470 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
473 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
474 return static_cast<unsigned long>(__shfl_xor(
static_cast<unsigned int>(var), lane_mask, width));
479long long __shfl_xor(
long long var,
int lane_mask,
int width = warpSize)
481 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
482 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
483 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
484 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
485 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
486 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
487 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
492unsigned long long __shfl_xor(
unsigned long long var,
int lane_mask,
int width = warpSize)
494 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
495 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
496 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
497 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
498 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
499 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
500 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
504#if defined(__clang__)
505#pragma clang diagnostic pop