HIP: Heterogenous-computing Interface for Portability
Loading...
Searching...
No Matches
amd_warp_functions.h
1/*
2Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
25
26__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
27 union { int i; unsigned u; float f; } tmp; tmp.u = src;
28 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
29 return tmp.u;
30}
31
32__device__ static inline float __hip_ds_bpermutef(int index, float src) {
33 union { int i; unsigned u; float f; } tmp; tmp.f = src;
34 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
35 return tmp.f;
36}
37
38__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
39 union { int i; unsigned u; float f; } tmp; tmp.u = src;
40 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
41 return tmp.u;
42}
43
44__device__ static inline float __hip_ds_permutef(int index, float src) {
45 union { int i; unsigned u; float f; } tmp; tmp.f = src;
46 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
47 return tmp.f;
48}
49
50#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
51#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
52
53template <int pattern>
54__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
55 union { int i; unsigned u; float f; } tmp; tmp.u = src;
56 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
57 return tmp.u;
58}
59
60template <int pattern>
61__device__ static inline float __hip_ds_swizzlef_N(float src) {
62 union { int i; unsigned u; float f; } tmp; tmp.f = src;
63 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
64 return tmp.f;
65}
66
67#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
68 __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
69
70template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
71__device__ static inline int __hip_move_dpp_N(int src) {
72 return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
73 bound_ctrl);
74}
75
76static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
77
78// warp vote function __all __any __ballot
79__device__
80inline
81int __all(int predicate) {
82 return __ockl_wfall_i32(predicate);
83}
84
85__device__
86inline
87int __any(int predicate) {
88 return __ockl_wfany_i32(predicate);
89}
90
91// XXX from llvm/include/llvm/IR/InstrTypes.h
92#define ICMP_NE 33
93
94__device__
95inline
96unsigned long long int __ballot(int predicate) {
97 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
98}
99
100__device__
101inline
102unsigned long long int __ballot64(int predicate) {
103 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
104}
105
106// See amd_warp_sync_functions.h for an explanation of this preprocessor flag.
107#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
108// Since threads in a wave do not make independent progress, __activemask()
109// always returns the exact active mask, i.e, all active threads in the wave.
110__device__
111inline
112unsigned long long __activemask() {
113 return __ballot(true);
114}
115#endif // HIP_ENABLE_WARP_SYNC_BUILTINS
116
117__device__ static inline unsigned int __lane_id() {
118 return __builtin_amdgcn_mbcnt_hi(
119 -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
120}
121
122__device__
123inline
124int __shfl(int var, int src_lane, int width = warpSize) {
125 int self = __lane_id();
126 int index = (src_lane & (width - 1)) + (self & ~(width-1));
127 return __builtin_amdgcn_ds_bpermute(index<<2, var);
128}
129__device__
130inline
131unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
132 union { int i; unsigned u; float f; } tmp; tmp.u = var;
133 tmp.i = __shfl(tmp.i, src_lane, width);
134 return tmp.u;
135}
136__device__
137inline
138float __shfl(float var, int src_lane, int width = warpSize) {
139 union { int i; unsigned u; float f; } tmp; tmp.f = var;
140 tmp.i = __shfl(tmp.i, src_lane, width);
141 return tmp.f;
142}
143__device__
144inline
145double __shfl(double var, int src_lane, int width = warpSize) {
146 static_assert(sizeof(double) == 2 * sizeof(int), "");
147 static_assert(sizeof(double) == sizeof(uint64_t), "");
148
149 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
150 tmp[0] = __shfl(tmp[0], src_lane, width);
151 tmp[1] = __shfl(tmp[1], src_lane, width);
152
153 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
154 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
155 return tmp1;
156}
157__device__
158inline
159long __shfl(long var, int src_lane, int width = warpSize)
160{
161 #ifndef _MSC_VER
162 static_assert(sizeof(long) == 2 * sizeof(int), "");
163 static_assert(sizeof(long) == sizeof(uint64_t), "");
164
165 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
166 tmp[0] = __shfl(tmp[0], src_lane, width);
167 tmp[1] = __shfl(tmp[1], src_lane, width);
168
169 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
170 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
171 return tmp1;
172 #else
173 static_assert(sizeof(long) == sizeof(int), "");
174 return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
175 #endif
176}
177__device__
178inline
179unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
180 #ifndef _MSC_VER
181 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
182 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
183
184 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
185 tmp[0] = __shfl(tmp[0], src_lane, width);
186 tmp[1] = __shfl(tmp[1], src_lane, width);
187
188 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
189 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
190 return tmp1;
191 #else
192 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
193 return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
194 #endif
195}
196__device__
197inline
198long long __shfl(long long var, int src_lane, int width = warpSize)
199{
200 static_assert(sizeof(long long) == 2 * sizeof(int), "");
201 static_assert(sizeof(long long) == sizeof(uint64_t), "");
202
203 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
204 tmp[0] = __shfl(tmp[0], src_lane, width);
205 tmp[1] = __shfl(tmp[1], src_lane, width);
206
207 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
208 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
209 return tmp1;
210}
211__device__
212inline
213unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
214 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
215 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
216
217 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
218 tmp[0] = __shfl(tmp[0], src_lane, width);
219 tmp[1] = __shfl(tmp[1], src_lane, width);
220
221 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
222 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
223 return tmp1;
224}
225
226__device__
227inline
228int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
229 int self = __lane_id();
230 int index = self - lane_delta;
231 index = (index < (self & ~(width-1)))?self:index;
232 return __builtin_amdgcn_ds_bpermute(index<<2, var);
233}
234__device__
235inline
236unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
237 union { int i; unsigned u; float f; } tmp; tmp.u = var;
238 tmp.i = __shfl_up(tmp.i, lane_delta, width);
239 return tmp.u;
240}
241__device__
242inline
243float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
244 union { int i; unsigned u; float f; } tmp; tmp.f = var;
245 tmp.i = __shfl_up(tmp.i, lane_delta, width);
246 return tmp.f;
247}
248__device__
249inline
250double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
251 static_assert(sizeof(double) == 2 * sizeof(int), "");
252 static_assert(sizeof(double) == sizeof(uint64_t), "");
253
254 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
255 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
256 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
257
258 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
259 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
260 return tmp1;
261}
262__device__
263inline
264long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
265{
266 #ifndef _MSC_VER
267 static_assert(sizeof(long) == 2 * sizeof(int), "");
268 static_assert(sizeof(long) == sizeof(uint64_t), "");
269
270 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
271 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
272 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
273
274 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
275 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
276 return tmp1;
277 #else
278 static_assert(sizeof(long) == sizeof(int), "");
279 return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
280 #endif
281}
282
283__device__
284inline
285unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
286{
287 #ifndef _MSC_VER
288 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
289 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
290
291 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
292 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
293 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
294
295 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
296 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
297 return tmp1;
298 #else
299 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
300 return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
301 #endif
302}
303
304__device__
305inline
306long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
307{
308 static_assert(sizeof(long long) == 2 * sizeof(int), "");
309 static_assert(sizeof(long long) == sizeof(uint64_t), "");
310 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
311 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
312 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
313 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
314 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
315 return tmp1;
316}
317
318__device__
319inline
320unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
321{
322 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
323 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
324 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
325 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
326 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
327 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
328 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
329 return tmp1;
330}
331
332__device__
333inline
334int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
335 int self = __lane_id();
336 int index = self + lane_delta;
337 index = (int)((self&(width-1))+lane_delta) >= width?self:index;
338 return __builtin_amdgcn_ds_bpermute(index<<2, var);
339}
340__device__
341inline
342unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
343 union { int i; unsigned u; float f; } tmp; tmp.u = var;
344 tmp.i = __shfl_down(tmp.i, lane_delta, width);
345 return tmp.u;
346}
347__device__
348inline
349float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
350 union { int i; unsigned u; float f; } tmp; tmp.f = var;
351 tmp.i = __shfl_down(tmp.i, lane_delta, width);
352 return tmp.f;
353}
354__device__
355inline
356double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
357 static_assert(sizeof(double) == 2 * sizeof(int), "");
358 static_assert(sizeof(double) == sizeof(uint64_t), "");
359
360 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
361 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
362 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
363
364 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
365 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
366 return tmp1;
367}
368__device__
369inline
370long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
371{
372 #ifndef _MSC_VER
373 static_assert(sizeof(long) == 2 * sizeof(int), "");
374 static_assert(sizeof(long) == sizeof(uint64_t), "");
375
376 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
377 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
378 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
379
380 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
381 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
382 return tmp1;
383 #else
384 static_assert(sizeof(long) == sizeof(int), "");
385 return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
386 #endif
387}
388__device__
389inline
390unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
391{
392 #ifndef _MSC_VER
393 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
394 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
395
396 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
397 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
398 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
399
400 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
401 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
402 return tmp1;
403 #else
404 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
405 return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
406 #endif
407}
408__device__
409inline
410long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
411{
412 static_assert(sizeof(long long) == 2 * sizeof(int), "");
413 static_assert(sizeof(long long) == sizeof(uint64_t), "");
414 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
415 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
416 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
417 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
418 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
419 return tmp1;
420}
421__device__
422inline
423unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
424{
425 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
426 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
427 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
428 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
429 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
430 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
431 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
432 return tmp1;
433}
434
435__device__
436inline
437int __shfl_xor(int var, int lane_mask, int width = warpSize) {
438 int self = __lane_id();
439 int index = self^lane_mask;
440 index = index >= ((self+width)&~(width-1))?self:index;
441 return __builtin_amdgcn_ds_bpermute(index<<2, var);
442}
443__device__
444inline
445unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
446 union { int i; unsigned u; float f; } tmp; tmp.u = var;
447 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
448 return tmp.u;
449}
450__device__
451inline
452float __shfl_xor(float var, int lane_mask, int width = warpSize) {
453 union { int i; unsigned u; float f; } tmp; tmp.f = var;
454 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
455 return tmp.f;
456}
457__device__
458inline
459double __shfl_xor(double var, int lane_mask, int width = warpSize) {
460 static_assert(sizeof(double) == 2 * sizeof(int), "");
461 static_assert(sizeof(double) == sizeof(uint64_t), "");
462
463 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
464 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
465 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
466
467 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
468 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
469 return tmp1;
470}
471__device__
472inline
473long __shfl_xor(long var, int lane_mask, int width = warpSize)
474{
475 #ifndef _MSC_VER
476 static_assert(sizeof(long) == 2 * sizeof(int), "");
477 static_assert(sizeof(long) == sizeof(uint64_t), "");
478
479 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
480 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
481 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
482
483 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
484 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
485 return tmp1;
486 #else
487 static_assert(sizeof(long) == sizeof(int), "");
488 return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
489 #endif
490}
491__device__
492inline
493unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
494{
495 #ifndef _MSC_VER
496 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
497 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
498
499 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
500 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
501 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
502
503 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
504 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
505 return tmp1;
506 #else
507 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
508 return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
509 #endif
510}
511__device__
512inline
513long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
514{
515 static_assert(sizeof(long long) == 2 * sizeof(int), "");
516 static_assert(sizeof(long long) == sizeof(uint64_t), "");
517 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
518 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
519 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
520 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
521 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
522 return tmp1;
523}
524__device__
525inline
526unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
527{
528 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
529 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
530 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
531 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
532 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
533 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
534 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
535 return tmp1;
536}
537
538#endif