HIP: Heterogeneous-computing Interface for Portability
hip_cooperative_groups_helper.h
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H

#if __cplusplus
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_device_functions.h>
#endif
#if !defined(__align__)
#define __align__(x) __attribute__((aligned(x)))
#endif

#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
#pragma clang diagnostic ignored "-Wc++98-compat"
#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
#pragma clang diagnostic ignored "-Wshorten-64-to-32"
#endif

#if !defined(__CG_QUALIFIER__)
#define __CG_QUALIFIER__ __device__ __forceinline__
#endif

#if !defined(__CG_STATIC_QUALIFIER__)
#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
#endif

#if !defined(_CG_STATIC_CONST_DECL_)
#define _CG_STATIC_CONST_DECL_ static constexpr
#endif

#if __AMDGCN_WAVEFRONT_SIZE == 32
using lane_mask = unsigned int;
#else
using lane_mask = unsigned long long int;
#endif

namespace cooperative_groups {

/* Global scope */
template <unsigned int size>
using is_power_of_2 = std::integral_constant<bool, (size & (size - 1)) == 0>;

template <unsigned int size>
using is_valid_wavefront = std::integral_constant<bool, (size <= __AMDGCN_WAVEFRONT_SIZE)>;

template <unsigned int size>
using is_valid_tile_size =
    std::integral_constant<bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;

template <typename T>
using is_valid_type =
    std::integral_constant<bool, std::is_integral<T>::value || std::is_floating_point<T>::value>;
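
// Illustrative sketch (not part of this header): these traits are meant to be
// evaluated at compile time, e.g. to reject tile sizes that are not a power of
// two or that exceed the wavefront size. A hypothetical check could look like:
//
//   template <unsigned int tile_size>
//   __device__ void require_valid_tile() {
//     static_assert(cooperative_groups::is_valid_tile_size<tile_size>::value,
//                   "tile size must be a power of 2 no larger than the wavefront size");
//   }
//
// With a 64-wide wavefront, sizes 1, 2, 4, 8, 16, 32 and 64 pass; 3 or 128 do not.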

namespace internal {

typedef enum {
  cg_invalid,
  cg_multi_grid,
  cg_grid,
  cg_workgroup,
  cg_tiled_group,
  cg_coalesced_group
} group_type;

namespace multi_grid {

__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
  return static_cast<uint32_t>(__ockl_multi_grid_num_grids()); }

__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
  return static_cast<uint32_t>(__ockl_multi_grid_grid_rank()); }

__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast<uint32_t>(__ockl_multi_grid_size()); }

__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
  return static_cast<uint32_t>(__ockl_multi_grid_thread_rank()); }

__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_multi_grid_is_valid()); }

__CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }

} // namespace multi_grid
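
// Usage sketch (illustrative, not part of this header): these wrappers around
// the __ockl_multi_grid_* device intrinsics back the public multi-grid group.
// They are normally reached through the cooperative-groups API rather than
// called directly, and they are only meaningful for kernels launched as a
// multi-device cooperative launch; the kernel below is a hypothetical example.
//
//   #include <hip/hip_cooperative_groups.h>
//   namespace cg = cooperative_groups;
//
//   __global__ void multi_grid_kernel() {
//     cg::multi_grid_group mg = cg::this_multi_grid();
//     if (mg.is_valid()) {
//       // ... produce per-grid partial results ...
//       mg.sync();  // all grids on all participating devices reach this point
//       // ... safely consume results written by other grids ...
//     }
//   }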

namespace grid {

__CG_STATIC_QUALIFIER__ uint32_t size() {
  return static_cast<uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
                               (blockDim.x * gridDim.x));
}

__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
  // Compute the global id of the workgroup to which the current thread belongs
  uint32_t blkIdx = static_cast<uint32_t>((blockIdx.z * gridDim.y * gridDim.x) +
                                          (blockIdx.y * gridDim.x) + (blockIdx.x));

  // Compute the total number of threads in the workgroups that precede the
  // current workgroup within the grid
  uint32_t num_threads_till_current_workgroup =
      static_cast<uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));

  // Compute the thread's local rank within the current workgroup
  uint32_t local_thread_rank = static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
                                                     (threadIdx.y * blockDim.x) + (threadIdx.x));

  return (num_threads_till_current_workgroup + local_thread_rank);
}
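
// Worked example of the computation above (illustrative numbers): for a launch
// with gridDim = (4, 2, 1) and blockDim = (64, 1, 1), the thread at
// blockIdx = (2, 1, 0), threadIdx = (5, 0, 0) gets
//   blkIdx                             = 0*2*4 + 1*4 + 2 = 6
//   num_threads_till_current_workgroup = 6 * 64          = 384
//   local_thread_rank                  = 5
//   thread_rank()                      = 384 + 5         = 389
// i.e. workgroups are ranked x-fastest, as are threads within a workgroup.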

__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_grid_is_valid()); }

__CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }

} // namespace grid

namespace workgroup {

__CG_STATIC_QUALIFIER__ dim3 group_index() {
  return (dim3(static_cast<uint32_t>(blockIdx.x), static_cast<uint32_t>(blockIdx.y),
               static_cast<uint32_t>(blockIdx.z)));
}

__CG_STATIC_QUALIFIER__ dim3 thread_index() {
  return (dim3(static_cast<uint32_t>(threadIdx.x), static_cast<uint32_t>(threadIdx.y),
               static_cast<uint32_t>(threadIdx.z)));
}

__CG_STATIC_QUALIFIER__ uint32_t size() {
  return (static_cast<uint32_t>(blockDim.x * blockDim.y * blockDim.z));
}

__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
  return (static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
                                (threadIdx.y * blockDim.x) + (threadIdx.x)));
}

__CG_STATIC_QUALIFIER__ bool is_valid() {
  return true;
}

__CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }

__CG_STATIC_QUALIFIER__ dim3 block_dim() {
  return (dim3(static_cast<uint32_t>(blockDim.x), static_cast<uint32_t>(blockDim.y),
               static_cast<uint32_t>(blockDim.z)));
}

} // namespace workgroup
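
// Usage sketch (illustrative, not part of this header): the workgroup helpers
// mirror the public thread_block group, so in a kernel the two views agree.
//
//   #include <hip/hip_cooperative_groups.h>
//   namespace cg = cooperative_groups;
//
//   __global__ void block_kernel() {
//     cg::thread_block tb = cg::this_thread_block();
//     // tb.thread_rank() and tb.size() correspond to the
//     // internal::workgroup::thread_rank() and internal::workgroup::size()
//     // helpers above; tb.sync() is a __syncthreads() barrier.
//     tb.sync();
//   }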

namespace tiled_group {

// Enforce ordering for memory instructions
__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }

} // namespace tiled_group

namespace coalesced_group {

// Enforce ordering for memory instructions
__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }

// Masked bit count
//
// For each thread, this function returns the number of preceding lanes (lanes
// with an index lower than the current thread's) whose bit is set in x, plus add.
__CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
  unsigned int counter = 0;
#if __AMDGCN_WAVEFRONT_SIZE == 32
  counter = __builtin_amdgcn_mbcnt_lo(x, add);
#else
  counter = __builtin_amdgcn_mbcnt_lo(static_cast<lane_mask>(x), add);
  counter = __builtin_amdgcn_mbcnt_hi(static_cast<lane_mask>(x >> 32), counter);
#endif

  return counter;
}
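
// Usage sketch (illustrative, not part of this header): masked_bit_count() is
// the building block for ranking threads inside a coalesced (active-lane)
// group. Given the mask of currently active lanes, each active thread counts
// how many active lanes precede it, which serves as its rank within the group.
// The __ballot(1) call below is a hypothetical stand-in for however the active
// mask is actually obtained.
//
//   __device__ unsigned int coalesced_rank_sketch() {
//     lane_mask active = __ballot(1);  // bit i set if lane i is active
//     return cooperative_groups::internal::coalesced_group::masked_bit_count(active);
//   }
//
// For an active mask of 0b1011, the active lanes 0, 1 and 3 obtain ranks
// 0, 1 and 2 respectively.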

} // namespace coalesced_group


} // namespace internal

} // namespace cooperative_groups

#if defined(__clang__)
#pragma clang diagnostic pop
#endif

#endif // __cplusplus
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H