// amd_hip_runtime.h
// HIP: Heterogeneous-computing Interface for Portability
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

28//#pragma once
29#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
30#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
31
32#include <hip/amd_detail/amd_hip_common.h>
33
34#ifdef __cplusplus
35extern "C" {
36#endif
37
46const char* amd_dbgapi_get_build_name();
47
55const char* amd_dbgapi_get_git_hash();
56
64size_t amd_dbgapi_get_build_id();
65
66#ifdef __cplusplus
67} /* extern "c" */
68#endif
69
//---
// Top part of file can be compiled with any compiler

#if !defined(__HIPCC_RTC__)
//#include <cstring>
#if __cplusplus
#include <cmath>
#include <cstdint>
#else
#include <math.h>
#include <string.h>
#include <stddef.h>
#endif // __cplusplus
#else
// hipRTC compiles without the standard headers, so provide the minimal
// fixed-width integer types (and their std:: aliases) this header relies on.
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef signed int int32_t;
typedef signed long long int64_t;
namespace std {
using ::uint32_t;
using ::uint64_t;
using ::int32_t;
using ::int64_t;
}
#endif // !defined(__HIPCC_RTC__)

96#if __HIP_CLANG_ONLY__
97
98#if !defined(__align__)
99#define __align__(x) __attribute__((aligned(x)))
100#endif
101
102#define CUDA_SUCCESS hipSuccess
103
104#if !defined(__HIPCC_RTC__)
105#include <hip/hip_runtime_api.h>
106extern int HIP_TRACE_API;
107#endif // !defined(__HIPCC_RTC__)
108
109#ifdef __cplusplus
110#include <hip/amd_detail/hip_ldg.h>
111#endif
112#include <hip/amd_detail/amd_hip_atomic.h>
114#include <hip/amd_detail/amd_device_functions.h>
115#include <hip/amd_detail/amd_surface_functions.h>
116#include <hip/amd_detail/texture_fetch_functions.h>
117#include <hip/amd_detail/texture_indirect_functions.h>
118
119// TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
120#if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
121#define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
122#endif
123
// Feature tests:
// __HIP_ARCH_HAS_* flags describe what the device compilation target supports;
// they are only defined during device-side compilation.
#if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
// Device compile and not host compile:

// 32-bit Atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)

// 64-bit Atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (1)

// Doubles
#define __HIP_ARCH_HAS_DOUBLES__ (1)

// warp cross-lane operations:
#define __HIP_ARCH_HAS_WARP_VOTE__ (1)
#define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)

// sync
#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)

// misc
#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
#define __HIP_ARCH_HAS_3DGRID__ (1)
#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)

#endif /* Device feature flags */

159
160#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
161 __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
162#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
163 __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
164 amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
165#define select_impl_(_1, _2, impl_, ...) impl_
166#define __launch_bounds__(...) \
167 select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0, )(__VA_ARGS__)
168
169#if !defined(__HIPCC_RTC__)
170__host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
171#endif // !defined(__HIPCC_RTC__)
172
173// End doxygen API:
178//
179// hip-clang functions
180//
181#if !defined(__HIPCC_RTC__)
182#define HIP_KERNEL_NAME(...) __VA_ARGS__
183#define HIP_SYMBOL(X) X
184
185typedef int hipLaunchParm;
186
// pArgs: recursively packs the address of each element of `formals` into the
// `_vargs` array, producing the void** argument list hipLaunchKernel expects.
// Base case: all sizeof...(Ts) elements have been processed; nothing to do.
template <std::size_t n, typename... Ts,
          typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>&, void*) {}

// Recursive case: record the address of element n, then recurse on n + 1.
template <std::size_t n, typename... Ts,
          typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>& formals, void** _vargs) {
    using T = typename std::tuple_element<n, std::tuple<Ts...> >::type;

    // Kernel arguments are copied by value into the launch packet, so a
    // reference parameter can never be honored.
    static_assert(!std::is_reference<T>{},
                  "A __global__ function cannot have a reference as one of its "
                  "arguments.");
#if defined(HIP_STRICT)
    static_assert(std::is_trivially_copyable<T>{},
                  "Only TriviallyCopyable types can be arguments to a __global__ "
                  "function");
#endif
    _vargs[n] = const_cast<void*>(reinterpret_cast<const void*>(&std::get<n>(formals)));
    return pArgs<n + 1>(formals, _vargs);
}

// Validates at compile time that the actual launch arguments match the
// kernel's formal parameter list in count, and converts the actuals tuple to
// the formal parameter types (triggering a compile error on any mismatch).
template <typename... Formals, typename... Actuals>
std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...), std::tuple<Actuals...>(actuals)) {
    static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
    std::tuple<Formals...> to_formals{std::move(actuals)};
    return to_formals;
}

215#if defined(HIP_TEMPLATE_KERNEL_LAUNCH)
216template <typename... Args, typename F = void (*)(Args...)>
217void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
218 std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
219 constexpr size_t count = sizeof...(Args);
220 auto tup_ = std::tuple<Args...>{args...};
221 auto tup = validateArgsCountType(kernel, tup_);
222 void* _Args[count];
223 pArgs<0>(tup, _Args);
224
225 auto k = reinterpret_cast<void*>(kernel);
226 hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
227}
228#else
229#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \
230 do { \
231 kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__); \
232 } while (0)
233
234#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
235#endif
236
237#include <hip/hip_runtime_api.h>
238#endif // !defined(__HIPCC_RTC__)
239
240extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
241extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
242extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
243extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(uint);
244struct __HIP_BlockIdx {
245 __device__
246 std::uint32_t operator()(std::uint32_t x) const noexcept { return __ockl_get_group_id(x); }
247};
248struct __HIP_BlockDim {
249 __device__
250 std::uint32_t operator()(std::uint32_t x) const noexcept {
251 return __ockl_get_local_size(x);
252 }
253};
254struct __HIP_GridDim {
255 __device__
256 std::uint32_t operator()(std::uint32_t x) const noexcept {
257 return __ockl_get_num_groups(x);
258 }
259};
260struct __HIP_ThreadIdx {
261 __device__
262 std::uint32_t operator()(std::uint32_t x) const noexcept {
263 return __ockl_get_local_id(x);
264 }
265};
266
#if defined(__HIPCC_RTC__)
// Minimal dim3 for hipRTC compilations, where the host runtime headers that
// normally provide it are unavailable. Components default to 1.
typedef struct dim3 {
    uint32_t x;
    uint32_t y;
    uint32_t z;
#ifdef __cplusplus
    constexpr __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
#endif
} dim3;
#endif // defined(__HIPCC_RTC__)  -- fixed: comment previously said !defined, contradicting the guard
278template <typename F>
279struct __HIP_Coordinates {
280 using R = decltype(F{}(0));
281
282 struct __X {
283 __device__ operator R() const noexcept { return F{}(0); }
284 __device__ R operator+=(const R& rhs) { return F{}(0) + rhs; }
285 };
286 struct __Y {
287 __device__ operator R() const noexcept { return F{}(1); }
288 __device__ R operator+=(const R& rhs) { return F{}(1) + rhs; }
289 };
290 struct __Z {
291 __device__ operator R() const noexcept { return F{}(2); }
292 __device__ R operator+=(const R& rhs) { return F{}(2) + rhs; }
293 };
294
295 static constexpr __X x{};
296 static constexpr __Y y{};
297 static constexpr __Z z{};
298#ifdef __cplusplus
299 __device__ operator dim3() const { return dim3(x, y, z); }
300#endif
301
302};
303template <typename F>
304#if !defined(_MSC_VER)
305__attribute__((weak))
306#endif
307constexpr typename __HIP_Coordinates<F>::__X __HIP_Coordinates<F>::x;
308template <typename F>
309#if !defined(_MSC_VER)
310__attribute__((weak))
311#endif
312constexpr typename __HIP_Coordinates<F>::__Y __HIP_Coordinates<F>::y;
313template <typename F>
314#if !defined(_MSC_VER)
315__attribute__((weak))
316#endif
317constexpr typename __HIP_Coordinates<F>::__Z __HIP_Coordinates<F>::z;
318
319extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_size(uint);
320inline
321__device__
322std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__X,
323 __HIP_Coordinates<__HIP_BlockDim>::__X) noexcept {
324 return __ockl_get_global_size(0);
325}
326inline
327__device__
328std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__X,
329 __HIP_Coordinates<__HIP_GridDim>::__X) noexcept {
330 return __ockl_get_global_size(0);
331}
332inline
333__device__
334std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Y,
335 __HIP_Coordinates<__HIP_BlockDim>::__Y) noexcept {
336 return __ockl_get_global_size(1);
337}
338inline
339__device__
340std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Y,
341 __HIP_Coordinates<__HIP_GridDim>::__Y) noexcept {
342 return __ockl_get_global_size(1);
343}
344inline
345__device__
346std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Z,
347 __HIP_Coordinates<__HIP_BlockDim>::__Z) noexcept {
348 return __ockl_get_global_size(2);
349}
350inline
351__device__
352std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Z,
353 __HIP_Coordinates<__HIP_GridDim>::__Z) noexcept {
354 return __ockl_get_global_size(2);
355}
356
357static constexpr __HIP_Coordinates<__HIP_BlockDim> blockDim{};
358static constexpr __HIP_Coordinates<__HIP_BlockIdx> blockIdx{};
359static constexpr __HIP_Coordinates<__HIP_GridDim> gridDim{};
360static constexpr __HIP_Coordinates<__HIP_ThreadIdx> threadIdx{};
361
362extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
363#define hipThreadIdx_x (__ockl_get_local_id(0))
364#define hipThreadIdx_y (__ockl_get_local_id(1))
365#define hipThreadIdx_z (__ockl_get_local_id(2))
366
367extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
368#define hipBlockIdx_x (__ockl_get_group_id(0))
369#define hipBlockIdx_y (__ockl_get_group_id(1))
370#define hipBlockIdx_z (__ockl_get_group_id(2))
371
372extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
373#define hipBlockDim_x (__ockl_get_local_size(0))
374#define hipBlockDim_y (__ockl_get_local_size(1))
375#define hipBlockDim_z (__ockl_get_local_size(2))
376
377extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(uint);
378#define hipGridDim_x (__ockl_get_num_groups(0))
379#define hipGridDim_y (__ockl_get_num_groups(1))
380#define hipGridDim_z (__ockl_get_num_groups(2))
381
382#include <hip/amd_detail/amd_math_functions.h>
383
384#if __HIP_HCC_COMPAT_MODE__
385// Define HCC work item functions in terms of HIP builtin variables.
386#pragma push_macro("__DEFINE_HCC_FUNC")
387#define __DEFINE_HCC_FUNC(hc_fun,hip_var) \
388inline __device__ __attribute__((always_inline)) uint hc_get_##hc_fun(uint i) { \
389 if (i==0) \
390 return hip_var.x; \
391 else if(i==1) \
392 return hip_var.y; \
393 else \
394 return hip_var.z; \
395}
396
397__DEFINE_HCC_FUNC(workitem_id, threadIdx)
398__DEFINE_HCC_FUNC(group_id, blockIdx)
399__DEFINE_HCC_FUNC(group_size, blockDim)
400__DEFINE_HCC_FUNC(num_groups, gridDim)
401#pragma pop_macro("__DEFINE_HCC_FUNC")
402
403extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(uint);
404inline __device__ __attribute__((always_inline)) uint
405hc_get_workitem_absolute_id(int dim)
406{
407 return (uint)__ockl_get_global_id(dim);
408}
409
410#endif
411
412#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
413#if !defined(__HIPCC_RTC__)
414// Support std::complex.
415#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
416#pragma push_macro("__CUDA__")
417#define __CUDA__
418#include <__clang_cuda_math_forward_declares.h>
419#include <__clang_cuda_complex_builtins.h>
420// Workaround for using libc++ with HIP-Clang.
421// The following headers requires clang include path before standard C++ include path.
422// However libc++ include path requires to be before clang include path.
423// To workaround this, we pass -isystem with the parent directory of clang include
424// path instead of the clang include path itself.
425#include <include/cuda_wrappers/algorithm>
426#include <include/cuda_wrappers/complex>
427#include <include/cuda_wrappers/new>
428#undef __CUDA__
429#pragma pop_macro("__CUDA__")
430#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
431#endif // !defined(__HIPCC_RTC__)
432#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
433#endif // __HIP_CLANG_ONLY__
434
435#endif // HIP_AMD_DETAIL_RUNTIME_H
// NOTE: __host__ itself is defined in hip/amd_detail/host_defines.h.