From a48905ec9cfe0e017cc64943195be82b530117d7 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 17 Sep 2024 03:14:56 +0100 Subject: [PATCH 1/4] Add scaffolding for SPIR-V support. --- hipamd/src/hip_fatbin.cpp | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/hipamd/src/hip_fatbin.cpp b/hipamd/src/hip_fatbin.cpp index b26377135e..f9ffdc8c3a 100644 --- a/hipamd/src/hip_fatbin.cpp +++ b/hipamd/src/hip_fatbin.cpp @@ -242,6 +242,9 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vectordevices()[0]->isa().isaName(); unique_isa_names.insert({device_name, std::make_pair(0,0)}); } + unique_isa_names.emplace(std::piecewise_construct, + std::forward_as_tuple("spirv64-amd-amdhsa--amdgcnspirv"), + std::forward_as_tuple(0u, 0u)); // Create a query list using COMGR info for unique ISAs. std::vector query_list_array; @@ -263,29 +266,35 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vectorsecond = std::pair - (static_cast(item.size), - static_cast(item.offset)); + unique_it->second = std::make_pair(item.size, item.offset); } for (auto device : devices) { std::string device_name = device->devices()[0]->isa().isaName(); auto dev_it = unique_isa_names.find(device_name); - // If the size is 0, then COMGR API could not find the CO for this GPU device/ISA - if (dev_it->second.first == 0) { + guarantee(unique_isa_names.cend() != dev_it, + "Cannot find the device name in the unique device name"); + if (dev_it->second.first != 0) { + fatbin_dev_info_[device->deviceId()] + = new FatBinaryDeviceInfo(reinterpret_cast
(const_cast(image_)) + + dev_it->second.second, dev_it->second.first, + dev_it->second.second); + } else if (unique_isa_names["spirv64-amd-amdhsa--amdgcnspirv"].first) { + LogPrintfError("SPIR-V support is not yet available, even though it " + "is needed for the bundle %s, which does not contain " + "requested ISA: %s", + fname_.c_str(), device_name.c_str()); + hip_status = hipErrorNoBinaryForGpu; + ListAllDeviceWithNoCOFromBundle(unique_isa_names); + break; + } else { LogPrintfError("Cannot find CO in the bundle %s for ISA: %s", fname_.c_str(), device_name.c_str()); hip_status = hipErrorNoBinaryForGpu; ListAllDeviceWithNoCOFromBundle(unique_isa_names); break; } - guarantee(unique_isa_names.cend() != dev_it, - "Cannot find the device name in the unique device name"); - fatbin_dev_info_[device->deviceId()] - = new FatBinaryDeviceInfo(reinterpret_cast
(const_cast(image_)) - + dev_it->second.second, dev_it->second.first, - dev_it->second.second); + fatbin_dev_info_[device->deviceId()]->program_ = new amd::Program(*(device->asContext())); } From 5e40dd746ac4f8c93b521ef048ff9d494905ba95 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 6 Dec 2024 22:46:05 +0000 Subject: [PATCH 2/4] Revert stale change. --- hipamd/src/hip_fatbin.cpp | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/hipamd/src/hip_fatbin.cpp b/hipamd/src/hip_fatbin.cpp index ae23d4a668..c434403726 100644 --- a/hipamd/src/hip_fatbin.cpp +++ b/hipamd/src/hip_fatbin.cpp @@ -250,9 +250,6 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vectordevices()[0]->isa().isaName(); unique_isa_names.insert({device_name, std::make_pair(0,0)}); } - unique_isa_names.emplace(std::piecewise_construct, - std::forward_as_tuple("spirv64-amd-amdhsa--amdgcnspirv"), - std::forward_as_tuple(0u, 0u)); // Create a query list using COMGR info for unique ISAs. std::vector query_list_array; @@ -274,35 +271,29 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vectorsecond = std::make_pair(item.size, item.offset); + guarantee(unique_isa_names.cend() != unique_it, "Cannot find unique isa "); + unique_it->second = std::pair + (static_cast(item.size), + static_cast(item.offset)); } for (auto device : devices) { std::string device_name = device->devices()[0]->isa().isaName(); auto dev_it = unique_isa_names.find(device_name); - guarantee(unique_isa_names.cend() != dev_it, - "Cannot find the device name in the unique device name"); - if (dev_it->second.first != 0) { - fatbin_dev_info_[device->deviceId()] - = new FatBinaryDeviceInfo(reinterpret_cast
(const_cast(image_)) - + dev_it->second.second, dev_it->second.first, - dev_it->second.second); - } else if (unique_isa_names["spirv64-amd-amdhsa--amdgcnspirv"].first) { - LogPrintfError("SPIR-V support is not yet available, even though it " - "is needed for the bundle %s, which does not contain " - "requested ISA: %s", - fname_.c_str(), device_name.c_str()); - hip_status = hipErrorNoBinaryForGpu; - ListAllDeviceWithNoCOFromBundle(unique_isa_names); - break; - } else { + // If the size is 0, then COMGR API could not find the CO for this GPU device/ISA + if (dev_it->second.first == 0) { LogPrintfError("Cannot find CO in the bundle %s for ISA: %s", fname_.c_str(), device_name.c_str()); hip_status = hipErrorNoBinaryForGpu; ListAllDeviceWithNoCOFromBundle(unique_isa_names); break; } - + guarantee(unique_isa_names.cend() != dev_it, + "Cannot find the device name in the unique device name"); + fatbin_dev_info_[device->deviceId()] + = new FatBinaryDeviceInfo(reinterpret_cast
(const_cast(image_)) + + dev_it->second.second, dev_it->second.first, + dev_it->second.second); fatbin_dev_info_[device->deviceId()]->program_ = new amd::Program(*(device->asContext())); } From a72307a7353034c2de53fd164e016967945fd0d1 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 6 Dec 2024 23:12:14 +0000 Subject: [PATCH 3/4] Prepare HIP RT for SPIR-V. --- hipamd/include/hip/amd_detail/amd_hip_bf16.h | 16 ++++----- .../amd_detail/amd_hip_cooperative_groups.h | 34 +++++++++---------- .../hip/amd_detail/amd_warp_functions.h | 15 ++++++-- .../hip/amd_detail/amd_warp_sync_functions.h | 8 ++--- .../hip_cooperative_groups_helper.h | 20 ++++++----- 5 files changed, 54 insertions(+), 39 deletions(-) diff --git a/hipamd/include/hip/amd_detail/amd_hip_bf16.h b/hipamd/include/hip/amd_detail/amd_hip_bf16.h index d89a4bb0ce..51e5aca284 100644 --- a/hipamd/include/hip/amd_detail/amd_hip_bf16.h +++ b/hipamd/include/hip/amd_detail/amd_hip_bf16.h @@ -689,7 +689,7 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned s __BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_down_sync(const unsigned long long mask, const __hip_bfloat16 in, const unsigned int delta, - const int width = __AMDGCN_WAVEFRONT_SIZE) { + const int width = warpSize) { return __ushort_as_bfloat16(__shfl_down_sync(mask, __bfloat16_as_ushort(in), delta, width)); } @@ -700,7 +700,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_down_sync(const unsigned long long __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_down_sync(const unsigned long long mask, const __hip_bfloat162 in, const unsigned int delta, - const int width = __AMDGCN_WAVEFRONT_SIZE) { + const int width = warpSize) { static_assert(sizeof(__hip_bfloat162) == sizeof(unsigned int)); union { __hip_bfloat162 bf162; @@ -716,7 +716,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_down_sync(const unsigned long long */ __BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_sync(const unsigned long long mask, const __hip_bfloat16 in, const int delta, - const int width = __AMDGCN_WAVEFRONT_SIZE) { + const int width = warpSize) { return __ushort_as_bfloat16(__shfl_sync(mask, __bfloat16_as_ushort(in), delta, width)); } @@ -726,7 +726,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_sync(const unsigned long long mask, */ __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_sync(const unsigned long long mask, const __hip_bfloat162 in, const int delta, - const int width = __AMDGCN_WAVEFRONT_SIZE) { + const int width = warpSize) { static_assert(sizeof(__hip_bfloat162) == sizeof(unsigned int)); union { __hip_bfloat162 bf162; @@ -743,7 +743,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_sync(const unsigned long long mask __BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_up_sync(const unsigned long long mask, const __hip_bfloat16 in, const unsigned int delta, - const int width = __AMDGCN_WAVEFRONT_SIZE) { + const int width = warpSize) { return __ushort_as_bfloat16(__shfl_up_sync(mask, __bfloat16_as_ushort(in), delta, width)); } @@ -754,7 +754,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_up_sync(const unsigned long long ma __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_up_sync(const unsigned long long mask, const __hip_bfloat162 in, const unsigned int delta, - const int width = __AMDGCN_WAVEFRONT_SIZE) { + const int width = warpSize) { static_assert(sizeof(__hip_bfloat162) == sizeof(unsigned int)); union { __hip_bfloat162 bf162; @@ -770,7 +770,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_up_sync(const unsigned long long m */ __BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_xor_sync(const unsigned long long mask, const __hip_bfloat16 in, const int delta, - const int width = __AMDGCN_WAVEFRONT_SIZE) { + const int width = warpSize) { return __ushort_as_bfloat16(__shfl_xor_sync(mask, __bfloat16_as_ushort(in), delta, width)); } @@ -780,7 +780,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_xor_sync(const unsigned long long m */ __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_xor_sync(const unsigned long long mask, const __hip_bfloat162 in, const int delta, - const int width = __AMDGCN_WAVEFRONT_SIZE) { + const int width = warpSize) { static_assert(sizeof(__hip_bfloat162) == sizeof(unsigned int)); union { __hip_bfloat162 bf162; diff --git a/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h b/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h index 9b23e2d2d9..8b0dd5bb26 100644 --- a/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h +++ b/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h @@ -215,7 +215,7 @@ class thread_block : public thread_group { __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const { const bool pow2 = ((tile_size & (tile_size - 1)) == 0); // Invalid tile size, assert - if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) { + if (!tile_size || (tile_size > warpSize) || !pow2) { __hip_assert(false && "invalid tile size"); } @@ -276,7 +276,7 @@ class tiled_group : public thread_group { __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const { const bool pow2 = ((tile_size & (tile_size - 1)) == 0); - if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) { + if (!tile_size || (tile_size > warpSize) || !pow2) { __hip_assert(false && "invalid tile size"); } @@ -339,7 +339,7 @@ class coalesced_group : public thread_group { if (coalesced_info.tiled_info.is_tiled) { unsigned int base_offset = (thread_rank() & (~(tile_size - 1))); unsigned int masklength = min(static_cast(size()) - base_offset, tile_size); - lane_mask member_mask = static_cast(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength); + lane_mask member_mask = static_cast(-1) >> (warpSize - masklength); member_mask <<= (__lane_id() & ~(tile_size - 1)); coalesced_group coalesced_tile = coalesced_group(member_mask); @@ -354,7 +354,7 @@ class coalesced_group : public thread_group { unsigned int tile_rank = 0; int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size; - for (unsigned int i = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) { + for (unsigned int i = 0; i < warpSize; i++) { lane_mask active = coalesced_info.member_mask & (1 << i); // Make sure the lane is active if (active) { @@ -413,11 +413,11 @@ class coalesced_group : public thread_group { srcRank = srcRank % static_cast(size()); - int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank - : (__AMDGCN_WAVEFRONT_SIZE == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1)) - : __fns32(coalesced_info.member_mask, 0, (srcRank + 1)); + int lane = (size() == warpSize) ? srcRank + : (warpSize == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1)) + : __fns32(coalesced_info.member_mask, 0, (srcRank + 1)); - return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE); + return __shfl(var, lane, warpSize); } template @@ -428,12 +428,12 @@ class coalesced_group : public thread_group { // and WARP_SIZE as the shift value rather than lane_delta itself. // This is not described in the documentation and is not done here. - if (size() == __AMDGCN_WAVEFRONT_SIZE) { - return __shfl_down(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE); + if (size() == warpSize) { + return __shfl_down(var, lane_delta, warpSize); } int lane; - if (__AMDGCN_WAVEFRONT_SIZE == 64) { + if (warpSize == 64) { lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1); } else { @@ -444,7 +444,7 @@ class coalesced_group : public thread_group { lane = __lane_id(); } - return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE); + return __shfl(var, lane, warpSize); } template @@ -455,15 +455,15 @@ class coalesced_group : public thread_group { // and WARP_SIZE as the shift value rather than lane_delta itself. // This is not described in the documentation and is not done here. - if (size() == __AMDGCN_WAVEFRONT_SIZE) { - return __shfl_up(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE); + if (size() == warpSize) { + return __shfl_up(var, lane_delta, warpSize); } int lane; - if (__AMDGCN_WAVEFRONT_SIZE == 64) { + if (warpSize == 64) { lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1)); } - else if (__AMDGCN_WAVEFRONT_SIZE == 32) { + else if (warpSize == 32) { lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1)); } @@ -471,7 +471,7 @@ class coalesced_group : public thread_group { lane = __lane_id(); } - return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE); + return __shfl(var, lane, warpSize); } #if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS) __CG_QUALIFIER__ unsigned long long ballot(int pred) const { diff --git a/hipamd/include/hip/amd_detail/amd_warp_functions.h b/hipamd/include/hip/amd_detail/amd_warp_functions.h index 0fae22fd8a..d8d22ea0be 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_functions.h @@ -83,8 +83,18 @@ __device__ static inline int __hip_move_dpp_N(int src) { bound_ctrl); } -__device__ -static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE; +#if defined(__SPIRV__) + inline __device__ const struct final { + __device__ + __attribute__((always_inline, const)) + operator int() const noexcept { + return __builtin_amdgcn_wavefrontsize(); + } + } warpSize{}; +#else + __device__ + static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE; +#endif // warp vote function __all __any __ballot __device__ @@ -123,6 +133,7 @@ unsigned long long __activemask() { #endif // HIP_DISABLE_WARP_SYNC_BUILTINS __device__ static inline unsigned int __lane_id() { + if (warpSize == 32) return __builtin_amdgcn_mbcnt_lo(-1, 0); return __builtin_amdgcn_mbcnt_hi( -1, __builtin_amdgcn_mbcnt_lo(-1, 0)); } diff --git a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h index c531bb3dc6..8328b9d98b 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -229,7 +229,7 @@ unsigned long long __match_all_sync(MaskT mask, T value, int* pred) { template __device__ inline T __shfl_sync(MaskT mask, T var, int srcLane, - int width = __AMDGCN_WAVEFRONT_SIZE) { + int width = warpSize) { static_assert( __hip_internal::is_integral::value && sizeof(MaskT) == 8, "The mask must be a 64-bit integer. " @@ -242,7 +242,7 @@ T __shfl_sync(MaskT mask, T var, int srcLane, template __device__ inline T __shfl_up_sync(MaskT mask, T var, unsigned int delta, - int width = __AMDGCN_WAVEFRONT_SIZE) { + int width = warpSize) { static_assert( __hip_internal::is_integral::value && sizeof(MaskT) == 8, "The mask must be a 64-bit integer. " @@ -255,7 +255,7 @@ T __shfl_up_sync(MaskT mask, T var, unsigned int delta, template __device__ inline T __shfl_down_sync(MaskT mask, T var, unsigned int delta, - int width = __AMDGCN_WAVEFRONT_SIZE) { + int width = warpSize) { static_assert( __hip_internal::is_integral::value && sizeof(MaskT) == 8, "The mask must be a 64-bit integer. " @@ -268,7 +268,7 @@ T __shfl_down_sync(MaskT mask, T var, unsigned int delta, template __device__ inline T __shfl_xor_sync(MaskT mask, T var, int laneMask, - int width = __AMDGCN_WAVEFRONT_SIZE) { + int width = warpSize) { static_assert( __hip_internal::is_integral::value && sizeof(MaskT) == 8, "The mask must be a 64-bit integer. " diff --git a/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h b/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h index 95379bcaef..995c8e5949 100644 --- a/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h +++ b/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h @@ -52,6 +52,10 @@ THE SOFTWARE. #define _CG_STATIC_CONST_DECL_ static constexpr #endif +#if defined(__SPIRV__) && !defined(__AMDGCN_WAVEFRONT_SIZE) +#error "TEMPORARY LIMITATION: when targeting AMDGCN SPIR-V" + "__AMDGCN_WAVEFRONT_SIZE is not defined, and must be defined by the user" +#endif #if __AMDGCN_WAVEFRONT_SIZE == 32 using lane_mask = unsigned int; #else @@ -94,7 +98,7 @@ typedef enum { * @ingroup CooperativeG * @{ * This section describes the cooperative groups functions of HIP runtime API. - * + * * The cooperative groups provides flexible thread parallel programming algorithms, threads * cooperate and share data to perform collective computations. * @@ -118,7 +122,7 @@ namespace helper { __CG_STATIC_QUALIFIER__ unsigned long long adjust_mask( unsigned long long base_mask, unsigned long long input_mask) { unsigned long long out = 0; - for (unsigned int i = 0, index = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) { + for (unsigned int i = 0, index = 0; i < warpSize; i++) { auto lane_active = base_mask & (1ull << i); if (lane_active) { auto result = input_mask & (1ull << i); @@ -245,14 +249,14 @@ __CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, " // have i-th bit of x set and come before the current thread. __CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) { unsigned int counter=0; - #if __AMDGCN_WAVEFRONT_SIZE == 32 + if (warpSize == 32) { counter = __builtin_amdgcn_mbcnt_lo(x, add); - #else - counter = __builtin_amdgcn_mbcnt_lo(static_cast(x), add); - counter = __builtin_amdgcn_mbcnt_hi(static_cast(x >> 32), counter); - #endif + } else { + counter = __builtin_amdgcn_mbcnt_lo(static_cast(x), add); + counter = __builtin_amdgcn_mbcnt_hi(static_cast(x >> 32), counter); + } - return counter; + return counter; } } // namespace coalesced_group From 6840826c3fec8516857dc4f2092d84358550f588 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 6 Dec 2024 23:36:32 +0000 Subject: [PATCH 4/4] Add deprecation warning for constexpr uses of `warpSize`. --- hipamd/include/hip/amd_detail/amd_warp_functions.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hipamd/include/hip/amd_detail/amd_warp_functions.h b/hipamd/include/hip/amd_detail/amd_warp_functions.h index d8d22ea0be..143ea234fb 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_functions.h @@ -92,6 +92,10 @@ __device__ static inline int __hip_move_dpp_N(int src) { } } warpSize{}; #else + #if defined(__HIP_DEVICE_COMPILE__) + [[deprecated("Using warpSize as a compile time constant is deprecated, " + "and will no longer be possible in future releases.")]] + #endif __device__ static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE; #endif