From a48905ec9cfe0e017cc64943195be82b530117d7 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu@amd.com>
Date: Tue, 17 Sep 2024 03:14:56 +0100
Subject: [PATCH 1/4] Add scaffolding for SPIR-V support.

---
 hipamd/src/hip_fatbin.cpp | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)
diff --git a/hipamd/src/hip_fatbin.cpp b/hipamd/src/hip_fatbin.cpp
index b26377135e..f9ffdc8c3a 100644
--- a/hipamd/src/hip_fatbin.cpp
+++ b/hipamd/src/hip_fatbin.cpp
@@ -242,6 +242,9 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vector<hip::Devi
       std::string device_name = device->devices()[0]->isa().isaName();
       unique_isa_names.insert({device_name, std::make_pair<size_t, size_t>(0,0)});
     }
+    unique_isa_names.emplace(std::piecewise_construct,
+                             std::forward_as_tuple("spirv64-amd-amdhsa--amdgcnspirv"),
+                             std::forward_as_tuple(0u, 0u));
 
     // Create a query list using COMGR info for unique ISAs.
     std::vector<amd_comgr_code_object_info_t> query_list_array;
@@ -263,29 +266,35 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vector<hip::Devi
 
     for (const auto &item : query_list_array) {
       auto unique_it = unique_isa_names.find(item.isa);
-      guarantee(unique_isa_names.cend() != unique_it, "Cannot find unique isa ");
-      unique_it->second = std::pair<size_t, size_t>
-                            (static_cast<size_t>(item.size),
-                             static_cast<size_t>(item.offset));
+      unique_it->second = std::make_pair(item.size, item.offset);
     }
 
     for (auto device : devices) {
       std::string device_name = device->devices()[0]->isa().isaName();
       auto dev_it = unique_isa_names.find(device_name);
-      // If the size is 0, then COMGR API could not find the CO for this GPU device/ISA
-      if (dev_it->second.first == 0) {
+      guarantee(unique_isa_names.cend() != dev_it,
+                "Cannot find the device name in the unique device name");
+      if (dev_it->second.first != 0) {
+        fatbin_dev_info_[device->deviceId()]
+          = new FatBinaryDeviceInfo(reinterpret_cast<address>(const_cast<void*>(image_))
+                                    + dev_it->second.second, dev_it->second.first,
+                                                             dev_it->second.second);
+      } else if (unique_isa_names["spirv64-amd-amdhsa--amdgcnspirv"].first) {
+        LogPrintfError("SPIR-V support is not yet available, even though it "
+                       "is needed for the bundle %s, which does not contain "
+                       "requested ISA: %s",
+                        fname_.c_str(), device_name.c_str());
+        hip_status = hipErrorNoBinaryForGpu;
+        ListAllDeviceWithNoCOFromBundle(unique_isa_names);
+        break;
+      } else {
         LogPrintfError("Cannot find CO in the bundle %s for ISA: %s",
                         fname_.c_str(), device_name.c_str());
         hip_status = hipErrorNoBinaryForGpu;
         ListAllDeviceWithNoCOFromBundle(unique_isa_names);
         break;
       }
-      guarantee(unique_isa_names.cend() != dev_it,
-                "Cannot find the device name in the unique device name");
-      fatbin_dev_info_[device->deviceId()]
-        = new FatBinaryDeviceInfo(reinterpret_cast<address>(const_cast<void*>(image_))
-                                  + dev_it->second.second, dev_it->second.first,
-                                                           dev_it->second.second);
+
       fatbin_dev_info_[device->deviceId()]->program_
         = new amd::Program(*(device->asContext()));
     }

From 5e40dd746ac4f8c93b521ef048ff9d494905ba95 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu@amd.com>
Date: Fri, 6 Dec 2024 22:46:05 +0000
Subject: [PATCH 2/4] Revert stale change.

---
 hipamd/src/hip_fatbin.cpp | 33 ++++++++++++---------------------
 1 file changed, 12 insertions(+), 21 deletions(-)

diff --git a/hipamd/src/hip_fatbin.cpp b/hipamd/src/hip_fatbin.cpp
index ae23d4a668..c434403726 100644
--- a/hipamd/src/hip_fatbin.cpp
+++ b/hipamd/src/hip_fatbin.cpp
@@ -250,9 +250,6 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vector<hip::Devi
       std::string device_name = device->devices()[0]->isa().isaName();
       unique_isa_names.insert({device_name, std::make_pair<size_t, size_t>(0,0)});
     }
-    unique_isa_names.emplace(std::piecewise_construct,
-                             std::forward_as_tuple("spirv64-amd-amdhsa--amdgcnspirv"),
-                             std::forward_as_tuple(0u, 0u));
 
     // Create a query list using COMGR info for unique ISAs.
     std::vector<amd_comgr_code_object_info_t> query_list_array;
@@ -274,35 +271,29 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vector<hip::Devi
 
     for (const auto &item : query_list_array) {
       auto unique_it = unique_isa_names.find(item.isa);
-      unique_it->second = std::make_pair(item.size, item.offset);
+      guarantee(unique_isa_names.cend() != unique_it, "Cannot find unique isa ");
+      unique_it->second = std::pair<size_t, size_t>
+                            (static_cast<size_t>(item.size),
+                             static_cast<size_t>(item.offset));
     }
 
     for (auto device : devices) {
       std::string device_name = device->devices()[0]->isa().isaName();
       auto dev_it = unique_isa_names.find(device_name);
-      guarantee(unique_isa_names.cend() != dev_it,
-                "Cannot find the device name in the unique device name");
-      if (dev_it->second.first != 0) {
-        fatbin_dev_info_[device->deviceId()]
-          = new FatBinaryDeviceInfo(reinterpret_cast<address>(const_cast<void*>(image_))
-                                    + dev_it->second.second, dev_it->second.first,
-                                                             dev_it->second.second);
-      } else if (unique_isa_names["spirv64-amd-amdhsa--amdgcnspirv"].first) {
-        LogPrintfError("SPIR-V support is not yet available, even though it "
-                       "is needed for the bundle %s, which does not contain "
-                       "requested ISA: %s",
-                        fname_.c_str(), device_name.c_str());
-        hip_status = hipErrorNoBinaryForGpu;
-        ListAllDeviceWithNoCOFromBundle(unique_isa_names);
-        break;
-      } else {
+      // If the size is 0, then COMGR API could not find the CO for this GPU device/ISA
+      if (dev_it->second.first == 0) {
         LogPrintfError("Cannot find CO in the bundle %s for ISA: %s",
                         fname_.c_str(), device_name.c_str());
         hip_status = hipErrorNoBinaryForGpu;
         ListAllDeviceWithNoCOFromBundle(unique_isa_names);
         break;
       }
-
+      guarantee(unique_isa_names.cend() != dev_it,
+                "Cannot find the device name in the unique device name");
+      fatbin_dev_info_[device->deviceId()]
+        = new FatBinaryDeviceInfo(reinterpret_cast<address>(const_cast<void*>(image_))
+                                  + dev_it->second.second, dev_it->second.first,
+                                                           dev_it->second.second);
       fatbin_dev_info_[device->deviceId()]->program_
         = new amd::Program(*(device->asContext()));
     }

From a72307a7353034c2de53fd164e016967945fd0d1 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu@amd.com>
Date: Fri, 6 Dec 2024 23:12:14 +0000
Subject: [PATCH 3/4] Prepare HIP RT for SPIR-V.

---
 hipamd/include/hip/amd_detail/amd_hip_bf16.h  | 16 ++++-----
 .../amd_detail/amd_hip_cooperative_groups.h   | 34 +++++++++----------
 .../hip/amd_detail/amd_warp_functions.h       | 15 ++++++--
 .../hip/amd_detail/amd_warp_sync_functions.h  |  8 ++---
 .../hip_cooperative_groups_helper.h           | 20 ++++++-----
 5 files changed, 54 insertions(+), 39 deletions(-)

diff --git a/hipamd/include/hip/amd_detail/amd_hip_bf16.h b/hipamd/include/hip/amd_detail/amd_hip_bf16.h
index d89a4bb0ce..51e5aca284 100644
--- a/hipamd/include/hip/amd_detail/amd_hip_bf16.h
+++ b/hipamd/include/hip/amd_detail/amd_hip_bf16.h
@@ -689,7 +689,7 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned s
 __BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_down_sync(const unsigned long long mask,
                                                        const __hip_bfloat16 in,
                                                        const unsigned int delta,
-                                                       const int width = __AMDGCN_WAVEFRONT_SIZE) {
+                                                       const int width = warpSize) {
   return __ushort_as_bfloat16(__shfl_down_sync(mask, __bfloat16_as_ushort(in), delta, width));
 }
 
@@ -700,7 +700,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_down_sync(const unsigned long long
 __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_down_sync(const unsigned long long mask,
                                                         const __hip_bfloat162 in,
                                                         const unsigned int delta,
-                                                        const int width = __AMDGCN_WAVEFRONT_SIZE) {
+                                                        const int width = warpSize) {
   static_assert(sizeof(__hip_bfloat162) == sizeof(unsigned int));
   union {
     __hip_bfloat162 bf162;
@@ -716,7 +716,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_down_sync(const unsigned long long
  */
 __BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_sync(const unsigned long long mask,
                                                   const __hip_bfloat16 in, const int delta,
-                                                  const int width = __AMDGCN_WAVEFRONT_SIZE) {
+                                                  const int width = warpSize) {
   return __ushort_as_bfloat16(__shfl_sync(mask, __bfloat16_as_ushort(in), delta, width));
 }
 
@@ -726,7 +726,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_sync(const unsigned long long mask,
  */
 __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_sync(const unsigned long long mask,
                                                    const __hip_bfloat162 in, const int delta,
-                                                   const int width = __AMDGCN_WAVEFRONT_SIZE) {
+                                                   const int width = warpSize) {
   static_assert(sizeof(__hip_bfloat162) == sizeof(unsigned int));
   union {
     __hip_bfloat162 bf162;
@@ -743,7 +743,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_sync(const unsigned long long mask
 __BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_up_sync(const unsigned long long mask,
                                                      const __hip_bfloat16 in,
                                                      const unsigned int delta,
-                                                     const int width = __AMDGCN_WAVEFRONT_SIZE) {
+                                                     const int width = warpSize) {
   return __ushort_as_bfloat16(__shfl_up_sync(mask, __bfloat16_as_ushort(in), delta, width));
 }
 
@@ -754,7 +754,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_up_sync(const unsigned long long ma
 __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_up_sync(const unsigned long long mask,
                                                       const __hip_bfloat162 in,
                                                       const unsigned int delta,
-                                                      const int width = __AMDGCN_WAVEFRONT_SIZE) {
+                                                      const int width = warpSize) {
   static_assert(sizeof(__hip_bfloat162) == sizeof(unsigned int));
   union {
     __hip_bfloat162 bf162;
@@ -770,7 +770,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_up_sync(const unsigned long long m
  */
 __BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_xor_sync(const unsigned long long mask,
                                                       const __hip_bfloat16 in, const int delta,
-                                                      const int width = __AMDGCN_WAVEFRONT_SIZE) {
+                                                      const int width = warpSize) {
   return __ushort_as_bfloat16(__shfl_xor_sync(mask, __bfloat16_as_ushort(in), delta, width));
 }
 
@@ -780,7 +780,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_xor_sync(const unsigned long long m
  */
 __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_xor_sync(const unsigned long long mask,
                                                        const __hip_bfloat162 in, const int delta,
-                                                       const int width = __AMDGCN_WAVEFRONT_SIZE) {
+                                                       const int width = warpSize) {
   static_assert(sizeof(__hip_bfloat162) == sizeof(unsigned int));
   union {
     __hip_bfloat162 bf162;
diff --git a/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h b/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h
index 9b23e2d2d9..8b0dd5bb26 100644
--- a/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h
+++ b/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h
@@ -215,7 +215,7 @@ class thread_block : public thread_group {
   __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
     const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
     // Invalid tile size, assert
-    if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
+    if (!tile_size || (tile_size > warpSize) || !pow2) {
       __hip_assert(false && "invalid tile size");
     }
 
@@ -276,7 +276,7 @@ class tiled_group : public thread_group {
   __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
     const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
 
-    if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
+    if (!tile_size || (tile_size > warpSize) || !pow2) {
       __hip_assert(false && "invalid tile size");
     }
 
@@ -339,7 +339,7 @@ class coalesced_group : public thread_group {
     if (coalesced_info.tiled_info.is_tiled) {
       unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
       unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
-      lane_mask member_mask = static_cast<lane_mask>(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);
+      lane_mask member_mask = static_cast<lane_mask>(-1) >> (warpSize - masklength);
 
       member_mask <<= (__lane_id() & ~(tile_size - 1));
       coalesced_group coalesced_tile = coalesced_group(member_mask);
@@ -354,7 +354,7 @@ class coalesced_group : public thread_group {
       unsigned int tile_rank = 0;
       int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;
 
-      for (unsigned int i = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) {
+      for (unsigned int i = 0; i < warpSize; i++) {
         lane_mask active = coalesced_info.member_mask & (1 << i);
         // Make sure the lane is active
         if (active) {
@@ -413,11 +413,11 @@ class coalesced_group : public thread_group {
 
     srcRank = srcRank % static_cast<int>(size());
 
-    int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
-             : (__AMDGCN_WAVEFRONT_SIZE == 64)     ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
-                                          : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
+    int lane = (size() == warpSize) ? srcRank
+             : (warpSize == 64)     ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
+                                    : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
 
-    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
+    return __shfl(var, lane, warpSize);
   }
 
   template <class T>
@@ -428,12 +428,12 @@ class coalesced_group : public thread_group {
     // and WARP_SIZE as the shift value rather than lane_delta itself.
     // This is not described in the documentation and is not done here.
 
-    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
-      return __shfl_down(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
+    if (size() == warpSize) {
+      return __shfl_down(var, lane_delta, warpSize);
     }
 
     int lane;
-    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
+    if (warpSize == 64) {
       lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
     }
     else {
@@ -444,7 +444,7 @@ class coalesced_group : public thread_group {
       lane = __lane_id();
     }
 
-    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
+    return __shfl(var, lane, warpSize);
   }
 
   template <class T>
@@ -455,15 +455,15 @@ class coalesced_group : public thread_group {
     // and WARP_SIZE as the shift value rather than lane_delta itself.
     // This is not described in the documentation and is not done here.
 
-    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
-      return __shfl_up(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
+    if (size() == warpSize) {
+      return __shfl_up(var, lane_delta, warpSize);
     }
 
     int lane;
-    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
+    if (warpSize == 64) {
       lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
     }
-    else if (__AMDGCN_WAVEFRONT_SIZE == 32) {
+    else if (warpSize == 32) {
       lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
     }
 
@@ -471,7 +471,7 @@ class coalesced_group : public thread_group {
       lane = __lane_id();
     }
 
-    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
+    return __shfl(var, lane, warpSize);
   }
 #if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
    __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
diff --git a/hipamd/include/hip/amd_detail/amd_warp_functions.h b/hipamd/include/hip/amd_detail/amd_warp_functions.h
index 0fae22fd8a..d8d22ea0be 100644
--- a/hipamd/include/hip/amd_detail/amd_warp_functions.h
+++ b/hipamd/include/hip/amd_detail/amd_warp_functions.h
@@ -83,8 +83,18 @@ __device__ static inline int __hip_move_dpp_N(int src) {
                                     bound_ctrl);
 }
 
-__device__
-static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
+#if defined(__SPIRV__)
+    inline __device__ const struct final {
+        __device__
+        __attribute__((always_inline, const))
+        operator int() const noexcept {
+            return __builtin_amdgcn_wavefrontsize();
+        }
+    } warpSize{};
+#else
+    __device__
+    static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
+#endif
 
 // warp vote function __all __any __ballot
 __device__
@@ -123,6 +133,7 @@ unsigned long long __activemask() {
 #endif // HIP_DISABLE_WARP_SYNC_BUILTINS
 
 __device__ static inline unsigned int __lane_id() {
+    if (warpSize == 32) return __builtin_amdgcn_mbcnt_lo(-1, 0);
     return  __builtin_amdgcn_mbcnt_hi(
         -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
 }
diff --git a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h
index c531bb3dc6..8328b9d98b 100644
--- a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h
+++ b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h
@@ -229,7 +229,7 @@ unsigned long long __match_all_sync(MaskT mask, T value, int* pred) {
 template <typename MaskT, typename T>
 __device__ inline
 T __shfl_sync(MaskT mask, T var, int srcLane,
-              int width = __AMDGCN_WAVEFRONT_SIZE) {
+              int width = warpSize) {
   static_assert(
       __hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
       "The mask must be a 64-bit integer. "
@@ -242,7 +242,7 @@ T __shfl_sync(MaskT mask, T var, int srcLane,
 template <typename MaskT, typename T>
 __device__ inline
 T __shfl_up_sync(MaskT mask, T var, unsigned int delta,
-                                   int width = __AMDGCN_WAVEFRONT_SIZE) {
+                                   int width = warpSize) {
   static_assert(
       __hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
       "The mask must be a 64-bit integer. "
@@ -255,7 +255,7 @@ T __shfl_up_sync(MaskT mask, T var, unsigned int delta,
 template <typename MaskT, typename T>
 __device__ inline
 T __shfl_down_sync(MaskT mask, T var, unsigned int delta,
-                                     int width = __AMDGCN_WAVEFRONT_SIZE) {
+                                     int width = warpSize) {
   static_assert(
       __hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
       "The mask must be a 64-bit integer. "
@@ -268,7 +268,7 @@ T __shfl_down_sync(MaskT mask, T var, unsigned int delta,
 template <typename MaskT, typename T>
 __device__ inline
 T __shfl_xor_sync(MaskT mask, T var, int laneMask,
-                                    int width = __AMDGCN_WAVEFRONT_SIZE) {
+                                    int width = warpSize) {
   static_assert(
       __hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
       "The mask must be a 64-bit integer. "
diff --git a/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h b/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h
index 95379bcaef..995c8e5949 100644
--- a/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h
+++ b/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h
@@ -52,6 +52,10 @@ THE SOFTWARE.
 #define _CG_STATIC_CONST_DECL_ static constexpr
 #endif
 
+#if defined(__SPIRV__) && !defined(__AMDGCN_WAVEFRONT_SIZE)
+#error "TEMPORARY LIMITATION: when targeting AMDGCN SPIR-V"
+       "__AMDGCN_WAVEFRONT_SIZE is not defined, and must be defined by the user"
+#endif
 #if __AMDGCN_WAVEFRONT_SIZE == 32
 using lane_mask = unsigned int;
 #else
@@ -94,7 +98,7 @@ typedef enum {
  *  @ingroup CooperativeG
  *  @{
  *  This section describes the cooperative groups functions of HIP runtime API.
- *  
+ *
  *  The cooperative groups provides flexible thread parallel programming algorithms, threads
  *  cooperate and share data to perform collective computations.
  *
@@ -118,7 +122,7 @@ namespace helper {
 __CG_STATIC_QUALIFIER__ unsigned long long adjust_mask(
     unsigned long long base_mask, unsigned long long input_mask) {
   unsigned long long out = 0;
-  for (unsigned int i = 0, index = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) {
+  for (unsigned int i = 0, index = 0; i < warpSize; i++) {
     auto lane_active = base_mask & (1ull << i);
     if (lane_active) {
       auto result = input_mask & (1ull << i);
@@ -245,14 +249,14 @@ __CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "
 // have i-th bit of x set and come before the current thread.
 __CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
   unsigned int counter=0;
-    #if __AMDGCN_WAVEFRONT_SIZE == 32
+  if (warpSize == 32) {
       counter = __builtin_amdgcn_mbcnt_lo(x, add);
-    #else
-      counter = __builtin_amdgcn_mbcnt_lo(static_cast<lane_mask>(x), add);
-      counter = __builtin_amdgcn_mbcnt_hi(static_cast<lane_mask>(x >> 32), counter);
-    #endif
+  } else {
+    counter = __builtin_amdgcn_mbcnt_lo(static_cast<lane_mask>(x), add);
+    counter = __builtin_amdgcn_mbcnt_hi(static_cast<lane_mask>(x >> 32), counter);
+  }
 
-    return counter;
+  return counter;
 }
 
 }  // namespace coalesced_group

From 6840826c3fec8516857dc4f2092d84358550f588 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu@amd.com>
Date: Fri, 6 Dec 2024 23:36:32 +0000
Subject: [PATCH 4/4] Add deprecation warning for constexpr uses of `warpSize`.

---
 hipamd/include/hip/amd_detail/amd_warp_functions.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/hipamd/include/hip/amd_detail/amd_warp_functions.h b/hipamd/include/hip/amd_detail/amd_warp_functions.h
index d8d22ea0be..143ea234fb 100644
--- a/hipamd/include/hip/amd_detail/amd_warp_functions.h
+++ b/hipamd/include/hip/amd_detail/amd_warp_functions.h
@@ -92,6 +92,10 @@ __device__ static inline int __hip_move_dpp_N(int src) {
         }
     } warpSize{};
 #else
+    #if defined(__HIP_DEVICE_COMPILE__)
+        [[deprecated("Using warpSize as a compile time constant is deprecated, "
+                     "and will no longer be possible in future releases.")]]
+    #endif
     __device__
     static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
 #endif