From b97616f703818f08f764dacfb092e7aed2e8afce Mon Sep 17 00:00:00 2001 From: Hashem Hashemi Date: Mon, 28 Jul 2025 20:47:12 -0500 Subject: [PATCH 1/8] dpp optimization for match_any() --- .../hip/amd_detail/amd_warp_sync_functions.h | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h index fb0b3d4ce..38cbb621c 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -224,18 +224,21 @@ unsigned long long __match_any(T value) { (sizeof(T) == 4 || sizeof(T) == 8), "T can be int, unsigned int, long, unsigned long, long long, unsigned " "long long, float or double."); - bool done = false; - unsigned long long retval = 0; - - while (__any(!done)) { - if (!done) { - T chosen = __hip_readfirstlane(value); - if (chosen == value) { - retval = __activemask(); - done = true; - } + + unsigned long long retval = 1; + union dill { unsigned int i[2]; T val; } dill_ = { .val = value }; + dill my_dill_ = dill_; + for (int i = 1; i < static_cast(warpSize); i++) { + dill_.i[0] = __builtin_amdgcn_mov_dpp(dill_.i[0], 0x134, 0xf, 0xf, 0); //wave_rol1 + if (dill_.i[0] != my_dill_.i[0]) continue; + if constexpr(sizeof(T) == 8) { + dill_.i[1] = __builtin_amdgcn_mov_dpp(dill_.i[1], 0x134, 0xf, 0xf, 0); + if (dill_.i[1] != my_dill_.i[1]) continue; } + retval |= (1 << i); } + int rotv = __lane_id(); + retval = (retval << rotv) | (retval >> (64 - rotv)); return retval; } From 48ba22886c66b0f5c16684909d42a411c0d0e4d6 Mon Sep 17 00:00:00 2001 From: Hashem Hashemi Date: Mon, 28 Jul 2025 21:04:12 -0500 Subject: [PATCH 2/8] handle wave32 --- hipamd/include/hip/amd_detail/amd_warp_sync_functions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h 
index 38cbb621c..10d50928d 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -238,7 +238,7 @@ unsigned long long __match_any(T value) { retval |= (1 << i); } int rotv = __lane_id(); - retval = (retval << rotv) | (retval >> (64 - rotv)); + retval = (retval << rotv) | (retval >> (static_cast(warpSize) - rotv)); return retval; } From a945ed2f2e21e312ab8d7ee89a0e4f0a5572ed50 Mon Sep 17 00:00:00 2001 From: Hashem Hashemi Date: Tue, 29 Jul 2025 21:35:56 -0500 Subject: [PATCH 3/8] fix for partially active wave scenario --- .../hip/amd_detail/amd_warp_sync_functions.h | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h index 10d50928d..367d3096c 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -225,20 +225,32 @@ unsigned long long __match_any(T value) { "T can be int, unsigned int, long, unsigned long, long long, unsigned " "long long, float or double."); - unsigned long long retval = 1; - union dill { unsigned int i[2]; T val; } dill_ = { .val = value }; - dill my_dill_ = dill_; - for (int i = 1; i < static_cast(warpSize); i++) { - dill_.i[0] = __builtin_amdgcn_mov_dpp(dill_.i[0], 0x134, 0xf, 0xf, 0); //wave_rol1 - if (dill_.i[0] != my_dill_.i[0]) continue; - if constexpr(sizeof(T) == 8) { - dill_.i[1] = __builtin_amdgcn_mov_dpp(dill_.i[1], 0x134, 0xf, 0xf, 0); - if (dill_.i[1] != my_dill_.i[1]) continue; + auto actvmask = __activemask(); + unsigned long long retval = 0; + if (actvmask != ~((decltype(actvmask))0)) { + bool done = false; + while (__any(!done)) { + if (!done) { + T chosen = __hip_readfirstlane(value); + if (chosen == value) { + retval = __activemask(); + done = true; + } + } + } + } else { + union dill { unsigned int i[2]; unsigned long long 
ill; decltype(value) val; } dill_ = { .val = value }; + retval = 1; + for (int i = 1; i < static_cast(warpSize); i++) { + if constexpr (sizeof(value) == 8) + dill_.ill = __builtin_amdgcn_mov_dpp(dill_.ill, 0x134, 0xf, 0xf, 0); //wave_rol1 + else + dill_.i[0] = __builtin_amdgcn_mov_dpp(dill_.i[0], 0x134, 0xf, 0xf, 0); //wave_rol1 + retval |= ((decltype(mask))(dill_.val == value)) << i; } - retval |= (1 << i); + int rotv = __lane_id(); + retval = (retval << rotv) | (retval >> (static_cast(warpSize) - rotv)); } - int rotv = __lane_id(); - retval = (retval << rotv) | (retval >> (static_cast(warpSize) - rotv)); return retval; } From caff838872051bd18530dc810f27fcbd41811417 Mon Sep 17 00:00:00 2001 From: Hashem Hashemi Date: Thu, 7 Aug 2025 23:59:19 -0500 Subject: [PATCH 4/8] Adjustments, comments. --- .../include/hip/amd_detail/amd_warp_sync_functions.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h index 367d3096c..13bad2783 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -225,9 +225,9 @@ unsigned long long __match_any(T value) { "T can be int, unsigned int, long, unsigned long, long long, unsigned " "long long, float or double."); - auto actvmask = __activemask(); + unsigned long long actvmask = __activemask(); unsigned long long retval = 0; - if (actvmask != ~((decltype(actvmask))0)) { + if (actvmask != ~0ull) { bool done = false; while (__any(!done)) { if (!done) { @@ -243,11 +243,13 @@ unsigned long long __match_any(T value) { retval = 1; for (int i = 1; i < static_cast(warpSize); i++) { if constexpr (sizeof(value) == 8) - dill_.ill = __builtin_amdgcn_mov_dpp(dill_.ill, 0x134, 0xf, 0xf, 0); //wave_rol1 + dill_.ill = __builtin_amdgcn_mov_dpp(dill_.ill, 0x134 /*dpp_ctrl=wave_rol1*/, 0xf/*row_mask*/, 0xf/*clmn_mask*/, 1/*bound_ctrl*/); else - 
dill_.i[0] = __builtin_amdgcn_mov_dpp(dill_.i[0], 0x134, 0xf, 0xf, 0); //wave_rol1 + dill_.i[0] = __builtin_amdgcn_mov_dpp(dill_.i[0], 0x134 /*dpp_ctrl=wave_rol1*/, 0xf/*full*/, 0xf/*full*/, 1/*bound_ctrl*/); retval |= ((decltype(mask))(dill_.val == value)) << i; } + //At this point each lane has a rotated match_any mask, where it is in the LSB. + //So we just need to rotate the mask by the lane's actual position to get the correct mask. int rotv = __lane_id(); retval = (retval << rotv) | (retval >> (static_cast(warpSize) - rotv)); } From 14bfaa2df11291f8226dee225b932f2ddb00e4c7 Mon Sep 17 00:00:00 2001 From: Hashem Hashemi Date: Tue, 12 Aug 2025 21:53:55 +0000 Subject: [PATCH 5/8] More comments --- hipamd/include/hip/amd_detail/amd_warp_sync_functions.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h index 13bad2783..df93401a4 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -241,6 +241,9 @@ unsigned long long __match_any(T value) { } else { union dill { unsigned int i[2]; unsigned long long ill; decltype(value) val; } dill_ = { .val = value }; retval = 1; + //Do a full rotate of the wave lanes, using dpp with "wave_fol1" control (ID: 0x134). + //wave_rol1 dpp rotates the value from each lane to one lane right across the whole wave. + //In doing so each lane gets a mask of matches with all other lanes in the wave. 
for (int i = 1; i < static_cast(warpSize); i++) { if constexpr (sizeof(value) == 8) dill_.ill = __builtin_amdgcn_mov_dpp(dill_.ill, 0x134 /*dpp_ctrl=wave_rol1*/, 0xf/*row_mask*/, 0xf/*clmn_mask*/, 1/*bound_ctrl*/); From cedeb5b68232f14f1693fb13be987fe002225c67 Mon Sep 17 00:00:00 2001 From: Hashem Hashemi Date: Tue, 12 Aug 2025 22:00:51 +0000 Subject: [PATCH 6/8] correction --- hipamd/include/hip/amd_detail/amd_warp_sync_functions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h index df93401a4..225e6328b 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -241,7 +241,7 @@ unsigned long long __match_any(T value) { } else { union dill { unsigned int i[2]; unsigned long long ill; decltype(value) val; } dill_ = { .val = value }; retval = 1; - //Do a full rotate of the wave lanes, using dpp with "wave_fol1" control (ID: 0x134). + //Do a full rotate of the wave lanes, using dpp with "wave_rol1" control (ID: 0x134). //wave_rol1 dpp rotates the value from each lane to one lane right across the whole wave. //In doing so each lane gets a mask of matches with all other lanes in the wave. 
for (int i = 1; i < static_cast(warpSize); i++) { From 7d6f6030c7bab8abd87a44fb528269cfab735395 Mon Sep 17 00:00:00 2001 From: Hashem Hashemi Date: Tue, 12 Aug 2025 22:25:48 +0000 Subject: [PATCH 7/8] correction2 --- hipamd/include/hip/amd_detail/amd_warp_sync_functions.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h index 225e6328b..9b92c2e1f 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -242,8 +242,8 @@ unsigned long long __match_any(T value) { union dill { unsigned int i[2]; unsigned long long ill; decltype(value) val; } dill_ = { .val = value }; retval = 1; //Do a full rotate of the wave lanes, using dpp with "wave_rol1" control (ID: 0x134). - //wave_rol1 dpp rotates the value from each lane to one lane right across the whole wave. - //In doing so each lane gets a mask of matches with all other lanes in the wave. + //wave_rol1 dpp rotates the value from each lane to one lane left of it, across the whole wave. + //In doing so each lane gets a mask of matches with all other lanes in the wave in retval. for (int i = 1; i < static_cast(warpSize); i++) { if constexpr (sizeof(value) == 8) dill_.ill = __builtin_amdgcn_mov_dpp(dill_.ill, 0x134 /*dpp_ctrl=wave_rol1*/, 0xf/*row_mask*/, 0xf/*clmn_mask*/, 1/*bound_ctrl*/); From 341e662b9d6dca21fb48bf2517f8c72c4c9945f7 Mon Sep 17 00:00:00 2001 From: Hashem Hashemi Date: Wed, 13 Aug 2025 18:04:47 +0000 Subject: [PATCH 8/8] Remove decltype for "unsigned long long". 
--- hipamd/include/hip/amd_detail/amd_warp_sync_functions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h index 9b92c2e1f..2f5304c8c 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -249,7 +249,7 @@ unsigned long long __match_any(T value) { dill_.ill = __builtin_amdgcn_mov_dpp(dill_.ill, 0x134 /*dpp_ctrl=wave_rol1*/, 0xf/*row_mask*/, 0xf/*clmn_mask*/, 1/*bound_ctrl*/); else dill_.i[0] = __builtin_amdgcn_mov_dpp(dill_.i[0], 0x134 /*dpp_ctrl=wave_rol1*/, 0xf/*full*/, 0xf/*full*/, 1/*bound_ctrl*/); - retval |= ((decltype(mask))(dill_.val == value)) << i; + retval |= ((unsigned long long)(dill_.val == value)) << i; } //At this point each lane has a rotated match_any mask, where it is in the LSB. //So we just need to rotate the mask by the lane's actual position to get the correct mask.