From ee63b45e524bd61982861ea564b69a35442b58fb Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Tue, 12 Aug 2025 10:14:49 -0700 Subject: [PATCH] Fold unaligned vec4 load and store into function Summary: - Fold unaligned vec4 load and store into function Reviewed By: emlin Differential Revision: D79972929 --- fbgemm_gpu/codegen/genscript/optimizers.py | 29 ++++-------------- fbgemm_gpu/include/fbgemm_gpu/utils/vec4.cuh | 32 ++++++++++++++++++++ 2 files changed, 38 insertions(+), 23 deletions(-) diff --git a/fbgemm_gpu/codegen/genscript/optimizers.py b/fbgemm_gpu/codegen/genscript/optimizers.py index a71204e6f3..2ffaaf94d8 100644 --- a/fbgemm_gpu/codegen/genscript/optimizers.py +++ b/fbgemm_gpu/codegen/genscript/optimizers.py @@ -1179,32 +1179,15 @@ def partial_rowwise_adam() -> Dict[str, Any]: Vec4T m_t; if (enable_optimizer_offloading) { - // When offloading is enabled, we need to ensure proper alignment - // Create a temporary aligned array on the stack - alignas(16) momentum1_ph_t local_momentum1[4]; - - // Load values from momentum1_start into the aligned array - #pragma unroll - for (int i = 0; i < 4; i++) { - local_momentum1[i] = momentum1_start[d + i]; - } - - // Use the aligned array for computation - m_t = Vec4T(local_momentum1); + // When offloading is enabled, we need to ensure proper alignment, so + // first copy to a temporary aligned array before loading to Vec4T + m_t = vec4_load_unaligned(momentum1_start + d); m_t.mul_(beta1); m_t.fma_(grad, 1.0 - beta1); - - // Store results back to the aligned array - m_t.store(local_momentum1); - - // Copy results back to momentum1_start - #pragma unroll - for (int i = 0; i < 4; i++) { - momentum1_start[d + i] = local_momentum1[i]; - } + vec4_store_unaligned(m_t, momentum1_start + d); + } else { - // When not offloading, we can directly use momentum1_start - // This avoids the extra copy operations and temporary array + // When offloading is not enabled, we can directly use momentum1_start m_t = 
Vec4T(&momentum1_start[d]); m_t.mul_(beta1); m_t.fma_(grad, 1.0 - beta1); diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/vec4.cuh b/fbgemm_gpu/include/fbgemm_gpu/utils/vec4.cuh index 10e722a4fb..8a77ba5967 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/utils/vec4.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/utils/vec4.cuh @@ -611,4 +611,36 @@ DEVICE_INLINE Vec4T vec4_acc( return s; } +template <typename T> +DEVICE_INLINE Vec4T<T> vec4_load_unaligned(const T* src) { + // src is not guaranteed to have proper alignment. + // Create a temporary aligned array on the stack. + alignas(16) T temp[4]; + + // Load values from src into the 16-byte-aligned array +#pragma unroll + for (auto i = 0; i < 4; i++) { + temp[i] = src[i]; + } + + // Then load the aligned array values into Vec4T + return Vec4T<T>(temp); +} + +template <typename T> +DEVICE_INLINE void vec4_store_unaligned(const Vec4T<T>& vec, T* dst) { + // dst is not guaranteed to have proper alignment. + // Create a temporary aligned array on the stack. + alignas(16) T temp[4]; + + // Store Vec4T values into the 16-byte-aligned array + vec.store(temp); + + // Then store the aligned array values into dst +#pragma unroll + for (auto i = 0; i < 4; i++) { + dst[i] = temp[i]; + } +} + } // namespace fbgemm_gpu