Commit 376b9b9

[AMD][Gluon] Expose buffer_load and buffer_store (#7738)

Expose AMD buffer_load and buffer_store to Gluon. Example usage looks like:

```
def buffer_ldst_kernel(x, y):
    layout: ttgl.constexpr = ttgl.BlockedLayout(size_per_thread=[1, 1], threads_per_warp=[1, 64],
                                                warps_per_cta=[4, 1], order=[1, 0])
    offsets = ttgl.arange(0, 64 * 64, layout=layout)
    a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets)
    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets)
```
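
For orientation, a hedged host-side launch sketch (not part of the commit itself): it assumes the kernel above is decorated with `@gluon.jit`, as the tests in this commit do, and that PyTorch with ROCm is installed.

```
import torch

# Hypothetical driver code; the tensor size matches the 64 * 64 offsets used above.
x = torch.randn(64 * 64, device="cuda", dtype=torch.float32)  # ROCm exposes HIP devices through the "cuda" backend
y = torch.empty_like(x)
buffer_ldst_kernel[(1, )](x, y)  # a single program instance covers the whole tile
```
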
1 parent 2bb88ff commit 376b9b9

File tree

python/src/gluon_ir.cc
python/test/gluon/test_frontend.py
python/triton/experimental/gluon/language/amd/cdna3/__init__.py
python/triton/experimental/gluon/language/amd/cdna4/__init__.py

4 files changed: +210 −10 lines changed

python/src/gluon_ir.cc

Lines changed: 21 additions & 5 deletions
```diff
@@ -20,6 +20,7 @@ namespace tt = triton;
 namespace ttg = triton::gpu;
 namespace ttng = triton::nvidia_gpu;
 namespace gluon = mlir::triton::gluon;
+namespace ttag = mlir::triton::amdgpu;
 
 // Helper to check if an MLIR type or attribute has a verifier method.
 template <typename AttrOrType>
@@ -246,7 +247,8 @@ void init_gluon_ir(py::module &&m) {
             auto ctx = self.getContext();
             return self.getChecked<ttg::MemDescType>(
                 shape, elementType, layout,
-                ttg::SharedMemorySpaceAttr::get(ctx), /*mutableMemory=*/true,
+                ttg::SharedMemorySpaceAttr::get(ctx),
+                /*mutableMemory=*/true,
                 /*allocShape=*/allocShape);
           })
       .def("get_tensor_mem_desc_ty",
@@ -256,7 +258,8 @@ void init_gluon_ir(py::module &&m) {
             auto ctx = self.getContext();
             return self.getChecked<ttg::MemDescType>(
                 shape, elementType, layout,
-                ttng::TensorMemorySpaceAttr::get(ctx), /*mutableMemory=*/true,
+                ttng::TensorMemorySpaceAttr::get(ctx),
+                /*mutableMemory=*/true,
                 /*allocShape=*/allocShape);
           })
       .def("get_blocked_layout",
@@ -404,8 +407,8 @@ void init_gluon_ir(py::module &&m) {
              tt::CacheModifier cacheModifier,
              tt::EvictionPolicy evictionPolicy, bool isVolatile) {
             self.create<ttg::AsyncCopyGlobalToLocalOp>(
-                pointer, smem, mask, /*other*/ Value{}, cacheModifier,
-                evictionPolicy, isVolatile);
+                pointer, smem, mask,
+                /*other*/ Value{}, cacheModifier, evictionPolicy, isVolatile);
           })
       .def("create_async_copy_mbarrier_arrive",
           [](GluonOpBuilder &self, Value mbarrier, bool incrementCount) {
@@ -622,11 +625,24 @@ void init_gluon_ir(py::module &&m) {
             return self.create<ttg::WarpSpecializeOp>(
                 resultTypes, explicitCaptures, partitionNumWarps);
           })
+      .def("create_buffer_load",
+           [](GluonOpBuilder &self, Type resultType, Value ptr, Value offsets,
+              Value mask, Value other, tt::CacheModifier cache) -> Value {
+             return self.create<ttag::BufferLoadOp>(resultType, ptr, offsets,
+                                                    Value() /*stride*/, cache,
+                                                    mask, other);
+           })
+      .def("create_buffer_store",
+           [](GluonOpBuilder &self, Value storedValue, Value ptr, Value offsets,
+              Value mask, tt::CacheModifier cache) {
+             self.create<ttag::BufferStoreOp>(storedValue, ptr, offsets,
+                                              Value() /*stride*/, cache, mask);
+           })
       .def("create_buffer_load_to_local",
            [](GluonOpBuilder &self, Value dest, Value ptr, Value offsets,
               Value mask, Value other, Value stride,
               tt::CacheModifier cacheModifier) {
-            self.create<triton::amdgpu::BufferLoadToLocalOp>(
+            self.create<ttag::BufferLoadToLocalOp>(
                 dest, ptr, offsets, mask, other, stride, cacheModifier);
           });
```

python/test/gluon/test_frontend.py

Lines changed: 106 additions & 0 deletions
```diff
@@ -1605,3 +1605,109 @@ def kernel(ptr):
   }
 }
 """)
+
+
+@gluon.jit
+def buffer_load_store_kernel(x, y):
+    layout: ttgl.constexpr = ttgl.BlockedLayout(size_per_thread=[1, 1], threads_per_warp=[1, 64], warps_per_cta=[4, 1],
+                                                order=[1, 0])
+
+    offsets = ttgl.arange(0, 64 * 64).reshape(64, 64)
+    offsets = ttgl.convert_layout(offsets, layout=layout)
+    mask = ttgl.full((64, 64), 1, tl.int1, layout=layout)
+    other = ttgl.full((64, 64), 1.0, tl.float32, layout=layout)
+    a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
+    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
+
+    a = ttgl.amd.cdna4.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
+    ttgl.amd.cdna4.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
+
+
+@pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])
+def test_buffer_load_store(target):
+    x = MockTensor(ttgl.float32)
+    y = MockTensor(ttgl.float32)
+    module = run_parser(buffer_load_store_kernel, *make_args(x, y), target=target)
+
+    expecttest.assert_expected_inline(
+        anonymize_ir(module.str_nodebug()), """\
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @buffer_load_store_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    %0 = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32, #gluon.auto_encoding>
+    %1 = tt.reshape %0 : tensor<4096xi32, #gluon.auto_encoding> -> tensor<64x64xi32, #gluon.auto_encoding>
+    %2 = ttg.convert_layout %1 : tensor<64x64xi32, #gluon.auto_encoding> -> tensor<64x64xi32, #blocked>
+    %true = arith.constant true
+    %cst = arith.constant dense<true> : tensor<64x64xi1, #blocked>
+    %cst_0 = arith.constant 1.000000e+00 : f32
+    %cst_1 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
+    %3 = amdgpu.buffer_load %arg0[%2], %cst, %cst_1 cacheModifier = ca : tensor<64x64xf32, #blocked>
+    amdgpu.buffer_store %3, %arg1[%2], %cst cacheModifier = ca : tensor<64x64xf32, #blocked>
+    %4 = amdgpu.buffer_load %arg0[%2], %cst, %cst_1 cacheModifier = ca : tensor<64x64xf32, #blocked>
+    amdgpu.buffer_store %4, %arg1[%2], %cst cacheModifier = ca : tensor<64x64xf32, #blocked>
+    tt.return
+  }
+}
+""")
+
+
+@gluon.jit
+def buffer_load_store_with_broadcast_kernel(x, y):
+    layout: ttgl.constexpr = ttgl.BlockedLayout(size_per_thread=[1, 1], threads_per_warp=[1, 64], warps_per_cta=[4, 1],
+                                                order=[1, 0])
+
+    offsets = ttgl.arange(0, 64 * 64).reshape(64, 64)
+    offsets = ttgl.convert_layout(offsets, layout=layout)
+    other = ttgl.full((64, 64), 1.0, tl.float32, layout=layout)
+
+    mask = ttgl.full((64, 1), 1, tl.int1, layout=layout)
+    a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
+    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
+
+    mask = ttgl.full((1, 64), 1, tl.int1, layout=layout)
+    a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
+    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
+
+    other = 1.0
+    a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
+    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
+
+
+@pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])
+def test_buffer_load_store_with_broadcast(target):
+    x = MockTensor(ttgl.float32)
+    y = MockTensor(ttgl.float32)
+    module = run_parser(buffer_load_store_with_broadcast_kernel, *make_args(x, y), target=target)
+
+    expecttest.assert_expected_inline(
+        anonymize_ir(module.str_nodebug()), """\
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @buffer_load_store_with_broadcast_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    %0 = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32, #gluon.auto_encoding>
+    %1 = tt.reshape %0 : tensor<4096xi32, #gluon.auto_encoding> -> tensor<64x64xi32, #gluon.auto_encoding>
+    %2 = ttg.convert_layout %1 : tensor<64x64xi32, #gluon.auto_encoding> -> tensor<64x64xi32, #blocked>
+    %cst = arith.constant 1.000000e+00 : f32
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
+    %true = arith.constant true
+    %cst_1 = arith.constant dense<true> : tensor<64x1xi1, #blocked>
+    %3 = tt.broadcast %cst_1 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked>
+    %4 = amdgpu.buffer_load %arg0[%2], %3, %cst_0 cacheModifier = ca : tensor<64x64xf32, #blocked>
+    %5 = tt.broadcast %cst_1 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked>
+    amdgpu.buffer_store %4, %arg1[%2], %5 cacheModifier = ca : tensor<64x64xf32, #blocked>
+    %true_2 = arith.constant true
+    %cst_3 = arith.constant dense<true> : tensor<1x64xi1, #blocked>
+    %6 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
+    %7 = amdgpu.buffer_load %arg0[%2], %6, %cst_0 cacheModifier = ca : tensor<64x64xf32, #blocked>
+    %8 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
+    amdgpu.buffer_store %7, %arg1[%2], %8 cacheModifier = ca : tensor<64x64xf32, #blocked>
+    %cst_4 = arith.constant 1.000000e+00 : f32
+    %9 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
+    %cst_5 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
+    %10 = amdgpu.buffer_load %arg0[%2], %9, %cst_5 cacheModifier = ca : tensor<64x64xf32, #blocked>
+    %11 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
+    amdgpu.buffer_store %10, %arg1[%2], %11 cacheModifier = ca : tensor<64x64xf32, #blocked>
+    tt.return
+  }
+}
+""")
```

python/triton/experimental/gluon/language/amd/cdna3/__init__.py

Lines changed: 81 additions & 3 deletions
```diff
@@ -1,8 +1,29 @@
-from ..._core import builtin, int32, uint32
-from ..._semantic import _check
+from __future__ import annotations
+from typing import TYPE_CHECKING
+
+from triton.experimental.gluon.language import _core as ttgl
 from triton._C.libtriton import ir
+from ..._core import builtin, int32, uint32, _unwrap_if_constexpr
+from ..._semantic import _check
+
+if TYPE_CHECKING:
+    from ..._semantic import GluonSemantic
+
+__all__ = ["buffer_load_to_shared", "buffer_load", "buffer_store"]
 
-__all__ = ["buffer_load_to_shared"]
+
+def _verify_buffer_load_store(ptr, offsets, mask, other=None):
+    assert ptr.type.is_ptr(), "ptr must be a scalar pointer type"
+
+    assert isinstance(offsets.type, ttgl.distributed_type), "expected offsets type to be a distributed_type"
+    assert offsets.dtype.is_int32() or offsets.dtype.is_uint32(), "offsets element type must be int32 or uint32"
+
+    element_type = ptr.type.scalar.element_ty
+
+    if other is not None:
+        assert mask is not None, "when other is not None, mask should not be None"
+        assert other.shape == offsets.shape, "other shape must match the offsets shape"
+        assert other.dtype == element_type, "other must have the same data type as ptr scalar type"
 
 
 @builtin
@@ -32,3 +53,60 @@ def buffer_load_to_shared(dest, ptr, offsets, mask=None, other=None, cache_modif
     cache_modifier = _semantic._str_to_load_cache_modifier(cache_modifier)
 
     builder.create_buffer_load_to_local(dest.handle, ptr.handle, offsets.handle, mask, other, stride, cache_modifier)
+
+
+@builtin
+def buffer_load(ptr, offsets, mask=None, other=None, cache=None, _semantic=None):
+    """
+    AMD buffer load from global memory via a scalar base pointer and a tensor of
+    offsets instead of a tensor of pointers. This operation loads data
+    directly into registers.
+
+    Args:
+        ptr (pointer to scalar): Global memory scalar base pointer to load from.
+        offsets (tensor): Offsets tensor for the load operation.
+        mask (tensor, optional): Mask tensor for predicated loads. Defaults to None.
+        other (tensor, optional): Tensor providing default values for masked elements. Defaults to None.
+        cache (str, optional): Cache modifier specifier. Defaults to None.
+    """
+    mask = _unwrap_if_constexpr(mask)
+    if mask is not None:
+        offsets, mask = _semantic.broadcast_impl_value(offsets, mask)
+
+    other = _unwrap_if_constexpr(other)
+    if other is not None:
+        offsets, other = _semantic.broadcast_impl_value(offsets, other)
+
+    _verify_buffer_load_store(ptr, offsets, mask, other)
+
+    other = other.handle if other is not None else ir.value()
+    mask = mask.handle if mask is not None else ir.value()
+    cache_modifier = _semantic._str_to_load_cache_modifier(cache) if cache is not None else ir.CACHE_MODIFIER.NONE
+
+    ret_ty = offsets.type.with_element_ty(ptr.type.scalar.element_ty)
+    builder = _semantic.builder
+    handle = builder.create_buffer_load(ret_ty.to_ir(builder), ptr.handle, offsets.handle, mask, other, cache_modifier)
+    return ttgl.tensor(handle, ret_ty)
+
+
+@builtin
+def buffer_store(stored_value, ptr, offsets, mask, cache=None, _semantic: GluonSemantic = None):
+    """
+    AMD buffer store of a tensor directly to global memory via a scalar base pointer
+    and a tensor of offsets instead of a tensor of pointers.
+    Args:
+        stored_value (tensor): The tensor to be stored to global memory.
+        ptr (pointer to scalar): Global memory scalar base pointer to store to.
+        offsets (tensor): Offsets tensor for the store operation.
+        mask (tensor): Mask tensor for predicated stores.
+        cache (str, optional): Cache modifier specifier. Defaults to None.
+    """
+    if mask is not None:
+        offsets, mask = _semantic.broadcast_impl_value(offsets, mask)
+
+    _verify_buffer_load_store(ptr, offsets, mask)
+
+    mask = mask.handle if mask is not None else ir.value()
+    cache_modifier = _semantic._str_to_load_cache_modifier(cache) if cache is not None else ir.CACHE_MODIFIER.NONE
+
+    _semantic.builder.create_buffer_store(stored_value.handle, ptr.handle, offsets.handle, mask, cache_modifier)
```
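
To make the masking and broadcasting behavior of the new builtins concrete, here is a minimal kernel sketch modeled on the tests above; the kernel name, tile size, (64, 1) mask shape, scalar `other`, and `.ca` cache modifier are illustrative choices, and the imports mirror those used by test_frontend.py.

```
import triton.language as tl
from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl


@gluon.jit
def masked_copy_kernel(x, y):
    layout: ttgl.constexpr = ttgl.BlockedLayout(size_per_thread=[1, 1], threads_per_warp=[1, 64],
                                                warps_per_cta=[4, 1], order=[1, 0])
    offsets = ttgl.arange(0, 64 * 64).reshape(64, 64)
    offsets = ttgl.convert_layout(offsets, layout=layout)

    # buffer_load/buffer_store broadcast a (64, 1) mask against the (64, 64) offsets,
    # and a scalar `other` is splatted to the offsets shape before verification.
    mask = ttgl.full((64, 1), 1, tl.int1, layout=layout)
    a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=0.0, cache='.ca')
    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
```

As the expected IR in the tests shows, this lowers to `amdgpu.buffer_load` / `amdgpu.buffer_store` ops, with the mask expanded through `tt.broadcast`.
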
python/triton/experimental/gluon/language/amd/cdna4/__init__.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,3 +1,3 @@
-from ..cdna3 import buffer_load_to_shared
+from ..cdna3 import buffer_load_to_shared, buffer_load, buffer_store
 
-__all__ = ["buffer_load_to_shared"]
+__all__ = ["buffer_load_to_shared", "buffer_load", "buffer_store"]
```
