
Commit dd58234

[Gluon] Add warpgroup_mma_accumulator object (#7760)

Parent: ba9bd40

4 files changed: +66 -9 lines

python/test/gluon/test_consan.py

Lines changed: 1 addition & 1 deletion

@@ -562,7 +562,7 @@ def multibuffered_loop_wgmma_kernel(input_desc, XBLOCK: ttgl.constexpr, FAILURE:
 
     mma_layout: ttgl.constexpr = ttgl.NVMMADistributedLayout(version=[3, 0], warps_per_cta=[4, 1],
                                                              instr_shape=[16, 32, 16])
-    acc = ttgl.zeros([XBLOCK, XBLOCK], ttgl.float32, mma_layout)
+    acc = hopper.warpgroup_mma_init(ttgl.zeros([XBLOCK, XBLOCK], ttgl.float32, mma_layout))
 
     smemA = ttgl.allocate_shared_memory(ttgl.float16, [num_buffers, XBLOCK, XBLOCK], input_desc.layout)
     smemB = ttgl.allocate_shared_memory(ttgl.float16, [num_buffers, XBLOCK, XBLOCK], input_desc.layout)
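
For orientation, a minimal sketch (not part of the commit) of how the wrapped accumulator flows through an asynchronous WGMMA. It assumes `XBLOCK`, `mma_layout`, and shared-memory tiles `smemA_tile`/`smemB_tile` are already defined inside a `@gluon.jit` kernel, as in the test above:

    # Wrap the initial value so it can be carried as an accumulator token.
    acc = hopper.warpgroup_mma_init(ttgl.zeros([XBLOCK, XBLOCK], ttgl.float32, mma_layout))
    # is_async=True now returns a warpgroup_mma_accumulator rather than a tensor.
    acc = hopper.warpgroup_mma(smemA_tile, smemB_tile, acc, is_async=True)
    # Waiting unwraps the token back into an ordinary ttgl.tensor.
    acc = hopper.warpgroup_mma_wait(num_outstanding=0, deps=[acc])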

python/test/gluon/test_frontend.py

Lines changed: 10 additions & 2 deletions

@@ -584,7 +584,11 @@ def warpgroup_mma_kernel(nvmma_layout: ttgl.constexpr, acc_layout: ttgl.constexp
     a = ttgl.allocate_shared_memory(ttgl.float16, [128, 128], nvmma_layout)
     b = ttgl.allocate_shared_memory(ttgl.float16, [128, 128], nvmma_layout)
     acc = ttgl.full([128, 128], 0, dtype=ttgl.float16, layout=acc_layout)
-    hopper.warpgroup_mma(a, b, acc)
+    acc = hopper.warpgroup_mma(a, b, acc)
+    ttgl.static_assert(isinstance(acc, ttgl.tensor))
+
+    acc = hopper.warpgroup_mma(a, b, acc, is_async=True)
+    ttgl.static_assert(isinstance(acc, hopper.warpgroup_mma_accumulator))
 
 
 def test_warpgroup_mma():
@@ -608,6 +612,8 @@ def test_warpgroup_mma():
     %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #mma>
     %true = arith.constant true
     %2 = ttng.warp_group_dot %0, %1, %cst_0, %true {inputPrecision = 0 : i32} : !ttg.memdesc<128x128xf16, #shared, #smem, mutable> * !ttg.memdesc<128x128xf16, #shared, #smem, mutable> -> tensor<128x128xf16, #mma>
+    %true_1 = arith.constant true
+    %3 = ttng.warp_group_dot %0, %1, %2, %true_1 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xf16, #shared, #smem, mutable> * !ttg.memdesc<128x128xf16, #shared, #smem, mutable> -> tensor<128x128xf16, #mma>
     tt.return
   }
 }
@@ -617,8 +623,9 @@ def test_warpgroup_mma():
 @gluon.jit
 def warpgroup_mma_wait_kernel():
     layout: ttgl.constexpr = ttgl.NVMMADistributedLayout(version=[3, 0], warps_per_cta=[4, 1], instr_shape=[16, 32, 16])
-    acc = ttgl.full([128, 128], 0, dtype=ttgl.float16, layout=layout)
+    acc = hopper.warpgroup_mma_init(ttgl.full([128, 128], 0, dtype=ttgl.float16, layout=layout))
     acc = hopper.warpgroup_mma_wait(num_outstanding=1, deps=[acc])
+    _ = acc + acc
 
 
 def test_warpgroup_mma_wait():
@@ -631,6 +638,7 @@ def test_warpgroup_mma_wait():
     %cst = arith.constant 0.000000e+00 : f16
     %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #mma>
     %0 = ttng.warp_group_dot_wait %cst_0 {pendings = 1 : i32} : tensor<128x128xf16, #mma>
+    %1 = arith.addf %0, %0 : tensor<128x128xf16, #mma>
     tt.return
   }
 }
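
As the updated tests show, `warpgroup_mma_wait` returns one value per dependency: a single dependency yields the value directly, several yield a tuple. A hedged sketch of waiting on two in-flight accumulators at once, inside a kernel; the tile and init names (`a0`, `b0`, `a1`, `b1`, `init0`, `init1`) are illustrative, not from the commit:

    acc0 = hopper.warpgroup_mma(a0, b0, hopper.warpgroup_mma_init(init0), is_async=True)
    acc1 = hopper.warpgroup_mma(a1, b1, hopper.warpgroup_mma_init(init1), is_async=True)
    # With more than one dep, the wait yields a tuple of tensors in dep order.
    acc0, acc1 = hopper.warpgroup_mma_wait(num_outstanding=0, deps=[acc0, acc1])
    _ = acc0 + acc1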

python/triton/experimental/gluon/language/nvidia/hopper/__init__.py

Lines changed: 54 additions & 5 deletions

@@ -1,8 +1,13 @@
+from __future__ import annotations
 from triton.compiler.code_generator import unflatten_ir_values
 from ..ampere import async_copy
 from . import mbarrier, tma
 from ... import _core
 
+from typing import List, Tuple, TYPE_CHECKING
+if TYPE_CHECKING:
+    from triton._C.libtriton import ir
+
 __all__ = ["async_copy", "fence_async_shared", "mbarrier", "tma", "warpgroup_mma", "warpgroup_mma_wait"]
 
 
@@ -18,6 +23,43 @@ def fence_async_shared(cluster=False, _semantic=None):
     _semantic.builder.create_fence_async_shared(cluster)
 
 
+class warpgroup_mma_accumulator_type(_core.base_type):
+    tensor_type: _core.dtype
+
+    def __init__(self, tensor_type: _core.dtype):
+        self.tensor_type = tensor_type
+
+    def __str__(self) -> str:
+        return f"warpgroup_mma_accumulator<{self.tensor_type}>"
+
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[warpgroup_mma_accumulator, int]:
+        return warpgroup_mma_accumulator(handles[cursor], self.tensor_type), cursor + 1
+
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        self.tensor_type._flatten_ir_types(builder, out)
+
+    def mangle(self) -> str:
+        return f"FT{self.tensor_type.mangle()}FT"
+
+
+class warpgroup_mma_accumulator(_core.base_value):
+    handle: ir.value
+    type: warpgroup_mma_accumulator_type
+
+    def __init__(self, handle, tensor_type: _core.dtype):
+        self.handle = handle
+        self.type = warpgroup_mma_accumulator_type(tensor_type)
+
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        handles.append(self.handle)
+
+
+@_core.builtin
+def warpgroup_mma_init(value, _semantic):
+    assert isinstance(value, _core.tensor)
+    return warpgroup_mma_accumulator(value.handle, value.type)
+
+
 @_core.builtin
 def warpgroup_mma(a, b, acc, *, use_acc=True, precision=None, max_num_imprecise_acc=None, is_async=False,
                   _semantic=None):
@@ -35,7 +77,7 @@ def warpgroup_mma(a, b, acc, *, use_acc=True, precision=None, max_num_imprecise_
         is_async (bool): Whether operation is asynchronous. Defaults to False.
 
     Returns:
-        tensor: Result of warpgroup MMA operation.
+        tensor or warpgroup_mma_accumulator: Returns the result if synchronous, or a token to load the value once computed if asynchronous.
     """
     use_acc = _semantic.to_tensor(use_acc)
 
@@ -59,7 +101,11 @@ def warpgroup_mma(a, b, acc, *, use_acc=True, precision=None, max_num_imprecise_
 
     handle = _semantic.builder.create_warpgroup_mma(a.handle, b.handle, acc.handle, use_acc.handle, precision,
                                                     max_num_imprecise_acc, is_async)
-    return _core.tensor(handle, acc.type)
+    tensor_ty = acc.type.tensor_type if isinstance(acc, warpgroup_mma_accumulator) else acc.type
+    if is_async:
+        return warpgroup_mma_accumulator(handle, tensor_ty)
+    else:
+        return _core.tensor(handle, tensor_ty)
 
 
 @_core.builtin
@@ -71,10 +117,13 @@ def warpgroup_mma_wait(num_outstanding=0, deps=None, _semantic=None):
         num_outstanding (int): Number of outstanding warpgroup MMA operations to wait for. Defaults to 0.
         deps (Sequence[tensor]): List of dependencies that need to be kept alive while the mma is unfinished.
     """
+    if deps is None:
+        raise ValueError("warpgroup_mma_wait deps must be given")
     deps_handles = [x.handle for x in deps] if deps is not None else []
     num_outstanding = _core._unwrap_if_constexpr(num_outstanding)
    results = _semantic.builder.create_warpgroup_mma_wait(deps_handles, num_outstanding)
-    results = tuple(unflatten_ir_values(results, [dep.type for dep in deps]))
-    if len(results) == 1:
-        return results[0]
+    result_types = [dep.type.tensor_type if isinstance(dep, warpgroup_mma_accumulator) else dep.type for dep in deps]
+    results = unflatten_ir_values(results, result_types)
+    if len(deps) == 1:
+        return next(results)
     return tuple(results)
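
The new classes mainly implement Gluon's `base_type`/`base_value` flattening contract, which is what lets the accumulator be carried across statements and control flow. Purely as an illustration (not part of the commit), the round trip can be exercised outside a JIT context with a plain string standing in for the `ir.value` handle; the import paths are assumed to mirror the file layout:

    from triton.experimental.gluon import language as ttgl
    from triton.experimental.gluon.language.nvidia.hopper import warpgroup_mma_accumulator

    acc = warpgroup_mma_accumulator("fake_handle", ttgl.float32)
    handles = []
    acc._flatten_ir(handles)                          # handles == ["fake_handle"]
    rebuilt, cursor = acc.type._unflatten_ir(handles, 0)
    assert isinstance(rebuilt, warpgroup_mma_accumulator) and cursor == 1
    print(acc.type)                                   # e.g. warpgroup_mma_accumulator<fp32>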

python/triton/experimental/gluon/language/nvidia/hopper/tma.py

Lines changed: 1 addition & 1 deletion

@@ -42,7 +42,7 @@ def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
         self.strides_type._flatten_ir_types(builder, out)
 
     def mangle(self) -> str:
-        return f"TD{self.block_type.mangle}_{self.layout.mangle()}TD"
+        return f"TD{self.block_type.mangle()}_{self.layout.mangle()}TD"
 
 
 class tensor_descriptor(base_value):
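
The one-character fix above addresses a common Python pitfall: interpolating a bound method instead of calling it embeds the method's repr in the mangled name. A standalone illustration with a hypothetical class (not from the Triton code base):

    class Layout:
        def mangle(self):
            return "L1"

    layout = Layout()
    print(f"TD{layout.mangle}TD")    # TD<bound method Layout.mangle of ...>TD  (old behaviour)
    print(f"TD{layout.mangle()}TD")  # TDL1TD                                   (fixed)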
