Commit ba9bd40

[FRONTEND] Support for ragged TMAs (#7783)
This allows us to use higher-dimensional TMA descriptors to emulate ragged-batching support with automatic bounds checking.
1 parent 376b9b9 commit ba9bd40

File tree

2 files changed: +126 -0 lines changed
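To make the intent concrete, here is a minimal sketch (not part of this commit) of how the new helpers might be used: one buffer concatenates variable-length segments along dim 0, and each program instance copies one BLOCK-row tile of one segment, with the hardware masking any rows outside [offset, offset + size). Descriptors are created host-side and passed to the kernel, as in the test below; the driver function copy_ragged, its argument names, and the [BLOCK, 128] box shape are illustrative assumptions.

import torch
import triton
import triton.language as tl
from triton.tools.ragged_tma import create_ragged_descriptor, load_ragged, store_ragged


@triton.jit
def copy_segment_kernel(X, Y, off, size, BLOCK: tl.constexpr):
    # each program copies one BLOCK-row tile of a single segment; rows
    # beyond `size` read back as zeros and their stores are masked
    r = tl.program_id(0) * BLOCK
    data = load_ragged(X, off, size, [r, 0])
    store_ragged(Y, off, size, [r, 0], data)


def copy_ragged(src, dst, offsets, sizes, BLOCK=32):
    # hypothetical host-side driver: (offsets[i], sizes[i]) delimit segment i
    X = create_ragged_descriptor(src, [BLOCK, 128])
    Y = create_ragged_descriptor(dst, [BLOCK, 128])
    for off, size in zip(offsets, sizes):
        copy_segment_kernel[(triton.cdiv(size, BLOCK), )](X, Y, off, size, BLOCK=BLOCK)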


python/test/unit/cuda/test_tma_descriptor.py

Lines changed: 45 additions & 0 deletions
@@ -1,6 +1,8 @@
from contextlib import nullcontext
import pytest
import torch
import triton
from triton.tools.ragged_tma import create_ragged_descriptor, load_ragged, store_ragged
from triton.tools.tensor_descriptor import TensorDescriptor


@@ -44,3 +46,46 @@ def test_2d_tma_descriptor_exception(M, N, BLOCK_M, BLOCK_N, expect_error_n, exp
    ctx = pytest.raises(exc_type, match=match) if expect_error else nullcontext()
    with ctx:
        _ = TensorDescriptor.from_tensor(A, [BLOCK_M, BLOCK_N])


@triton.jit
def example_load_store_kernel(X, Y, x_off, y_off, x_size, y_size):
    data = load_ragged(X, x_off, x_size, [0, 0])
    store_ragged(Y, y_off, y_size, [0, 0], data)


@pytest.mark.parametrize("dtype", ["float16", "float32", "float64"])
def test_ragged_tma(dtype):
    if not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 9:
        pytest.skip("Test requires Hopper or Blackwell target.")
        return

    dtype = getattr(torch, dtype)

    src = torch.randn((1024, 80), dtype=torch.float32, device="cuda").to(dtype)
    ref = torch.randn((1024, 80), dtype=torch.float32, device="cuda").to(dtype)
    dst = 1.0 * ref

    X = create_ragged_descriptor(src, [32, 128])
    Y = create_ragged_descriptor(dst, [32, 128])

    x_off = 42
    y_off = 51
    x_size = 17
    y_size = 24

    example_load_store_kernel[(1, )](X, Y, x_off, y_off, x_size, y_size)

    # the initial and final segments are unchanged:
    res0 = torch.equal(dst[:y_off], ref[:y_off])
    res1 = torch.equal(dst[y_off + y_size:], ref[y_off + y_size:])

    # this segment will be copied verbatim from src:
    res2 = torch.equal(dst[y_off:y_off + x_size], src[x_off:x_off + x_size])

    # this segment will have read OOB zeroes and written them here:
    res3 = torch.all(dst[y_off + x_size:y_off + y_size] == 0.0).item()

    assert [res0, res1, res2, res3] == [True, True, True, True]
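A worked check of what the ragged semantics imply for the constants above (an illustration, not part of the diff): the kernel loads one 32-row box starting at row 0 of src's segment [42, 42 + 17) and stores it at row 0 of dst's segment [51, 51 + 24). Box rows 0..16 carry real data, so dst[51:68] == src[42:59]; box rows 17..23 fall outside the source segment, so the load returns zeros and dst[68:75] == 0; box rows 24..31 fall outside the destination segment, so those stores are masked and dst[75:] keeps ref's values. These are exactly the four assertions res0..res3.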

python/triton/tools/ragged_tma.py

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
import triton
import triton.language as tl
from triton.tools.tensor_descriptor import TensorDescriptor

# fmt: off


def create_ragged_descriptor(T, block_shape):
    """
    Given a 2- or 3-dimensional tensor T, this creates a 'ragged descriptor'
    which behaves like a concatenation (along the first axis) of subarrays
    of potentially unequal size.

    The load_ragged and store_ragged device functions can be used to read
    and write from subarrays T[batch_offset : batch_offset + batch_size]
    with hardware bounds-checking preventing any sort of leakage outside
    the subarray.
    """

    block_shape = list(block_shape)
    tensor_shape = list(T.shape)

    assert 2 <= len(tensor_shape) <= 3, "ragged tensors must have dimension 2 or 3"
    assert len(tensor_shape) == len(block_shape), "block shape must match tensor shape"

    max_int = 0x7fff0000
    billion = 0x40000000  # == 2**30

    assert tensor_shape[0] <= billion, "number of rows may not exceed 2**30"

    # we prepend an extra two dimensions and rely on the fact that pointers
    # have 64-bit wraparound semantics:
    tma_stride = [2**34 - T.stride(0), T.stride(0)] + [T.stride(i) for i in range(len(tensor_shape))]
    tma_shape = [max_int, max_int, billion] + tensor_shape[1:]
    box_shape = [1, 1] + block_shape

    return TensorDescriptor(T, tma_shape, tma_stride, box_shape)


@triton.jit
def to_ragged_indices(batch_offset, batch_size, row):
    """
    Helper function for load_ragged and store_ragged.
    """

    billion = 0x40000000  # == 2**30
    x = billion - batch_size + row
    y = batch_offset + batch_size

    return billion, y, x


@triton.jit
def load_ragged(TMA, batch_offset, batch_size, coords):
    """
    Read from a subarray T[batch_offset : batch_offset + batch_size] with
    hardware bounds-checking, where reading outside the subarray gives zeros.

    Coords should be an appropriately-sized list of integers, just like in
    TMA.load().
    """

    c0, c1, c2 = to_ragged_indices(batch_offset, batch_size, coords[0])
    data = TMA.load([c0, c1, c2] + coords[1:])
    data = tl.reshape(data, data.shape[2:])
    return data


@triton.jit
def store_ragged(TMA, batch_offset, batch_size, coords, data):
    """
    Write to a subarray T[batch_offset : batch_offset + batch_size] with
    hardware bounds-checking, where writes outside the subarray are masked
    correctly.

    Coords should be an appropriately-sized list of integers, just like in
    TMA.store().
    """

    c0, c1, c2 = to_ragged_indices(batch_offset, batch_size, coords[0])
    data = tl.reshape(data, [1, 1] + data.shape)
    TMA.store([c0, c1, c2] + coords[1:], data)
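The 64-bit wraparound trick can be sanity-checked with plain Python integers. Here is a sketch (not part of the commit) of the effective element offset produced by the indices from to_ragged_indices, for a 2D tensor with dim-0 stride s0; the helper name effective_offset is illustrative.

def effective_offset(batch_offset, batch_size, row, s0):
    # indices chosen by to_ragged_indices ...
    billion = 2**30
    c0 = billion
    c1 = batch_offset + batch_size
    c2 = billion - batch_size + row
    # ... dotted with the strides chosen by create_ragged_descriptor;
    # c0 * (2**34 - s0) contributes 2**30 * 2**34 == 2**64, which wraps
    # to 0, minus 2**30 * s0, which cancels against the 2**30 * s0
    # hidden inside c2 * s0
    strides = [2**34 - s0, s0, s0]
    return (c0 * strides[0] + c1 * strides[1] + c2 * strides[2]) % 2**64


# the descriptor therefore addresses row batch_offset + row of the tensor:
assert effective_offset(42, 17, 5, s0=80) == (42 + 5) * 80

Meanwhile the hardware bounds check on the third descriptor dimension (extent 2**30) masks exactly the rows with c2 >= 2**30, i.e. row >= batch_size, which is where the zero-fill on loads and the write-masking on stores come from.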
