Skip to content

Commit 3ccf510

Browse files
authored
Add tile.count (#955)
1 parent 843d962 commit 3ccf510

File tree

5 files changed

+142
-3
lines changed

5 files changed

+142
-3
lines changed

helion/language/tile_interface.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,15 @@ def block_size(self) -> int:
4545

4646
return tile_block_size(self)
4747

@property
def count(self) -> int:
    """
    The number of tiles along this dimension.

    Alias for :func:`~helion.language.tile_count`.
    """
    from .tile_ops import tile_count

    return tile_count(self)
4857
@property
4958
def id(self) -> int:
5059
"""

helion/language/tile_ops.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,53 @@ def _(tile: RefTile) -> int:
177177
return tile._block_size
178178

179179

@_decorators.api(tiles_as_sizes=True)
def tile_count(tile: TileInterface) -> int:
    """
    Return how many tiles cover the given tile dimension.

    For an iteration space ``[begin, end)`` this equals
    ``cdiv(end - begin, block_size)``; when iterating from 0 it reduces to
    ``cdiv(tile_end, tile.block_size)``.

    This can also be written as: `tile.count`.
    """
    # Host-side placeholder: the real value is produced during kernel tracing.
    raise exc.NotInsideKernel
191+
192+
193+
@_decorators.register_fake(tile_count)
def _(tile: torch.SymInt) -> torch.SymInt:
    # Create (or reuse, via the cache key) an unbacked symint standing in
    # for the tile count, and record which block index it originates from.
    block_index = _disable_flatten_get_tile(tile)
    count_sym = CompileEnvironment.current().cached_create_unbacked_symint(
        ("tile_count", tile)
    )
    _register_tile_symbol_origin(count_sym, block_index)
    return count_sym
201+
202+
203+
@_decorators.codegen(tile_count)
def _(state: CodegenState) -> ast.AST:
    block_id = _disable_flatten_get_tile(state.proxy_arg(0))
    # The innermost active device loop for this block carries the loop-end
    # variable name used in the generated code.
    loop_info = state.codegen.active_device_loops[block_id][-1].block_id_to_info[
        block_id
    ]
    size_name = state.device_function.block_size_var(block_id)
    if size_name is None:
        # Fall back to a literal 1 when no block-size variable exists.
        size_name = "1"
    return expr_from_string(f"tl.cdiv({loop_info.end_var_name}, {size_name})")
216+
217+
218+
@_decorators.ref(tile_count)
def _(tile: RefTile) -> int:
    # Ceiling division: how many block_size-sized tiles cover [begin, end).
    span = tile._slice.stop - tile._slice.start
    block = tile._block_size
    return (span + block - 1) // block
225+
226+
180227
@_decorators.api(tiles_as_sizes=True)
181228
def tile_id(tile: TileInterface) -> int:
182229
"""

helion/language/tile_proxy.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,14 @@ class Tile(TileInterface, torch.Tensor):
3434
3535
Tiles can be used as indices to tensors, e.g. `tensor[tile]`. Tiles
3636
can also be used as sizes for allocations, e.g. `torch.empty([tile])`.
37-
There are also properties such as :meth:`tile.index <index>`, :meth:`tile.begin <begin>`,
38-
:meth:`tile.end <end>`, :meth:`tile.id <id>` and :meth:`tile.block_size <block_size>` that can be used to retrieve various
39-
information about the tile.
37+
There are also properties such as
38+
* :meth:`tile.index <index>`
39+
* :meth:`tile.begin <begin>`
40+
* :meth:`tile.end <end>`
41+
* :meth:`tile.id <id>`
42+
* :meth:`tile.block_size <block_size>`
43+
* :meth:`tile.count <count>`
44+
that can be used to retrieve various information about the tile.
4045
4146
Masking is implicit for tiles, so if the final tile is smaller than
4247
the block size loading that tile will only load the valid elements

test/test_indexing.expected

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -434,6 +434,48 @@ def arange_block_size_mul(x: torch.Tensor, *, _launcher=_default_launcher):
434434
_launcher(_helion_arange_block_size_mul, (triton.cdiv(64, _BLOCK_SIZE_0),), ones, out, _BLOCK_SIZE_0, 2 * _BLOCK_SIZE_0, num_warps=4, num_stages=2)
435435
return out
436436

437+
--- assertExpectedJournal(TestIndexing.test_tile_count_top_level)
438+
from __future__ import annotations
439+
440+
import torch
441+
import triton
442+
import triton.language as tl
443+
from helion.runtime import default_launcher as _default_launcher
444+
445+
@triton.jit
446+
def _helion_fn(out, n, _BLOCK_SIZE_0: tl.constexpr):
447+
pid_0 = tl.program_id(0)
448+
offset_0 = pid_0 * _BLOCK_SIZE_0
449+
indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
450+
mask_0 = indices_0 < n
451+
tile_count = tl.cdiv(n, _BLOCK_SIZE_0)
452+
tl.store(out + indices_0 * 1, tile_count, mask_0)
453+
454+
def fn(n: int, device: torch.device, *, _launcher=_default_launcher):
455+
out = torch.zeros([n], dtype=torch.int32, device=device)
456+
_BLOCK_SIZE_0 = 64
457+
_launcher(_helion_fn, (triton.cdiv(n, _BLOCK_SIZE_0),), out, n, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
458+
return out
459+
460+
--- assertExpectedJournal(TestIndexing.test_tile_count_with_begin_end)
461+
from __future__ import annotations
462+
463+
import torch
464+
import triton
465+
import triton.language as tl
466+
from helion.runtime import default_launcher as _default_launcher
467+
468+
@triton.jit
469+
def _helion_fn(out, begin, end, _BLOCK_SIZE_0: tl.constexpr):
470+
tile_count = tl.cdiv(end + -1 * begin, _BLOCK_SIZE_0)
471+
tl.store(out + tl.zeros([], tl.int32), tile_count, None)
472+
473+
def fn(begin: int, end: int, device: torch.device, *, _launcher=_default_launcher):
474+
out = torch.zeros([1], dtype=torch.int32, device=device)
475+
_BLOCK_SIZE_0 = 32
476+
_launcher(_helion_fn, (triton.cdiv(end + -1 * begin, _BLOCK_SIZE_0),), out, begin, end, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
477+
return out
478+
437479
--- assertExpectedJournal(TestIndexing.test_tile_with_offset_block_ptr)
438480
from __future__ import annotations
439481

test/test_indexing.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,42 @@ def reduction_sum(x: torch.Tensor) -> torch.Tensor:
4646

4747

4848
class TestIndexing(RefEagerTestBase, TestCase):
49+
@skipIfRefEager(
    "Test is block size dependent which is not supported in ref eager mode"
)
def test_tile_count_top_level(self):
    # tile.count over a 0-based iteration space: every output element
    # should equal ceil(length / block_size).
    @helion.kernel
    def fn(n: int, device: torch.device) -> torch.Tensor:
        out = torch.zeros([n], dtype=torch.int32, device=device)
        for tile in hl.tile(n, block_size=64):
            out[tile] = tile.count
        return out

    length = 100
    code, result = code_and_output(fn, (length, DEVICE))
    expected = torch.full(
        [length], -(-length // 64), dtype=torch.int32, device=DEVICE
    )
    torch.testing.assert_close(result, expected)
    self.assertExpectedJournal(code)
65+
66+
@skipIfRefEager(
    "Test is block size dependent which is not supported in ref eager mode"
)
def test_tile_count_with_begin_end(self):
    # tile.count over an explicit [begin, end) range: result should equal
    # ceil((end - begin) / block_size).
    @helion.kernel
    def fn(begin: int, end: int, device: torch.device) -> torch.Tensor:
        out = torch.zeros([1], dtype=torch.int32, device=device)
        for tile in hl.tile(begin, end, block_size=32):
            out[0] = tile.count
        return out

    lo, hi = 10, 97
    code, result = code_and_output(fn, (lo, hi, DEVICE))
    expected = torch.tensor(
        [-(-(hi - lo) // 32)], dtype=torch.int32, device=DEVICE
    )
    torch.testing.assert_close(result, expected)
    self.assertExpectedJournal(code)
84+
4985
def test_arange(self):
5086
@helion.kernel
5187
def arange(length: int, device: torch.device) -> torch.Tensor:

0 commit comments

Comments
 (0)