Commit d846dcf

allanrenucci authored and Google-ML-Automation committed

[Pallas:MGPU] Add Pallas lowering for TMEM slices under WG semantics.

We lower TMEM slices to `memref.subview`.

PiperOrigin-RevId: 831909484

1 parent 7774f56 · commit d846dcf
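
For orientation, a hedged sketch of the user-visible effect (the kernel and shapes are illustrative, adapted from the tests in this commit): slicing a TMEM ref in a Pallas Mosaic GPU kernel now also works under warpgroup (WG) lowering semantics, where the slice is expressed as a `memref.subview` op.

    # Sketch only: `.at[:, :128]` column-slices the TMEM ref; under WG
    # semantics this now lowers through memref.subview instead of raising.
    from jax.experimental.pallas import mosaic_gpu as plgpu

    def kernel(o_ref, tmem_ref):
      del o_ref
      plgpu.print_layout("tmem: {}", tmem_ref.at[:, :128])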

File tree: 8 files changed (+168, -66 lines)

jax/_src/pallas/mosaic_gpu/lowering.py (4 additions, 1 deletion)

@@ -1449,7 +1449,10 @@ def _bubble_up(untransform_fn, data):
   indices = _bubble_up(
       lambda t, idxs: t.untransform_index(mlir_dtype, idxs), indices
   )
-  if isinstance(transformed_ref, tcgen05.TMEMRef):
+  if (
+      isinstance(transformed_ref, tcgen05.TMEMRef)
+      and ctx.module_ctx.lowering_semantics == mgpu.LoweringSemantics.Lane
+  ):
     transformed_ref = transformed_ref.slice(*indices)
   else:
     transformed_ref = mgpu.memref_slice(transformed_ref, indices)
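
Reading note (inferred from this change, not stated in it): under Lane semantics the ref at this point is a `tcgen05.TMEMRef`, which is sliced directly via `TMEMRef.slice`; under WG semantics TMEM refs are still ordinary memref values here, so they now fall through to `mgpu.memref_slice`, which emits the `memref.subview` that the new dialect lowering rule below handles.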

jax/experimental/mosaic/gpu/dialect_lowering.py (34 additions, 13 deletions)

@@ -1441,6 +1441,29 @@ def _memref_subview_op_lowering_rule(
 ) -> Sequence[ir.Value]:
   del ctx

+  if any(s != 1 for s in op.static_strides):
+    raise NotImplementedError("SubViewOp only supports static strides of 1.")
+  if op.sizes:
+    raise NotImplementedError("SubViewOp only supports static sizes.")
+  src_ty = ir.MemRefType(op.source.type)
+
+  if utils.is_memref_transposed(src_ty):
+    raise NotImplementedError("SubViewOp does not support transposed memrefs.")
+
+  if utils.is_tmem_ref(src_ty):
+    [in_tmem_layout] = inference_utils.in_tmem_layouts(op)
+    [out_tmem_layout] = inference_utils.out_tmem_layouts(op)
+    assert in_tmem_layout == out_tmem_layout
+    ref = _tmem_ref_from_ir(op.source, in_tmem_layout)
+    indices = []
+    dynamic_offset_index = 0
+    for offset, size in zip(op.static_offsets, op.static_sizes, strict=True):
+      if ir.ShapedType.is_dynamic_size(offset):
+        offset = op.offsets[dynamic_offset_index]
+        dynamic_offset_index += 1
+      indices.append(utils.DynamicSlice(offset, size))
+    return [_tmem_ref_to_ir(ref.slice(*indices))]
+
   in_transforms = inference_utils.in_transforms(op)[0]
   out_transforms = inference_utils.out_transforms(op)[0]
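
The static/dynamic walk in the new TMEM branch follows MLIR's encoding for mixed subview offsets: `static_offsets` holds one entry per dimension, with a dynamic-size sentinel wherever the offset is an SSA value, and the SSA values themselves are stored in order in the op's variadic `offsets` operands. A standalone sketch of that decoding, with a hypothetical helper name and import paths as used in JAX's Mosaic GPU sources:

    from jax._src.lib.mlir import ir
    from jax._src.lib.mlir.dialects import memref

    def mixed_offsets(op: memref.SubViewOp) -> list:
      """Merges static offsets with the SSA values backing dynamic ones."""
      offsets = []
      dynamic_offset_index = 0
      for offset in op.static_offsets:
        if ir.ShapedType.is_dynamic_size(offset):
          # Dynamic: the value is the next entry of the variadic `offsets`.
          offsets.append(op.offsets[dynamic_offset_index])
          dynamic_offset_index += 1
        else:
          offsets.append(offset)  # Static: a plain Python int.
      return offsets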

@@ -1449,22 +1472,11 @@ def _memref_subview_op_lowering_rule(
       "SubViewOp transforms for the input and output refs must be identical."
   )

-  if any(s != 1 for s in op.static_strides):
-    raise NotImplementedError(
-        "SubViewOp only supports static strides of 1."
-    )
-
-  if utils.is_memref_transposed(op.source.type):
-    raise NotImplementedError(
-        "SubViewOp does not support transposed memrefs."
-    )
-
   unwrapped_source_ref = unwrap_transformed_memref(op.source, in_transforms)
   swizzle, transforms = swizzle_and_transforms_from_transforms_attr(out_transforms)
   if swizzle != mgpu.SwizzlingMode.kNoSwizzle:
-    source_ty = ir.MemRefType(op.source.type)
-    swizzle_elems = swizzle * 8 // utils.bitwidth(source_ty.element_type)
-    source_strides, _ = source_ty.get_strides_and_offset()
+    swizzle_elems = swizzle * 8 // utils.bitwidth(src_ty.element_type)
+    source_strides, _ = src_ty.get_strides_and_offset()
     for stride, offset, size in zip(
         source_strides, op.static_offsets, op.static_sizes, strict=True
     ):
@@ -1774,6 +1786,14 @@ def _tmem_ref_from_ir(
   return tcgen05.TMEMRef(tmem_addr, shape, el_ty, tmem_layout)


+def _tmem_ref_to_ir(ref: tcgen05.TMEMRef) -> ir.Value:
+  """Returns an IR value from a TMEMRef."""
+  type = ir.MemRefType.get(ref.shape, ref.dtype, memory_space=mgpu_utils.tmem())
+  cast = builtin.UnrealizedConversionCastOp([type], [ref.address])
+  cast.attributes["layout"] = layouts_lib.to_layout_attr(ref.layout)
+  return cast.result
+
+
 @_register_lowering(mgpu.TcGen05MMAOp)
 def _tcgen05_mma_op_lowering_rule(
     ctx: LoweringContext, op: mgpu.TcGen05MMAOp
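
`_tmem_ref_to_ir` is the inverse of the existing `_tmem_ref_from_ir`: it wraps the TMEM address in an `unrealized_conversion_cast` back to a TMEM-space memref and records the layout as an attribute. A sketch of the round trip as the subview rule uses it (slice bounds are illustrative; assumes `op`, `layout`, and the module's own imports are in scope):

    # memref value -> TMEMRef -> sliced TMEMRef -> memref value
    ref = _tmem_ref_from_ir(op.source, layout)
    sliced = ref.slice(utils.DynamicSlice(0, 128), utils.DynamicSlice(8, 64))
    result = _tmem_ref_to_ir(sliced)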
@@ -2155,6 +2175,7 @@ def _should_lower(op: ir.OpView) -> bool:
       op.OPERATION_NAME.startswith("mosaic_gpu.")  # pytype: disable=attribute-error
       or inference_utils.should_have_layout(op)
       or inference_utils.should_have_transforms(op)
+      or inference_utils.should_have_tmem_layout(op)
       or any(bool(b) for r in op.regions for b in r)  # Does it have subblocks?
   )

jax/experimental/mosaic/gpu/equations.py (2 additions, 0 deletions)

@@ -529,6 +529,8 @@ def holds(self) -> bool | None:
         tiling = t
       case RegisterLayout(value=fa.TiledLayout() as layout):
         tiling = layout.base_tile_shape
+      case TMEMLayout(value):
+        tiling = value.base_tile_shape
       case _:
         return None
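
With this case, `Divides` constraints can now be checked against TMEM layouts too: the constraint holds when the queried shape is divisible by the layout's `base_tile_shape`. A usage sketch, mirroring the tests added below:

    from jax.experimental.mosaic.gpu import equations, tcgen05

    layout = tcgen05.tmem_default_layout(packing=1)
    tiling = equations.TMEMLayout(layout)
    assert equations.Divides(tiling, (0, 64)).holds()      # divisible
    assert not equations.Divides(tiling, (3, 64)).holds()  # 3 is not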

jax/experimental/mosaic/gpu/layout_inference.py (2 additions, 2 deletions)

@@ -1450,7 +1450,7 @@ def _memref_subview_equation_system(
   dest = ValueSite(op, VariableType.RESULT, 0)
   source_dest_var = ctx.producer_ref(source)

-  if any(map(lambda s: s != 1, op.static_strides)):
+  if any(s != 1 for s in op.static_strides):
     raise NotImplementedError(
         f"Only unit strides are supported but got {op.static_strides}."
     )

@@ -1473,7 +1473,7 @@ def _memref_subview_equation_system(
       if ir.ShapedType.is_dynamic_size(size):
         tiling_multiple = []
       else:
-        src_type = ir.ShapedType(op.source.type)
+        src_type = ir.MemRefType(op.source.type)
         divisibility_constraint = math.gcd(size, src_type.shape[i])
         if isinstance(offset, int):
           divisibility_constraint = math.gcd(divisibility_constraint, offset)
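
For intuition on the gcd chain above, with hypothetical numbers: a subview taking size 64 out of a source dimension of 256 at static offset 8 yields gcd(gcd(64, 256), 8) = 8, so only tilings dividing 8 along that dimension remain candidates:

    import math

    size, src_dim, offset = 64, 256, 8  # hypothetical subview parameters
    constraint = math.gcd(size, src_dim)       # 64
    constraint = math.gcd(constraint, offset)  # 8: a tiling must divide this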

tests/mosaic/gpu_equations_test.py (9 additions, 0 deletions)

@@ -21,6 +21,7 @@
 from jax.experimental.mosaic.gpu import equations
 from jax.experimental.mosaic.gpu import fragmented_array as fa
 from jax.experimental.mosaic.gpu import launch_context as lc
+from jax.experimental.mosaic.gpu import tcgen05

 config.parse_flags_with_absl()

@@ -469,6 +470,10 @@ def test_divides_constraints_are_satisfied_by_divisor_tiling(self):
     with self.subTest("RegisterLayout"):
       tiling = equations.RegisterLayout(fa.WGMMA_LAYOUT)
       self.assertTrue(equations.Divides(tiling, (0, 64)).holds())
+    with self.subTest("TMEMLayout"):
+      layout = tcgen05.tmem_default_layout(packing=1)
+      tiling = equations.TMEMLayout(layout)
+      self.assertTrue(equations.Divides(tiling, (0, 64)).holds())

   def test_divides_constraints_are_not_satisfied_by_non_divisor_tiling(self):
     with self.subTest("SMEMTiling"):

@@ -477,6 +482,10 @@ def test_divides_constraints_are_not_satisfied_by_non_divisor_tiling(self):
     with self.subTest("RegisterLayout"):
       tiling = equations.RegisterLayout(fa.WGMMA_LAYOUT)
       self.assertFalse(equations.Divides(tiling, (3, 64)).holds())
+    with self.subTest("TMEMLayout"):
+      layout = tcgen05.tmem_default_layout(packing=1)
+      tiling = equations.TMEMLayout(layout)
+      self.assertFalse(equations.Divides(tiling, (3, 64)).holds())

   def test_reduce_merges_divides_constraints_on_same_variable(self):
     v0, v1 = equations.Variable(0), equations.Variable(1)

tests/mosaic/gpu_layout_inference_test.py (75 additions, 41 deletions)

@@ -1660,36 +1660,52 @@ def test_infer_transforms_for_memref_cast_op(self, annotate_producer):
   def test_infer_transforms_for_subview_raises_on_slice_incompatible_with_tile(
       self, annotate_input
   ):
-    shape = (2, 64, 64)
-    elt_ty = ir.BF16Type.get()
-
-    in_ref_ty = ir.MemRefType.get(shape, elt_ty, memory_space=mgpu.utils.smem())
-    out_ref_ty = ir.MemRefType.get((2, 64, 32), elt_ty, memory_space=mgpu.utils.smem())
-
     with ir.InsertionPoint(self.module.body):
+      in_ref_ty = ir.MemRefType.get(
+          (2, 64, 64), ir.BF16Type.get(), memory_space=mgpu.utils.smem()
+      )
       [in_ref] = undefs(in_ref_ty)

       transforms = ir.ArrayAttr.get([
-        mgpu.dialect.TileTransformAttr.get((32, 16)),
-        mgpu.dialect.SwizzleTransformAttr.get(32),
+          mgpu.dialect.TileTransformAttr.get((32, 16)),
+          mgpu.dialect.SwizzleTransformAttr.get(32),
       ])

       if annotate_input:
         in_ref = mgpu.dialect.with_transforms(in_ref, transforms)

-      subview_op = memref.SubViewOp(
-        out_ref_ty,
-        in_ref,
-        [],
-        [],
-        [],
-        static_offsets = [1, 0, 0],
-        static_sizes = [2, 64, 8],
-        static_strides = [1, 1, 1]
+      out_ref = memref.subview(
+          in_ref, offsets=[1, 0, 0], sizes=[2, 64, 8], strides=[1, 1, 1]
+      )
+
+      if not annotate_input:
+        mgpu.dialect.with_transforms(out_ref, transforms)
+
+      with self.assertRaisesRegex(ValueError, "Failed to infer"):
+        mgpu.infer_layout(self.module)
+
+  @parameterized.parameters([False, True])
+  def test_infer_tmem_layouts_for_subview_raises_on_slice_incompatible_with_tile(
+      self, annotate_input
+  ):
+    with ir.InsertionPoint(self.module.body):
+      in_ref_ty = ir.MemRefType.get(
+          (128, 64), ir.BF16Type.get(), memory_space=mgpu.utils.tmem()
+      )
+      [in_ref] = undefs(in_ref_ty)
+
+      layout = tcgen05.tmem_default_layout(packing=1)
+      layout_attr = layouts.to_layout_attr(layout)
+
+      if annotate_input:
+        in_ref = mgpu.dialect.tmem_layout_cast(in_ref, layout_attr)
+
+      out_ref = memref.subview(
+          in_ref, offsets=[1, 0], sizes=[2, 64], strides=[1, 1]
       )

       if not annotate_input:
-        mgpu.dialect.with_transforms(subview_op.result, transforms)
+        mgpu.dialect.tmem_layout_cast(out_ref, layout_attr)

       with self.assertRaisesRegex(ValueError, "Failed to infer"):
         mgpu.infer_layout(self.module)

@@ -1807,13 +1823,10 @@ def test_infer_transforms_for_sibling_subviews_and_distant_op(
   def test_infer_transforms_for_subview_handles_dynamic_offsets(
       self, annotate_input
   ):
-    shape = (32, 32, 32, 32)
-    elt_ty = ir.BF16Type.get()
-
-    in_ref_ty = ir.MemRefType.get(shape, elt_ty, memory_space=mgpu.utils.smem())
-    out_ref_ty = ir.MemRefType.get((16, 16, 32, 32), elt_ty, memory_space=mgpu.utils.smem())
-
     with ir.InsertionPoint(self.module.body):
+      in_ref_ty = ir.MemRefType.get(
+          (32, 32, 32, 32), ir.BF16Type.get(), memory_space=mgpu.utils.smem()
+      )
       [in_ref] = undefs(in_ref_ty)

       transforms = ir.ArrayAttr.get([

@@ -1825,34 +1838,55 @@ def test_infer_transforms_for_subview_handles_dynamic_offsets(
       in_ref = mgpu.dialect.with_transforms(in_ref, transforms)

       c = lambda x: arith.constant(ir.IntegerType.get_signless(32), x)
-      subview_op = memref.SubViewOp(
-        out_ref_ty,
+      out_ref = memref.subview(
          in_ref,
-        [c(16), c(4), arith.muli(c(8), c(3))],
-        [],
-        [],
-        static_offsets=[
-          ir.ShapedType.get_dynamic_size(),
-          ir.ShapedType.get_dynamic_size(),
-          ir.ShapedType.get_dynamic_size(),
-          0,
-        ],
-        static_sizes=[16, 16, 32, 32],
-        static_strides=[1, 1, 1, 1],
+          offsets=[c(16), c(4), arith.muli(c(8), c(3)), 0],
+          sizes=[16, 16, 32, 32],
+          strides=[1, 1, 1, 1],
       )

       if not annotate_input:
-        mgpu.dialect.with_transforms(subview_op.result, transforms)
+        mgpu.dialect.with_transforms(out_ref, transforms)

       mgpu.infer_layout(self.module)
-
       self.assertSequenceEqual(
-        inference_utils.in_transforms(subview_op), [transforms]
+          inference_utils.in_transforms(out_ref.owner), [transforms]
       )
       self.assertSequenceEqual(
-        inference_utils.out_transforms(subview_op), [transforms]
+          inference_utils.out_transforms(out_ref.owner), [transforms]
       )

+  @parameterized.parameters([False, True])
+  def test_infer_tmem_layouts_for_subview_handles_dynamic_offsets(
+      self, annotate_input
+  ):
+    with ir.InsertionPoint(self.module.body):
+      in_ref_ty = ir.MemRefType.get(
+          (128, 256), ir.BF16Type.get(), memory_space=mgpu.utils.tmem()
+      )
+      [in_ref] = undefs(in_ref_ty)
+
+      layout = tcgen05.tmem_default_layout(packing=1)
+      layout_attr = layouts.to_layout_attr(layout)
+
+      if annotate_input:
+        in_ref = mgpu.dialect.tmem_layout_cast(in_ref, layout_attr)
+
+      c = lambda x: arith.constant(ir.IntegerType.get_signless(32), x)
+      out_ref = memref.subview(
+          in_ref,
+          offsets=[c(0), arith.muli(c(16), c(4))],
+          sizes=[128, 128],
+          strides=[1, 1],
+      )
+
+      if not annotate_input:
+        mgpu.dialect.tmem_layout_cast(out_ref, layout_attr)
+
+      mgpu.infer_layout(self.module)
+      self.checkInTmemLayouts(out_ref.owner, [layout])
+      self.checkOutTmemLayouts(out_ref.owner, [layout])
+
   def test_custom_primitive_op_retains_transforms(self):
     with ir.InsertionPoint(self.module.body):
       transforms = ir.ArrayAttr.get([
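
A note on the test refactoring above (our reading of the diff): the `memref.subview` convenience builder accepts mixed Python ints and `ir.Value`s for `offsets`, derives the result memref type itself, and returns the result `Value`, which is why the assertions now reach the op through `out_ref.owner`. Sketch, assuming an MLIR context, insertion point, and `in_ref` are in scope:

    out_ref = memref.subview(
        in_ref, offsets=[1, 0, 0], sizes=[2, 64, 8], strides=[1, 1, 1]
    )
    subview_op = out_ref.owner  # the memref.SubViewOp behind the result Value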

tests/mosaic/gpu_test.py (40 additions, 0 deletions)

@@ -5570,6 +5570,46 @@ def body(ctx, x, y, x_out, y_out, tmem):
     self.assertArraysEqual(x_out, x)
     self.assertArraysEqual(y_out, y)

+  def test_tmem_subview(self):
+    def body(ctx, in_ref, out_ref, tmem):
+      del ctx
+      # GMEM -> Registers -> TMEM
+      in_reg = mgpu_dialect.vector_load(in_ref)
+      slice_in = memref.subview(
+          tmem, offsets=[0, 8], sizes=[128, 200], strides=[1, 1]
+      )
+      slice_in = memref.subview(
+          slice_in, offsets=[0, 0], sizes=[128, 128], strides=[1, 1]
+      )
+      mgpu_dialect.async_store_tmem(in_reg, slice_in)
+      tcgen05.commit_tmem()
+
+      def dynamic_idx(idx: int) -> ir.Value:
+        idx_type = ir.IndexType.get()
+        return arith.constant(idx_type, idx)
+
+      # TMEM -> Registers -> GMEM
+      slice_out = memref.subview(
+          tmem,
+          offsets=[dynamic_idx(0), dynamic_idx(8)],
+          sizes=[128, 128],
+          strides=[1, 1],
+      )
+      out_reg = mgpu_dialect.async_load_tmem(slice_out)
+      mgpu_dialect.vector_store(out_reg, out_ref)
+
+    kernel = mgpu.as_gpu_kernel(
+        body,
+        grid=(1, 1, 1),
+        block=(128, 1, 1),
+        in_shape=jax.ShapeDtypeStruct((128, 128), jnp.float32),
+        out_shape=jax.ShapeDtypeStruct((128, 128), jnp.float32),
+        smem_scratch_shape=mgpu.TMEM((128, 256), jnp.float32),
+        thread_semantics=mgpu.LoweringSemantics.Warpgroup,
+    )
+    x = self.prng.uniform(-100, 100, (128, 128)).astype(jnp.float32)
+    self.assertArraysEqual(kernel(x), x)
+

 class UtilsTest(TestCase):
   @parameterized.parameters(
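
Note on test_tmem_subview (our arithmetic, not from the diff): the two nested store-side subviews compose, offsets add and the inner sizes win, so they select the same (128, 128) window at column offset 8 that the dynamic-offset subview later reads back; the kernel is therefore an identity copy.

    # subview(tmem, offsets=[0, 8], sizes=[128, 200])
    #   then subview(., offsets=[0, 0], sizes=[128, 128])
    # == subview(tmem, offsets=[0, 8], sizes=[128, 128])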

tests/pallas/mosaic_gpu_test.py (2 additions, 9 deletions)

@@ -3204,13 +3204,8 @@ def test_print_layout_tmem(self):
     )
     def kernel(o_ref, tmem_ref):
       del o_ref
-      if self.LOWERING_SEMANTICS == plgpu.LoweringSemantics.Lane:
-        # Slicing TMEM to make sure we handle transforms correctly.
-        plgpu.print_layout("tmem: {}", tmem_ref.at[:, :128])
-      else:
-        # TODO(b/415721295): Remove this branch once TMEM slicing is supported
-        # for WG semantics.
-        plgpu.print_layout("tmem: {}", tmem_ref)
+      # Slicing TMEM to make sure we handle transforms correctly.
+      plgpu.print_layout("tmem: {}", tmem_ref.at[:, :128])

     with self.capture_stdout() as output:
       jax.block_until_ready(kernel())

@@ -3412,7 +3407,6 @@ def kernel(x_ref, y_ref, tmem_ref, smem_ref, barrier_ref):
     np.testing.assert_array_equal(x_result, x + 1)

   def test_tmem_column_slicing(self):
-    self.skip_if_wg_semantics()
     transforms = self.default_transforms(dtype=jnp.float32)
     @functools.partial(
         self.kernel,

@@ -3806,7 +3800,6 @@ def kernel(a_gmem, b_gmem, out_gmem,
     np.testing.assert_allclose(result, x @ y, rtol=1e-3)

   def test_matmul_with_sliced_accumulator(self):
-    self.skip_if_wg_semantics()  # Slicing TMEM is not supported.
     dtype = jnp.bfloat16
     shape = (128, 128)
     tmem_shape = (128, 2 * 128)
