Skip to content

Commit f5f32ad

Browse files
WindQAQ authored and Google-ML-Automation committed
[Mosaic] Allow padding in small tiling row shuffle reshape.
We just need to make sure vreg-slice is lane aligned, e.g., each row is either fully occupied or fully padded, and only last vreg contains padding on tiled dims. To cover more cases, try to infer 1d tiling with implicit second minor. For example, reshape vector<16x512x56x128xbf16> to vector<16x512x7168xbf16> can use in tiling = (16, 128) and out tiling = (1, 256) and make it no-op. PiperOrigin-RevId: 831053083
1 parent 24e80c4 commit f5f32ad

File tree

2 files changed

+135
-65
lines changed

2 files changed

+135
-65
lines changed

jaxlib/mosaic/dialect/tpu/transforms/apply_vector_layout.cc

Lines changed: 59 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -5610,6 +5610,12 @@ LogicalResult reshape_rule(RewriteContext& ctx, Operation& op,
56105610
layout_in.vregSlice(ctx.target_shape);
56115611
const std::array<int64_t, 2> dst_vreg_slice =
56125612
layout_out.vregSlice(ctx.target_shape);
5613+
auto dst_vregs_shape = layout_out.tileArrayShape(
5614+
/*src_is_implicit=*/false, /*res_is_implicit=*/true, dst_shape,
5615+
ctx.target_shape);
5616+
auto src_vregs_shape = layout_in.tileArrayShape(
5617+
/*src_is_implicit=*/false, /*res_is_implicit=*/true, src_shape,
5618+
ctx.target_shape);
56135619
if (layout_in.tiling() == layout_out.tiling() &&
56145620
layout_in.offsets() == layout_out.offsets() &&
56155621
src_tiled_dims == dst_tiled_dims) {
@@ -5650,40 +5656,53 @@ LogicalResult reshape_rule(RewriteContext& ctx, Operation& op,
56505656
no_op = true;
56515657
}
56525658

5653-
auto can_use_row_shuffle = [&ctx](ArrayRef<int64_t> shape,
5654-
VectorLayout layout,
5655-
std::array<int64_t, 2> vreg_slice) {
5656-
if (shape.size() < 2) {
5659+
bool can_use_row_shuffle = [&]() {
5660+
if (!llvm::isPowerOf2_32(bitwidth)) {
56575661
return false;
56585662
}
5659-
// vreg must not be padded.
5660-
if (shape.back() % vreg_slice[1] != 0 ||
5661-
shape[shape.size() - 2] % vreg_slice[0] != 0) {
5663+
if (layout_in.offsets() != LayoutOffsets{0, 0} ||
5664+
layout_out.offsets() != LayoutOffsets{0, 0}) {
56625665
return false;
56635666
}
5664-
if (!llvm::isPowerOf2_32(layout.bitwidth())) {
5665-
return false;
5666-
}
5667-
if (layout.offsets() != LayoutOffsets{0, 0}) {
5668-
return false;
5669-
}
5670-
if (layout.implicit_dim() != VectorLayout::ImplicitDim::kNone) {
5667+
5668+
auto is_lane_aligned = [&](std::array<int64_t, 2> tiled_ishape,
5669+
VectorLayout layout) -> bool {
5670+
bool is_1d_tiling =
5671+
layout.tiling() ==
5672+
std::array<int64_t, 2>{1, ctx.target_shape[1] * layout.packing()};
5673+
auto vreg_slice = layout.vregSlice(ctx.target_shape);
5674+
return is_1d_tiling || tiled_ishape[1] % vreg_slice[1] == 0;
5675+
};
5676+
5677+
if (!is_lane_aligned(src_tiled_dims, layout_in) ||
5678+
!is_lane_aligned(dst_tiled_dims, layout_out)) {
56715679
return false;
56725680
}
5673-
// 2d tiling.
5674-
if (layout.tiling()[0] <= ctx.target_shape[0] * layout.packing() &&
5675-
layout.tiling()[1] == ctx.target_shape[1] &&
5676-
shape.back() == vreg_slice[1]) {
5681+
5682+
auto has_padding = [&](std::array<int64_t, 2> tiled_ishape,
5683+
VectorLayout layout) -> bool {
5684+
auto vreg_slice = layout.vregSlice(ctx.target_shape);
5685+
bool is_1d_tiling =
5686+
layout.tiling() ==
5687+
std::array<int64_t, 2>{1, ctx.target_shape[1] * layout.packing()};
5688+
if (is_1d_tiling) {
5689+
return tiled_ishape[1] % vreg_slice[1] != 0;
5690+
}
5691+
return (tiled_ishape[0] % vreg_slice[0] != 0) ||
5692+
(tiled_ishape[1] != vreg_slice[1]);
5693+
};
5694+
5695+
bool src_vreg_has_padding = has_padding(src_tiled_dims, layout_in);
5696+
bool dst_vreg_has_padding = has_padding(dst_tiled_dims, layout_out);
5697+
if (!src_vreg_has_padding && !dst_vreg_has_padding) {
56775698
return true;
56785699
}
5679-
// 1d tiling.
5680-
if (layout.tiling() ==
5681-
std::array<int64_t, 2>{1, ctx.target_shape[1] * layout.packing()} &&
5682-
shape.back() % vreg_slice[1] == 0) {
5683-
return true;
5700+
if (src_vreg_has_padding && dst_vreg_has_padding) {
5701+
return llvm::product_of(src_tiled_dims) ==
5702+
llvm::product_of(dst_tiled_dims);
56845703
}
56855704
return false;
5686-
};
5705+
}();
56875706

56885707
FAILUREOR_ASSIGN_OR_RETURN(
56895708
xla::Array<Value> src_vregs,
@@ -5715,15 +5734,11 @@ LogicalResult reshape_rule(RewriteContext& ctx, Operation& op,
57155734
layout_out.tileArrayImplicitShape(dst_shape, ctx.target_shape));
57165735
return dst_vregs_local;
57175736
} else if (
5718-
// Row shuffle within a vreg if there is no padding and each vreg holds
5719-
// a contiguous slice of the flattened data.
5720-
can_use_row_shuffle(src_shape, layout_in, src_vreg_slice) &&
5721-
can_use_row_shuffle(dst_shape, layout_out, dst_vreg_slice)) {
5737+
// Row shuffle within a vreg if each vreg holds a contiguous slice of
5738+
// the flattened data.
5739+
can_use_row_shuffle) {
57225740
auto [sublane_count, lane_count] = ctx.target_shape;
5723-
auto dst_vregs_shape =
5724-
layout_out.tileArrayShape(false, false, dst_shape, ctx.target_shape);
5725-
auto src_vregs_shape =
5726-
layout_in.tileArrayShape(false, false, src_shape, ctx.target_shape);
5741+
src_vregs.Reshape(dst_vregs_shape);
57275742
if (bitwidth == 32) {
57285743
// For 32 bit data, a sublane is effectively a physical row.
57295744
std::array<int64_t, 2> src_sublane_slice = {
@@ -5845,8 +5860,20 @@ LogicalResult reshape_rule(RewriteContext& ctx, Operation& op,
58455860
// with tiling (16, 128) and then to (8, 512) with tiling (8, 128).
58465861
const int64_t src_sublane_tiling = layout_in.tiling()[0];
58475862
const int64_t dst_sublane_tiling = layout_out.tiling()[0];
5863+
const int64_t native_sublane_tiling =
5864+
ctx.target_shape[0] * layout_in.packing();
58485865
CHECK(llvm::isPowerOf2_64(static_cast<uint64_t>(src_sublane_tiling)));
58495866
CHECK(llvm::isPowerOf2_64(static_cast<uint64_t>(dst_sublane_tiling)));
5867+
CHECK(
5868+
llvm::isPowerOf2_64(static_cast<uint64_t>(native_sublane_tiling)));
5869+
// (target_shape[0] * packing, target_shape[1]) <->
5870+
// (1, target_shape[1] * packing) is a no-op.
5871+
if ((src_sublane_tiling == 1 &&
5872+
dst_sublane_tiling == native_sublane_tiling) ||
5873+
(src_sublane_tiling == native_sublane_tiling &&
5874+
dst_sublane_tiling == 1)) {
5875+
return src_vregs;
5876+
}
58505877
tpu::PackFormat unpack_format, pack_format;
58515878
if (src_sublane_tiling > dst_sublane_tiling) {
58525879
unpack_format = tpu::PackFormat::kInterleaved;
@@ -5887,7 +5914,6 @@ LogicalResult reshape_rule(RewriteContext& ctx, Operation& op,
58875914
src_vreg->getLoc(), src_vreg->getType(), dst_vreg);
58885915
});
58895916
}
5890-
src_vregs.Reshape(dst_vregs_shape);
58915917
return src_vregs;
58925918
} else if (
58935919
// Lower shape_casts for {32/16/8}-bit types where the minor dimension

jaxlib/mosaic/dialect/tpu/transforms/infer_vector_layout.cc

Lines changed: 76 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1631,42 +1631,59 @@ class VectorLayoutInferer {
16311631
return success();
16321632
}
16331633

1634-
// Find the small tiling such that there is not padding and each vreg holds
1635-
// a continuous slice of the flatten data.
1634+
// Find the small tiling such that each vreg holds a continuous slice of the
1635+
// flatten data and each row is either fully occupied or is all padding.
16361636
auto small_second_minor_tiling_layout =
1637-
[&](ArrayRef<int64_t> shape) -> std::optional<VectorLayout> {
1637+
[&](ArrayRef<int64_t> shape,
1638+
ImplicitDim implicit_dim) -> std::optional<VectorLayout> {
1639+
if (!llvm::isPowerOf2_32(bitwidth)) {
1640+
return std::nullopt;
1641+
}
1642+
16381643
const int64_t elements_per_vreg = native_tiling[0] * native_tiling[1];
1644+
bool aligned_1d_tiling = shape.back() % elements_per_vreg == 0;
1645+
// Force 1d tiling with implicit second minor.
1646+
if (implicit_dim == ImplicitDim::kSecondMinor || aligned_1d_tiling) {
1647+
return VectorLayout(bitwidth, {0, 0}, {1, target_shape_[1] * packing},
1648+
implicit_dim);
1649+
}
1650+
1651+
CHECK_EQ(implicit_dim, ImplicitDim::kNone);
16391652
if (shape.size() < 2) {
16401653
return std::nullopt;
16411654
}
1642-
if (!llvm::isPowerOf2_32(bitwidth)) {
1655+
int64_t second_minor_tiling = elements_per_vreg / shape.back();
1656+
if (elements_per_vreg % shape.back() != 0 ||
1657+
second_minor_tiling % packing != 0 ||
1658+
second_minor_tiling > native_tiling[0]) {
16431659
return std::nullopt;
16441660
}
1645-
int64_t second_minor_tiling = elements_per_vreg / shape.back();
1646-
bool can_use_1d_tiling = shape.back() % elements_per_vreg == 0;
1647-
std::array<int64_t, 2> tiling;
1648-
if (can_use_1d_tiling) {
1649-
tiling = {1, target_shape_[1] * packing};
1650-
} else if (elements_per_vreg % shape.back() == 0 &&
1651-
second_minor_tiling % packing == 0 &&
1652-
second_minor_tiling <= native_tiling[0]) {
1653-
tiling = {second_minor_tiling, target_shape_[1]};
1654-
} else {
1661+
auto layout =
1662+
VectorLayout(bitwidth, {0, 0},
1663+
{second_minor_tiling, target_shape_[1]}, implicit_dim);
1664+
// Must be lane-aligned. This makes sure vreg is one-to-one mapping.
1665+
if (shape.back() != layout.vregSlice(target_shape_)[1]) {
16551666
return std::nullopt;
16561667
}
16571668
// TODO(b/440370770): Preserve replicated offsets.
1658-
auto layout = VectorLayout(bitwidth, {0, 0}, tiling, ImplicitDim::kNone);
1669+
return layout;
1670+
};
1671+
1672+
auto has_padding = [&](std::array<int64_t, 2> tiled_ishape,
1673+
VectorLayout layout) -> bool {
16591674
auto vreg_slice = layout.vregSlice(target_shape_);
1660-
if ((shape.back() != vreg_slice[1] && !can_use_1d_tiling) ||
1661-
shape[shape.size() - 2] % vreg_slice[0] != 0) {
1662-
return std::nullopt;
1675+
bool is_1d_tiling = layout.tiling() ==
1676+
std::array<int64_t, 2>{1, target_shape_[1] * packing};
1677+
if (is_1d_tiling) {
1678+
return tiled_ishape[1] % vreg_slice[1] != 0;
16631679
}
1664-
return layout;
1680+
return (tiled_ishape[0] % vreg_slice[0] != 0) ||
1681+
(tiled_ishape[1] != vreg_slice[1]);
16651682
};
16661683

1667-
// Use the small tiling if there's no padding and each vreg holds a
1668-
// contiguous slice of the flattened data. It makes reshape a row shuffle
1669-
// within a vreg.
1684+
// Use the small tiling if each vreg holds a contiguous slice of the
1685+
// flattened data and each row is either fully occupied or is all
1686+
// padding. It makes reshape a row shuffle within a vreg.
16701687
//
16711688
// For example,
16721689
// - (4, 256) with (4, 128) tiling to (1, 1024) with (1, 128) tiling is
@@ -1675,16 +1692,43 @@ class VectorLayoutInferer {
16751692
// - (4, 256) with (4, 128) tiling to (2, 512) with (2, 128) tiling is
16761693
// to shuffle sublane from [0, 1, 2, 3, 4, 5, 6, 7] to
16771694
// [0, 2, 4, 6, 1, 3, 5, 7]
1678-
auto src_small_second_minor_tiling_layout =
1679-
small_second_minor_tiling_layout(src_shape);
1680-
auto res_small_second_minor_tiling_layout =
1681-
small_second_minor_tiling_layout(res_shape);
1682-
1683-
if (src_small_second_minor_tiling_layout.has_value() &&
1684-
res_small_second_minor_tiling_layout.has_value()) {
1685-
setLayout(op, *src_small_second_minor_tiling_layout,
1686-
*res_small_second_minor_tiling_layout);
1687-
return success();
1695+
//
1696+
// Use implicit second minor to simplify the logic a bit.
1697+
for (ImplicitDim src_implicit_dim :
1698+
{ImplicitDim::kNone, ImplicitDim::kSecondMinor}) {
1699+
for (ImplicitDim res_implicit_dim :
1700+
{ImplicitDim::kNone, ImplicitDim::kSecondMinor}) {
1701+
auto src_small_second_minor_tiling_layout =
1702+
small_second_minor_tiling_layout(src_shape, src_implicit_dim);
1703+
auto res_small_second_minor_tiling_layout =
1704+
small_second_minor_tiling_layout(res_shape, res_implicit_dim);
1705+
if (!src_small_second_minor_tiling_layout.has_value() ||
1706+
!res_small_second_minor_tiling_layout.has_value()) {
1707+
continue;
1708+
}
1709+
auto src_layout = *src_small_second_minor_tiling_layout;
1710+
auto res_layout = *res_small_second_minor_tiling_layout;
1711+
auto src_tiled_ishape = src_layout.getImplicitTiledDims(src_shape, 1);
1712+
auto res_tiled_ishape = res_layout.getImplicitTiledDims(res_shape, 1);
1713+
bool src_vreg_has_padding = has_padding(src_tiled_ishape, src_layout);
1714+
bool res_vreg_has_padding = has_padding(res_tiled_ishape, res_layout);
1715+
if (!src_vreg_has_padding && !res_vreg_has_padding) {
1716+
// No padding on either side, e.g., reshape i32 (8, 128) to (4, 256)
1717+
// with input tiling (8, 128) and output tiling (4, 128).
1718+
setLayout(op, src_layout, res_layout);
1719+
return success();
1720+
}
1721+
if (src_vreg_has_padding && res_vreg_has_padding) {
1722+
// Padding on both sides, e.g., reshape i32 (10, 128) to (5, 256)
1723+
// with input tiling (8, 128) and output tiling (4, 128). We need to
1724+
// make sure only the last vreg in tiled dims is padded.
1725+
if (llvm::product_of(src_tiled_ishape) ==
1726+
llvm::product_of(res_tiled_ishape)) {
1727+
setLayout(op, src_layout, res_layout);
1728+
return success();
1729+
}
1730+
}
1731+
}
16881732
}
16891733

16901734
// Shape casts for {32/16/8}-bit vector types with rank >= 2.

0 commit comments

Comments (0)