fixup! fixup! [mlir][linalg] Enable scalable vectorization of linalg.unpack (WIP)

banach-space · banach-space · commit d06a197c121e · 2025-07-30T12:58:38.000Z
Improve documentation + fix test after rebasing on top of * #150602
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1850,6 +1850,13 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
                           ArrayRef<int64_t> inputVectorSizes,
                           ArrayRef<bool> inputScalableVecDims,
                           SmallVectorImpl<Value> &newResults) {
+  if (!inputVectorSizes.empty()) {
+    assert(inputVectorSizes.size() ==
+               unpackOp.getDestRank() + unpackOp.getSourceRank() &&
+           "Invalid number of input vector sizes!");
+    assert(inputVectorSizes.size() == inputScalableVecDims.size() &&
+           "Incompatible number of vector sizes and vector scalable flags!");
+  }
 
   // TODO: Introduce a parent class that will handle the insertion point update.
   OpBuilder::InsertionGuard g(rewriter);
@@ -1865,44 +1872,41 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
 
   auto destSize = unpackOp.getDestRank();
 
-  if (!inputVectorSizes.empty()) {
-    assert(inputVectorSizes.size() == destSize + sourceShape.size() &&
-           "Incorrect number of input vector sizes");
-  }
-
-  SmallVector<bool> readScalableVectorFlags;
-  SmallVector<bool> writeScalableVectorFlags;
+  // 1. Obtain vector sizes for the read and write operation.s
   SmallVector<int64_t> readVectorSizes;
   SmallVector<int64_t> writeVectorSizes;
+  SmallVector<bool> readScalableVectorFlags;
+  SmallVector<bool> writeScalableVectorFlags;
 
-  // Split input-vector-sizes into vector sizes for the read and write
-  // operations.
+  // CASE 1: Vector sizes are user-specified.
+  // 1.0 This is the trivial case, simply split the input vector sizes.
   if (!inputVectorSizes.empty()) {
     readVectorSizes.append(inputVectorSizes.begin(),
                            inputVectorSizes.begin() + sourceShape.size());
     writeVectorSizes.append(inputVectorSizes.begin() + sourceShape.size(),
                             inputVectorSizes.end());
-  }
-  if (!inputScalableVecDims.empty()) {
     readScalableVectorFlags.append(inputScalableVecDims.begin(),
                                    inputScalableVecDims.begin() +
                                        sourceShape.size());
     writeScalableVectorFlags.append(inputScalableVecDims.begin() +
                                         sourceShape.size(),
                                     inputScalableVecDims.end());
-  } else {
-    readScalableVectorFlags = SmallVector<bool>(sourceShape.size(), false);
-    writeScalableVectorFlags = SmallVector<bool>(destSize, false);
   }
 
-  // writeVectorSizes is the shape of the vector that will be used to do final
-  // write on the destination tensor. It is set like this: Let's say the
-  // source tensor is rank 'M' and the dest tensor rank 'N', where N <= M.
-  // Thus:
-  // 1. writeVectorSizes = sourceShape.take_front(N)
-  // 2. if outer_dims_perms is present: do that permutation on writeVectorSizes.
-  // 3. multiply all the locations in vectorSize pointed by innerDimPos by the
-  //    innerTiles attribute value.
+  // CASE 2: Vector sizes have to be inferred.
+  //
+  // 1.1 Infer vector sizes for the write operation.
+  //
+  // Let:
+  //    * rank(source tensor) = 'M'
+  //    * rank(dest tensor) = 'N',
+  // and N <= M. The steps are:
+  //  1. writeVectorSizes = sourceShape.take_front(N)
+  //  2. Multiply all the locations in writeVectorSize pointed by inner_dims_pos
+  //     by the corresponding values from the `inner_tiles` attribute value.
+  //  3. If outer_dims_perms is present, permutate writeVectorSizes accordingly.
+  //
+  // Note, this will only work when all sizes are static!
   if (writeVectorSizes.empty()) {
     if (ShapedType::isDynamicShape(sourceShape))
       return failure();
@@ -1916,28 +1920,17 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
     useInBoundsInsteadOfMasking = true;
   }
 
-  // readVectorSizes is the size of tensor used to read and apply mask. It is
-  // set like this: Let's say the vectorSize (VS) array is size 'N' and
-  // the sourceShape(SS) is 'M' where M >= N and InnerTileSizes (IT) of
-  // size M-N
-  // Thus:
-  // - initially: readVectorSizes = vectorInputSizes
-  // - Divide all the readMaskShape locations pointed by innerDimPos
-  //   by the innerTileSize attribute value.
-  // - if outer_dims_perms is present: do that permutation on readVectorSizes.
-  // - Append the remaining shape from SS
-  // E.g. let's say let's say unpackTensorType.getShape() = <8x8x32x16>
-  // inner Dim Pos = [0, 1] and Inner Tiles = [32, 16], vector_sizes are [512,
-  // 128] and outer_dims_perm is [1, 0] then read shape is:
-  //   ReadVectorSizes(initial): [512, 128]
-  //   Final Value(after innerDim Adjustment): [512/32, 128/16]
-  //                                           = [16, 8]
-  //   After applying outer_dims_perm: [8, 16]
-  //   After appending the rest of the sourceShape: [8, 16, 32, 16]
-
+  // 1.2 Infer vector sizes for the read operation.
+  //
+  // The steps are:
+  //  1. readVectorSizes = vectorInputSizes
+  //  2. Take readVectorSizes from 1. and divide all locations pointed by
+  //     the inner_dims_pos attribyte by the `inner_tiles` attribute value.
+  //  3. If outer_dims_perms is present, permutate readVectorSizes accordingly.
+  //  4. Append the remaining sizes from the source tensor.
+  //
+  // Note, this will only work when all sizes are static!
   if (readVectorSizes.empty()) {
-    // Compute read-vector-sizes based on the write-vector-sizes and inner tile
-    // sizes. Note, this will only work when all sizes are static.
     readVectorSizes = writeVectorSizes;
     for (auto [index, size] : enumerate(innerTiles)) {
       readVectorSizes[innerDimPos[index]] =
diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
@@ -943,23 +943,22 @@ module attributes {transform.with_named_sequence} {
 // CHECK-SAME:      %[[DEST:.*]]: tensor<?x?xf32>,
 // CHECK-SAME:      %[[SRC:.*]]: tensor<?x?x16x2xf32>
 func.func @test_vectorize_dynamic_shapes_unpack(%dest: tensor<?x?xf32>, %src: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> {
-// CHECK: %[[C0:.*]] = arith.constant 0
-// CHECK: %[[C01:.*]] = arith.constant 0
-// CHECK: %[[C02:.*]] = arith.constant 0
-// CHECK: %[[DIM_0:.*]] = tensor.dim %[[ARG_1]], %[[C02]] : tensor<?x?x16x2xf32>
-// CHECK: %[[C1:.*]] = arith.constant 1
-// CHECK: %[[DIM6:.*]] = tensor.dim %[[ARG_1]], %[[C1]] : tensor<?x?x16x2xf32>
-// CHECK: %[[CNST16:.*]] = arith.constant 16 : index
-// CHECK: %[[CNST2:.*]] = arith.constant 2 : index
-// CHECK: %[[readMsk0:.*]] = vector.create_mask %[[DIM_0]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x16x2xi1>
-// CHECK: %[[read0:.*]] = vector.mask %[[readMsk0]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x16x2xf32> } : vector<2x1x16x2xi1> -> vector<2x1x16x2xf32>
-// CHECK: %[[trans0:.*]] = vector.transpose %[[read0]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to vector<2x2x1x16xf32>
-// CHECK: %[[sc0:.*]] = vector.shape_cast %[[trans0]] : vector<2x2x1x16xf32> to vector<4x16xf32>
-// CHECK: %[[writeMsk0:.*]] = vector.create_mask {{.*}} : vector<4x16xi1>
-// CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[SRC]]
-// CHECK: return %[[write0]]
- %ret = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?x16x2xf32> -> tensor<?x?xf32>
- return %ret : tensor<?x?xf32>
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[C0_1:.*]] = arith.constant 0 : index
+  // CHECK: %[[DIM_0:.*]] = tensor.dim %[[SRC]], %[[C0_1]] : tensor<?x?x16x2xf32>
+  // CHECK: %[[C1:.*]] = arith.constant 1
+  // CHECK: %[[DIM6:.*]] = tensor.dim %[[SRC]], %[[C1]] : tensor<?x?x16x2xf32>
+  // CHECK: %[[CNST16:.*]] = arith.constant 16 : index
+  // CHECK: %[[CNST2:.*]] = arith.constant 2 : index
+  // CHECK: %[[MASK_READ:.*]] = vector.create_mask %[[DIM_0]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x16x2xi1>
+  // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x16x2xf32> } : vector<2x1x16x2xi1> -> vector<2x1x16x2xf32>
+  // CHECK: %[[TR:.*]] = vector.transpose %[[READ]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to vector<2x2x1x16xf32>
+  // CHECK: %[[SC:.*]] = vector.shape_cast %[[TR]] : vector<2x2x1x16xf32> to vector<4x16xf32>
+  // CHECK: %[[MASK_WRITE:.*]] = vector.create_mask {{.*}} : vector<4x16xi1>
+  // CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_WRITE:.*]] {{.*}} vector.transfer_write %[[SC]], %[[DEST]]
+  // CHECK: return %[[WRITE]]
+  %ret = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?x16x2xf32> -> tensor<?x?xf32>
+  return %ret : tensor<?x?xf32>
 }
 module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
@@ -975,10 +974,6 @@ module attributes {transform.with_named_sequence} {
 // CHECK-SAME:      %[[DEST:.*]]: tensor<?x?xf32>,
 // CHECK-SAME:      %[[SRC:.*]]: tensor<?x?x16x2xf32>
 func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec(%dest: tensor<?x?xf32>, %src: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> {
-  // CHECK: %[[C0:.*]] = arith.constant 0
-  // CHECK: %[[DIM:.*]] = tensor.dim %[[DEST]], %[[C0]] : tensor<?x?xf32>
-  // CHECK: %[[C1:.*]] = arith.constant 1 : index
-  // CHECK: %[[DIM0:.*]] = tensor.dim %[[DEST]], %[[C1]] : tensor<?x?xf32>
   // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00
   // CHECK: %[[C01:.*]] = arith.constant 0
   // CHECK: %[[C02:.*]] = arith.constant 0
@@ -1011,10 +1006,6 @@ module attributes {transform.with_named_sequence} {
 // CHECK-SAME:      %[[DEST:.*]]: tensor<?x?xf32>,
 // CHECK-SAME:      %[[SRC:.*]]: tensor<?x?x?x2xf32>
 func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec_and_tile_size(%dest: tensor<?x?xf32>, %src: tensor<?x?x?x2xf32>) -> tensor<?x?xf32> {
-  // CHECK: %[[C0:.*]] = arith.constant 0
-  // CHECK: %[[DIM:.*]] = tensor.dim %[[DEST]], %[[C0]] : tensor<?x?xf32>
-  // CHECK: %[[C1:.*]] = arith.constant 1 : index
-  // CHECK: %[[DIM0:.*]] = tensor.dim %[[DEST]], %[[C1]] : tensor<?x?xf32>
   // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00
   // CHECK: %[[C01:.*]] = arith.constant 0
   // CHECK: %[[C02:.*]] = arith.constant 0