From 9f6afe18bceeca2b9d6e26368be2e06bbaf870a9 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 21 Jul 2025 16:33:54 +0000 Subject: [PATCH 1/9] [AMDGPU] fold memref.subview into amdgpu.gather_to_lds --- .../mlir/Dialect/AMDGPU/Transforms/Passes.h | 6 +- .../mlir/Dialect/AMDGPU/Transforms/Passes.td | 12 ++++ .../Dialect/AMDGPU/Transforms/CMakeLists.txt | 3 +- .../AMDGPU/Transforms/FoldSubviewOps.cpp | 65 +++++++++++++++++++ .../Dialect/AMDGPU/amdgpu-fold-subviews.mlir | 50 ++++++++++++++ 5 files changed, 134 insertions(+), 2 deletions(-) create mode 100644 mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp create mode 100644 mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h index cc2f543e79f69..a61903609aaff 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h @@ -22,8 +22,9 @@ class ConversionTarget; namespace amdgpu { #define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS -#define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS +#define GEN_PASS_DECL_AMDGPUFOLDSUBVIEWOPSPASS #define GEN_PASS_DECL_AMDGPUMASKEDLOADTOLOADPASS +#define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS #define GEN_PASS_REGISTRATION #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc" @@ -38,6 +39,9 @@ void populateAmdgpuResolveStridedMetadataPatterns(RewritePatternSet &patterns, void populateAmdgpuMaskedloadToLoadPatterns(RewritePatternSet &patterns, PatternBenefit benefit = 1); +void populateAmdgpuFoldSubviewOpsPatterns(RewritePatternSet &patterns, + PatternBenefit benefit = 1); + } // namespace amdgpu } // namespace mlir diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td index 8d0e6829ab0cc..7529511b0ea76 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td @@ -70,4 +70,16 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> { "memref::MemRefDialect" ]; } + +def AmdgpuFoldSubviewOpsPass : Pass<"amdgpu-fold-subview-ops"> { + let summary = "Fold subview operations into their parent operations"; + let description = [{ + This pass identifies `memref.subview` source of `GatherToLDSOp` and + attempts to fold the source op, potentially simplifying the overall + operation and improving performance. + }]; + let dependentDialects = [ + "memref::MemRefDialect" + ]; +} #endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_ diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt index 17bbe54ea6c0c..20621ec0d55a4 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt @@ -1,7 +1,8 @@ add_mlir_dialect_library(MLIRAMDGPUTransforms EmulateAtomics.cpp - ResolveStridedMetadata.cpp + FoldSubviewOps.cpp MaskedloadToLoad.cpp + ResolveStridedMetadata.cpp ADDITIONAL_HEADER_DIRS {$MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Transforms diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp new file mode 100644 index 0000000000000..a962f7a2526b2 --- /dev/null +++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp @@ -0,0 +1,65 @@ +//===- FoldSubviewOps.cpp - AMDGPU fold subview ops ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
+
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace mlir::amdgpu {
+#define GEN_PASS_DEF_AMDGPUFOLDSUBVIEWOPSPASS
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
+} // namespace mlir::amdgpu
+
+using namespace mlir;
+using namespace mlir::amdgpu;
+
+namespace {
+struct AmdgpuFoldSubviewOpsPass
+    : public amdgpu::impl::AmdgpuFoldSubviewOpsPassBase<
+          AmdgpuFoldSubviewOpsPass> {
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    populateAmdgpuFoldSubviewOpsPatterns(patterns);
+    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
+      signalPassFailure();
+  }
+};
+
+struct FoldSubviewIntoGatherToLDSOp : public OpRewritePattern<GatherToLDSOp> {
+  using OpRewritePattern<GatherToLDSOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(GatherToLDSOp op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+
+    // Check if the source is a subview operation:
+    auto subviewOp = dyn_cast<memref::SubViewOp>(op.getSrc().getDefiningOp());
+    if (!subviewOp)
+      return rewriter.notifyMatchFailure(
+          loc, "GatherToLDSOp can only be folded if the source is a SubviewOp");
+
+    SmallVector<Value> sourceIndices;
+    mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
+        rewriter, loc, subviewOp.getMixedOffsets(), subviewOp.getMixedStrides(),
+        subviewOp.getDroppedDims(), op.getSrcIndices(), sourceIndices);
+
+    rewriter.replaceOpWithNewOp<GatherToLDSOp>(
+        op, subviewOp.getSource(), sourceIndices, op.getDst(),
+        op.getDstIndices(), op.getTransferType());
+
+    return success();
+  }
+};
+} // namespace
+
+void mlir::amdgpu::populateAmdgpuFoldSubviewOpsPatterns(
+    RewritePatternSet &patterns, PatternBenefit benefit) {
+  patterns.add<FoldSubviewIntoGatherToLDSOp>(patterns.getContext(), benefit);
+}
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
new file mode 100644
index 0000000000000..d582991c3622f
--- /dev/null
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
@@ -0,0 +1,50 @@
+// RUN: mlir-opt -amdgpu-fold-subview-ops -split-input-file %s | FileCheck %s
+
+#gpu_lds_addrspace = 3
+
+// CHECK: func @test_memref
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_memref(%offset_i: index, %offset_j: index) {
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+  // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
+  // CHECK: %[[MEM]][%arg0, %arg1], %[[LOCAL]][%[[C0]], %[[C0]]]
+  // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+
+  %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+  %mem = memref.alloc() : memref<64x128xf16>
+  %subview = memref.subview %mem[0, 0][32, 64][1, 1] : memref<64x128xf16> to memref<32x64xf16, strided<[128, 1]>>
+  %c0 = arith.constant 0 : index
+  amdgpu.gather_to_lds %subview[%offset_i, %offset_j], %alloc[%c0, %c0]
+    : vector<8xf16>, memref<32x64xf16, strided<[128, 1]>>, memref<64x64xf16, #gpu_lds_addrspace>
+  func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+// CHECK: #[[MAP:.*]] = affine_map<()[s0] -> (s0 + 32)>
+// CHECK: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 64)>
+
+//
CHECK: func @subview_folding_offset +// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index +func.func @subview_folding_offset(%offset_i: index, %offset_j: index) { + // CHECK: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3> + // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16> + + // CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]] + // CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]] + + // CHECK: %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]] + // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3> + + %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace> + %mem = memref.alloc() : memref<64x128xf16> + %subview = memref.subview %mem[32, 64][32, 64][1, 1] : memref<64x128xf16> to memref<32x64xf16, strided<[128, 1], offset: 4160>> + %c0 = arith.constant 0 : index + amdgpu.gather_to_lds %subview[%offset_i, %offset_j], %alloc[%c0, %c0] + : vector<8xf16>, memref<32x64xf16, strided<[128, 1], offset: 4160>>, memref<64x64xf16, #gpu_lds_addrspace> + func.return +} From 71fe3aa49154184123546c40c72d695680be7133 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 21 Jul 2025 14:21:05 -0400 Subject: [PATCH 2/9] Update mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp index a962f7a2526b2..7b81800f07ab2 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp +++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp @@ -43,7 +43,7 @@ struct FoldSubviewIntoGatherToLDSOp : public OpRewritePattern { auto subviewOp = dyn_cast(op.getSrc().getDefiningOp()); if (!subviewOp) return rewriter.notifyMatchFailure( - loc, "GatherToLDSOp can only be folded if the source is a SubviewOp"); + loc, "GatherToLDSOp folding is currently supported only when the source is a SubviewOp. This is one specific pattern, and other scenarios may be added in the future."); SmallVector sourceIndices; mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides( From bd4ade5466db59f84e88dc62773c38a40bb05c77 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 21 Jul 2025 14:21:15 -0400 Subject: [PATCH 3/9] Update mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td index 7529511b0ea76..fad939ced9877 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td @@ -74,8 +74,8 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> { def AmdgpuFoldSubviewOpsPass : Pass<"amdgpu-fold-subview-ops"> { let summary = "Fold subview operations into their parent operations"; let description = [{ - This pass identifies `memref.subview` source of `GatherToLDSOp` and - attempts to fold the source op, potentially simplifying the overall + This pass identifies `memref.subview` sources of `GatherToLDSOp` and + attempts to fold the source ops, potentially simplifying the overall operation and improving performance. 
}]; let dependentDialects = [ From 9552f4ed9b2857c79fedb2faab32cdaddd8dfda1 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 21 Jul 2025 14:49:21 -0400 Subject: [PATCH 4/9] linting --- mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp index 7b81800f07ab2..adbdf4b856bd5 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp +++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp @@ -43,7 +43,9 @@ struct FoldSubviewIntoGatherToLDSOp : public OpRewritePattern { auto subviewOp = dyn_cast(op.getSrc().getDefiningOp()); if (!subviewOp) return rewriter.notifyMatchFailure( - loc, "GatherToLDSOp folding is currently supported only when the source is a SubviewOp. This is one specific pattern, and other scenarios may be added in the future."); + loc, "GatherToLDSOp folding is currently supported only when the " + "source is a SubviewOp. This is one specific pattern, and other " + "scenarios may be added in the future."); SmallVector sourceIndices; mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides( From 5d6483db146deb10fc8a769ab613cb6def3ca083 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 21 Jul 2025 19:59:35 +0000 Subject: [PATCH 5/9] updating tests --- mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir index d582991c3622f..a0f02a9bc9340 100644 --- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir +++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir @@ -8,7 +8,7 @@ func.func @test_memref(%offset_i: index, %offset_j: index) { // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3> // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16> - // CHECK: %[[MEM]][%arg0, %arg1], %[[LOCAL]][%[[C0]], %[[C0]]] + // CHECK: amdgpu.gather_to_lds %[[MEM]][%arg0, %arg1], %[[LOCAL]][%[[C0]], %[[C0]]] // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3> %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace> @@ -37,7 +37,7 @@ func.func @subview_folding_offset(%offset_i: index, %offset_j: index) { // CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]] // CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]] - // CHECK: %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]] + // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]] // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3> %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace> From cf50f5f16b797da30f90f843ac42b3aac2ff4f9c Mon Sep 17 00:00:00 2001 From: Alan Li Date: Tue, 22 Jul 2025 23:41:48 -0400 Subject: [PATCH 6/9] Support Expandshape and collapse shape. 
---
 .../mlir/Dialect/MemRef/Utils/MemRefUtils.h   | 37 ++++++++
 .../AMDGPU/Transforms/FoldSubviewOps.cpp      | 50 ++++++----
 .../MemRef/Transforms/FoldMemRefAliasOps.cpp  | 91 -------------------
 mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp | 66 ++++++++++++++
 4 files changed, 137 insertions(+), 107 deletions(-)

diff --git a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
index 34ad279a07a8b..dd3b3dea6ef26 100644
--- a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
+++ b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
@@ -116,6 +116,43 @@ inline bool isSameViewOrTrivialAlias(MemrefValue a, MemrefValue b) {
 /// the source memref (i.e. implements ViewLikeOpInterface).
 MemrefValue skipViewLikeOps(MemrefValue source);
 
+/// Given the 'indices' of a load/store operation where the memref is a result
+/// of a expand_shape op, returns the indices w.r.t to the source memref of the
+/// expand_shape op. For example
+///
+/// %0 = ... : memref<12x42xf32>
+/// %1 = memref.expand_shape %0 [[0, 1], [2]]
+///    : memref<12x42xf32> into memref<2x6x42xf32>
+/// %2 = load %1[%i1, %i2, %i3] : memref<2x6x42xf32
+///
+/// could be folded into
+///
+/// %2 = load %0[6 * i1 + i2, %i3] :
+///          memref<12x42xf32>
+LogicalResult resolveSourceIndicesExpandShape(
+    Location loc, PatternRewriter &rewriter,
+    memref::ExpandShapeOp expandShapeOp, ValueRange indices,
+    SmallVectorImpl<Value> &sourceIndices, bool startsInbounds);
+
+/// Given the 'indices' of a load/store operation where the memref is a result
+/// of a collapse_shape op, returns the indices w.r.t to the source memref of
+/// the collapse_shape op. For example
+///
+/// %0 = ... : memref<2x6x42xf32>
+/// %1 = memref.collapse_shape %0 [[0, 1], [2]]
+///    : memref<2x6x42xf32> into memref<12x42xf32>
+/// %2 = load %1[%i1, %i2] : memref<12x42xf32>
+///
+/// could be folded into
+///
+/// %2 = load %0[%i1 / 6, %i1 % 6, %i2] :
+///          memref<2x6x42xf32>
+LogicalResult
+resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter,
+                                  memref::CollapseShapeOp collapseShapeOp,
+                                  ValueRange indices,
+                                  SmallVectorImpl<Value> &sourceIndices);
+
 } // namespace memref
 } // namespace mlir
 
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
index adbdf4b856bd5..f005842d83306 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
@@ -11,7 +11,9 @@
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 namespace mlir::amdgpu {
 #define GEN_PASS_DEF_AMDGPUFOLDSUBVIEWOPSPASS
 #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
@@ -33,28 +35,44 @@ struct AmdgpuFoldSubviewOpsPass
   }
 };
 
-struct FoldSubviewIntoGatherToLDSOp : public OpRewritePattern<GatherToLDSOp> {
-  using OpRewritePattern<GatherToLDSOp>::OpRewritePattern;
+struct FoldSubviewIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
+  using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(GatherToLDSOp op,
                                 PatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
 
-    // Check if the source is a subview operation:
-    auto subviewOp = dyn_cast<memref::SubViewOp>(op.getSrc().getDefiningOp());
-    if (!subviewOp)
-      return rewriter.notifyMatchFailure(
-          loc, "GatherToLDSOp folding is currently supported only when the "
-               "source is a SubviewOp. This is one specific pattern, and other "
-               "scenarios may be added in the future.");
-
+    Value memrefSource;
     SmallVector<Value> sourceIndices;
-    mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
-        rewriter, loc, subviewOp.getMixedOffsets(), subviewOp.getMixedStrides(),
-        subviewOp.getDroppedDims(), op.getSrcIndices(), sourceIndices);
+    llvm::TypeSwitch<Operation *>(op.getSrc().getDefiningOp())
+        .Case<memref::SubViewOp>([&](memref::SubViewOp subviewOp) {
+          // If the source is a SubViewOp, we can directly rewrite the
+          // GatherToLDSOp.
+          mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
+              rewriter, loc, subviewOp.getMixedOffsets(),
+              subviewOp.getMixedStrides(), subviewOp.getDroppedDims(),
+              op.getSrcIndices(), sourceIndices);
+          memrefSource = subviewOp.getSource();
+        })
+        .Case<memref::ExpandShapeOp>([&](memref::ExpandShapeOp expandShapeOp) {
+          mlir::memref::resolveSourceIndicesExpandShape(
+              loc, rewriter, expandShapeOp, op.getSrcIndices(), sourceIndices,
+              false);
+          memrefSource = expandShapeOp.getViewSource();
+        })
+        .Case<memref::CollapseShapeOp>(
+            [&](memref::CollapseShapeOp collapseShapeOp) {
+              mlir::memref::resolveSourceIndicesCollapseShape(
+                  loc, rewriter, collapseShapeOp, op.getSrcIndices(),
+                  sourceIndices);
+              memrefSource = collapseShapeOp.getViewSource();
+            });
+
+    if (!memrefSource)
+      return failure();
 
-    rewriter.replaceOpWithNewOp<GatherToLDSOp>(
-        op, subviewOp.getSource(), sourceIndices, op.getDst(),
-        op.getDstIndices(), op.getTransferType());
+    rewriter.replaceOpWithNewOp<GatherToLDSOp>(op, memrefSource, sourceIndices,
+                                               op.getDst(), op.getDstIndices(),
+                                               op.getTransferType());
 
     return success();
   }
diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
index 89be188af9129..24da447ad7685 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
@@ -44,97 +44,6 @@ using namespace mlir;
 // Utility functions
 //===----------------------------------------------------------------------===//
 
-/// Given the 'indices' of a load/store operation where the memref is a result
-/// of a expand_shape op, returns the indices w.r.t to the source memref of the
-/// expand_shape op. For example
-///
-/// %0 = ... : memref<12x42xf32>
-/// %1 = memref.expand_shape %0 [[0, 1], [2]]
-///    : memref<12x42xf32> into memref<2x6x42xf32>
-/// %2 = load %1[%i1, %i2, %i3] : memref<2x6x42xf32
-///
-/// could be folded into
-///
-/// %2 = load %0[6 * i1 + i2, %i3] :
-///          memref<12x42xf32>
-static LogicalResult resolveSourceIndicesExpandShape(
-    Location loc, PatternRewriter &rewriter,
-    memref::ExpandShapeOp expandShapeOp, ValueRange indices,
-    SmallVectorImpl<Value> &sourceIndices, bool startsInbounds) {
-  SmallVector<OpFoldResult> destShape = expandShapeOp.getMixedOutputShape();
-
-  // Traverse all reassociation groups to determine the appropriate indices
-  // corresponding to each one of them post op folding.
- for (ArrayRef group : expandShapeOp.getReassociationIndices()) { - assert(!group.empty() && "association indices groups cannot be empty"); - int64_t groupSize = group.size(); - if (groupSize == 1) { - sourceIndices.push_back(indices[group[0]]); - continue; - } - SmallVector groupBasis = - llvm::map_to_vector(group, [&](int64_t d) { return destShape[d]; }); - SmallVector groupIndices = - llvm::map_to_vector(group, [&](int64_t d) { return indices[d]; }); - Value collapsedIndex = rewriter.create( - loc, groupIndices, groupBasis, /*disjoint=*/startsInbounds); - sourceIndices.push_back(collapsedIndex); - } - return success(); -} - -/// Given the 'indices' of a load/store operation where the memref is a result -/// of a collapse_shape op, returns the indices w.r.t to the source memref of -/// the collapse_shape op. For example -/// -/// %0 = ... : memref<2x6x42xf32> -/// %1 = memref.collapse_shape %0 [[0, 1], [2]] -/// : memref<2x6x42xf32> into memref<12x42xf32> -/// %2 = load %1[%i1, %i2] : memref<12x42xf32> -/// -/// could be folded into -/// -/// %2 = load %0[%i1 / 6, %i1 % 6, %i2] : -/// memref<2x6x42xf32> -static LogicalResult -resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter, - memref::CollapseShapeOp collapseShapeOp, - ValueRange indices, - SmallVectorImpl &sourceIndices) { - // Note: collapse_shape requires a strided memref, we can do this. - auto metadata = rewriter.create( - loc, collapseShapeOp.getSrc()); - SmallVector sourceSizes = metadata.getConstifiedMixedSizes(); - for (auto [index, group] : - llvm::zip(indices, collapseShapeOp.getReassociationIndices())) { - assert(!group.empty() && "association indices groups cannot be empty"); - int64_t groupSize = group.size(); - - if (groupSize == 1) { - sourceIndices.push_back(index); - continue; - } - - SmallVector basis = - llvm::map_to_vector(group, [&](int64_t d) { return sourceSizes[d]; }); - auto delinearize = rewriter.create( - loc, index, basis, /*hasOuterBound=*/true); - llvm::append_range(sourceIndices, delinearize.getResults()); - } - if (collapseShapeOp.getReassociationIndices().empty()) { - auto zeroAffineMap = rewriter.getConstantAffineMap(0); - int64_t srcRank = - cast(collapseShapeOp.getViewSource().getType()).getRank(); - OpFoldResult ofr = affine::makeComposedFoldedAffineApply( - rewriter, loc, zeroAffineMap, ArrayRef{}); - for (int64_t i = 0; i < srcRank; i++) { - sourceIndices.push_back( - getValueOrCreateConstantIndexOp(rewriter, loc, ofr)); - } - } - return success(); -} - /// Helpers to access the memref operand for each op. 
template static Value getMemRefOperand(LoadOrStoreOpTy op) { diff --git a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp index a50b4cfc74708..97fe3cb5b4705 100644 --- a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp +++ b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp @@ -12,6 +12,7 @@ #include "mlir/Dialect/MemRef/Utils/MemRefUtils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Interfaces/ViewLikeInterface.h" #include "llvm/ADT/STLExtras.h" @@ -217,5 +218,70 @@ MemrefValue skipViewLikeOps(MemrefValue source) { return source; } +LogicalResult resolveSourceIndicesExpandShape( + Location loc, PatternRewriter &rewriter, + memref::ExpandShapeOp expandShapeOp, ValueRange indices, + SmallVectorImpl &sourceIndices, bool startsInbounds) { + SmallVector destShape = expandShapeOp.getMixedOutputShape(); + + // Traverse all reassociation groups to determine the appropriate indices + // corresponding to each one of them post op folding. + for (ArrayRef group : expandShapeOp.getReassociationIndices()) { + assert(!group.empty() && "association indices groups cannot be empty"); + int64_t groupSize = group.size(); + if (groupSize == 1) { + sourceIndices.push_back(indices[group[0]]); + continue; + } + SmallVector groupBasis = + llvm::map_to_vector(group, [&](int64_t d) { return destShape[d]; }); + SmallVector groupIndices = + llvm::map_to_vector(group, [&](int64_t d) { return indices[d]; }); + Value collapsedIndex = rewriter.create( + loc, groupIndices, groupBasis, /*disjoint=*/startsInbounds); + sourceIndices.push_back(collapsedIndex); + } + return success(); +} + +LogicalResult +resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter, + memref::CollapseShapeOp collapseShapeOp, + ValueRange indices, + SmallVectorImpl &sourceIndices) { + // Note: collapse_shape requires a strided memref, we can do this. + auto metadata = rewriter.create( + loc, collapseShapeOp.getSrc()); + SmallVector sourceSizes = metadata.getConstifiedMixedSizes(); + for (auto [index, group] : + llvm::zip(indices, collapseShapeOp.getReassociationIndices())) { + assert(!group.empty() && "association indices groups cannot be empty"); + int64_t groupSize = group.size(); + + if (groupSize == 1) { + sourceIndices.push_back(index); + continue; + } + + SmallVector basis = + llvm::map_to_vector(group, [&](int64_t d) { return sourceSizes[d]; }); + auto delinearize = rewriter.create( + loc, index, basis, /*hasOuterBound=*/true); + llvm::append_range(sourceIndices, delinearize.getResults()); + } + if (collapseShapeOp.getReassociationIndices().empty()) { + auto zeroAffineMap = rewriter.getConstantAffineMap(0); + int64_t srcRank = + cast(collapseShapeOp.getViewSource().getType()).getRank(); + OpFoldResult ofr = affine::makeComposedFoldedAffineApply( + rewriter, loc, zeroAffineMap, ArrayRef{}); + for (int64_t i = 0; i < srcRank; i++) { + sourceIndices.push_back( + getValueOrCreateConstantIndexOp(rewriter, loc, ofr)); + } + } + return success(); +} + } // namespace memref } // namespace mlir From 3db555db0fa24b54662bf449609479bba8933ea1 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Tue, 22 Jul 2025 23:52:39 -0400 Subject: [PATCH 7/9] update tests. 
---
 .../AMDGPU/Transforms/FoldSubviewOps.cpp      | 76 +++++++++++--------
 .../Dialect/AMDGPU/amdgpu-fold-subviews.mlir  | 52 ++++++++++++-
 2 files changed, 94 insertions(+), 34 deletions(-)

diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
index f005842d83306..95ba0a76d4510 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
@@ -18,12 +18,7 @@
 namespace mlir::amdgpu {
 #define GEN_PASS_DEF_AMDGPUFOLDSUBVIEWOPSPASS
 #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
-} // namespace mlir::amdgpu
-
-using namespace mlir;
-using namespace mlir::amdgpu;
 
-namespace {
 struct AmdgpuFoldSubviewOpsPass
     : public amdgpu::impl::AmdgpuFoldSubviewOpsPassBase<
           AmdgpuFoldSubviewOpsPass> {
@@ -43,32 +38,51 @@ struct FoldSubviewIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
 
     Value memrefSource;
     SmallVector<Value> sourceIndices;
-    llvm::TypeSwitch<Operation *>(op.getSrc().getDefiningOp())
-        .Case<memref::SubViewOp>([&](memref::SubViewOp subviewOp) {
-          // If the source is a SubViewOp, we can directly rewrite the
-          // GatherToLDSOp.
-          mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
-              rewriter, loc, subviewOp.getMixedOffsets(),
-              subviewOp.getMixedStrides(), subviewOp.getDroppedDims(),
-              op.getSrcIndices(), sourceIndices);
-          memrefSource = subviewOp.getSource();
-        })
-        .Case<memref::ExpandShapeOp>([&](memref::ExpandShapeOp expandShapeOp) {
-          mlir::memref::resolveSourceIndicesExpandShape(
-              loc, rewriter, expandShapeOp, op.getSrcIndices(), sourceIndices,
-              false);
-          memrefSource = expandShapeOp.getViewSource();
-        })
-        .Case<memref::CollapseShapeOp>(
-            [&](memref::CollapseShapeOp collapseShapeOp) {
-              mlir::memref::resolveSourceIndicesCollapseShape(
-                  loc, rewriter, collapseShapeOp, op.getSrcIndices(),
-                  sourceIndices);
-              memrefSource = collapseShapeOp.getViewSource();
+    auto foldResult =
+        llvm::TypeSwitch<Operation *, LogicalResult>(
+            op.getSrc().getDefiningOp())
+            .Case<memref::SubViewOp>([&](memref::SubViewOp subviewOp) {
+              // If the source is a SubViewOp, we can directly rewrite the
+              // GatherToLDSOp.
+              mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
+                  rewriter, loc, subviewOp.getMixedOffsets(),
+                  subviewOp.getMixedStrides(), subviewOp.getDroppedDims(),
+                  op.getSrcIndices(), sourceIndices);
+              memrefSource = subviewOp.getSource();
+              return success();
+            })
+            .Case<memref::ExpandShapeOp>(
+                [&](memref::ExpandShapeOp expandShapeOp) {
+                  if (failed(mlir::memref::resolveSourceIndicesExpandShape(
+                          loc, rewriter, expandShapeOp, op.getSrcIndices(),
+                          sourceIndices, false))) {
+                    return failure();
+                  }
+                  memrefSource = expandShapeOp.getViewSource();
+                  return success();
+                })
+            .Case<memref::CollapseShapeOp>(
+                [&](memref::CollapseShapeOp collapseShapeOp) {
+                  if (failed(mlir::memref::resolveSourceIndicesCollapseShape(
+                          loc, rewriter, collapseShapeOp, op.getSrcIndices(),
+                          sourceIndices))) {
+                    return failure();
+                  }
+                  memrefSource = collapseShapeOp.getViewSource();
+                  return success();
+                })
+            .Default([&](Operation *op) {
+              // If the source is not a SubViewOp, ExpandShapeOp, or
+              // CollapseShapeOp, we cannot fold the GatherToLDSOp.
+              return rewriter.notifyMatchFailure(
+                  op,
+                  "source producer is not one of SubViewOp, ExpandShapeOp, or "
+                  "CollapseShapeOp");
             });
 
-    if (!memrefSource)
+    if (failed(foldResult)) {
       return failure();
+    }
 
     rewriter.replaceOpWithNewOp<GatherToLDSOp>(op, memrefSource, sourceIndices,
                                                op.getDst(), op.getDstIndices(),
@@ -77,9 +91,9 @@ struct FoldSubviewIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
     return success();
   }
 };
-} // namespace
 
-void mlir::amdgpu::populateAmdgpuFoldSubviewOpsPatterns(
-    RewritePatternSet &patterns, PatternBenefit benefit) {
+void populateAmdgpuFoldSubviewOpsPatterns(RewritePatternSet &patterns,
+                                          PatternBenefit benefit) {
   patterns.add<FoldSubviewIntoGatherToLDSOp>(patterns.getContext(), benefit);
 }
+} // namespace mlir::amdgpu
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
index a0f02a9bc9340..2c1b1a652fe1e 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
@@ -1,10 +1,10 @@
-// RUN: mlir-opt -amdgpu-fold-subview-ops -split-input-file %s | FileCheck %s
+// RUN: mlir-opt --amdgpu-fold-subview-ops --split-input-file %s | FileCheck %s
 
 #gpu_lds_addrspace = 3
 
-// CHECK: func @test_memref
+// CHECK: func @test_subview_folding
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
-func.func @test_memref(%offset_i: index, %offset_j: index) {
+func.func @test_subview_folding(%offset_i: index, %offset_j: index) {
   // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
   // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
@@ -48,3 +48,49 @@ func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
     : vector<8xf16>, memref<32x64xf16, strided<[128, 1], offset: 4160>>, memref<64x64xf16, #gpu_lds_addrspace>
   func.return
 }
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+// CHECK: func @test_expand_shape
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_expand_shape(%offset_i: index, %offset_j: index) {
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+  // CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
+  // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (64, 128) : index
+  // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX]]], %[[LOCAL]][%[[C0]], %[[C0]]]
+  // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<64x64xf16, 3>
+
+  %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+  %mem = memref.alloc() : memref<8192xf16>
+  %expand = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16> into memref<64x128xf16>
+  %c0 = arith.constant 0 : index
+  amdgpu.gather_to_lds %expand[%offset_i, %offset_j], %alloc[%c0, %c0]
+    : vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu_lds_addrspace>
+  func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+// CHECK: func @test_collapse_shape
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_collapse_shape(%offset_i: index, %offset_j: index) {
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+  // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
+  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index
+  // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES]]#0, %[[INDICES]]#1], %[[LOCAL]][%[[C0]], %[[C0]]]
+  // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+
+  %alloc = memref.alloc() :
memref<64x64xf16, #gpu_lds_addrspace> + %mem = memref.alloc() : memref<64x128xf16> + %collapse = memref.collapse_shape %mem [[0, 1]] : memref<64x128xf16> into memref<8192xf16> + %c0 = arith.constant 0 : index + amdgpu.gather_to_lds %collapse[%offset_i], %alloc[%c0, %c0] + : vector<8xf16>, memref<8192xf16>, memref<64x64xf16, #gpu_lds_addrspace> + func.return +} From d6746b955d060549a2fb91105f3422a2f9996b03 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Wed, 23 Jul 2025 08:36:32 -0400 Subject: [PATCH 8/9] Rename and update --- mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td | 10 +++++----- mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt | 2 +- .../{FoldSubviewOps.cpp => FoldMemRefsOps.cpp} | 10 +++++----- ...gpu-fold-subviews.mlir => amdgpu-fold-memrefs.mlir} | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) rename mlir/lib/Dialect/AMDGPU/Transforms/{FoldSubviewOps.cpp => FoldMemRefsOps.cpp} (94%) rename mlir/test/Dialect/AMDGPU/{amdgpu-fold-subviews.mlir => amdgpu-fold-memrefs.mlir} (98%) diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td index fad939ced9877..76b8c825ac272 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td @@ -71,12 +71,12 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> { ]; } -def AmdgpuFoldSubviewOpsPass : Pass<"amdgpu-fold-subview-ops"> { - let summary = "Fold subview operations into their parent operations"; +def AmdgpuFoldSubviewOpsPass : Pass<"amdgpu-fold-memrefs-ops"> { + let summary = "Fold memref operations into their parent operations"; let description = [{ - This pass identifies `memref.subview` sources of `GatherToLDSOp` and - attempts to fold the source ops, potentially simplifying the overall - operation and improving performance. + This pass identifies memref operations (subview, expand_shape, collapse_shape) + that are sources of `GatherToLDSOp` and attempts to fold the source ops, + potentially simplifying the overall operation and improving performance. }]; let dependentDialects = [ "memref::MemRefDialect" diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt index 20621ec0d55a4..3b0c072ed1217 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt @@ -1,6 +1,6 @@ add_mlir_dialect_library(MLIRAMDGPUTransforms EmulateAtomics.cpp - FoldSubviewOps.cpp + FoldMemRefsOps.cpp MaskedloadToLoad.cpp ResolveStridedMetadata.cpp diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp similarity index 94% rename from mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp rename to mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp index 95ba0a76d4510..73923af1329db 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp +++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp @@ -1,4 +1,4 @@ -//===- FoldSubviewOps.cpp - AMDGPU fold subview ops ---------------------===// +//===- FoldSubviewOps.cpp - AMDGPU fold subview ops -----------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -19,9 +19,9 @@ namespace mlir::amdgpu {
 #define GEN_PASS_DEF_AMDGPUFOLDSUBVIEWOPSPASS
 #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
 
-struct AmdgpuFoldSubviewOpsPass
+struct AmdgpuFoldMemRefOpsPass
     : public amdgpu::impl::AmdgpuFoldSubviewOpsPassBase<
-          AmdgpuFoldSubviewOpsPass> {
+          AmdgpuFoldMemRefOpsPass> {
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
     populateAmdgpuFoldSubviewOpsPatterns(patterns);
@@ -30,7 +30,7 @@ struct AmdgpuFoldSubviewOpsPass
   }
 };
 
-struct FoldSubviewIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
+struct FoldMemRefOpsIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
   using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(GatherToLDSOp op,
                                 PatternRewriter &rewriter) const override {
@@ -94,6 +94,6 @@ struct FoldSubviewIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
 
 void populateAmdgpuFoldSubviewOpsPatterns(RewritePatternSet &patterns,
                                           PatternBenefit benefit) {
-  patterns.add<FoldSubviewIntoGatherToLDSOp>(patterns.getContext(), benefit);
+  patterns.add<FoldMemRefOpsIntoGatherToLDSOp>(patterns.getContext(), benefit);
 }
 } // namespace mlir::amdgpu
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
similarity index 98%
rename from mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
rename to mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
index 2c1b1a652fe1e..a751a4ac1158e 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt --amdgpu-fold-subview-ops --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --amdgpu-fold-memrefs-ops --split-input-file %s | FileCheck %s
 
 #gpu_lds_addrspace = 3
 
From 2d653d05755e960a0f9345cf73530256d8a4bc23 Mon Sep 17 00:00:00 2001
From: Alan Li
Date: Wed, 23 Jul 2025 10:20:03 -0400
Subject: [PATCH 9/9] Fix according to comments

---
 .../mlir/Dialect/AMDGPU/Transforms/Passes.h   |  6 +++---
 .../mlir/Dialect/AMDGPU/Transforms/Passes.td  |  2 +-
 .../AMDGPU/Transforms/FoldMemRefsOps.cpp      | 18 ++++++++----------
 .../Dialect/AMDGPU/amdgpu-fold-memrefs.mlir   | 18 ++++++++----------
 4 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
index a61903609aaff..58b9c74b2f8e0 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
@@ -22,7 +22,7 @@ class ConversionTarget;
 namespace amdgpu {
 
 #define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS
-#define GEN_PASS_DECL_AMDGPUFOLDSUBVIEWOPSPASS
+#define GEN_PASS_DECL_AMDGPUFOLDMEMREFOPSPASS
 #define GEN_PASS_DECL_AMDGPUMASKEDLOADTOLOADPASS
 #define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS
 #define GEN_PASS_REGISTRATION
@@ -39,8 +39,8 @@ void populateAmdgpuResolveStridedMetadataPatterns(RewritePatternSet &patterns,
 void populateAmdgpuMaskedloadToLoadPatterns(RewritePatternSet &patterns,
                                             PatternBenefit benefit = 1);
 
-void populateAmdgpuFoldSubviewOpsPatterns(RewritePatternSet &patterns,
-                                          PatternBenefit benefit = 1);
+void populateAmdgpuFoldMemRefOpsPatterns(RewritePatternSet &patterns,
+                                         PatternBenefit benefit = 1);
 
 } // namespace amdgpu
 } // namespace mlir
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
index 76b8c825ac272..8664f971cabde 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
@@ -71,7 +71,7 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> {
   ];
 }
 
-def AmdgpuFoldSubviewOpsPass : Pass<"amdgpu-fold-memrefs-ops"> {
+def AmdgpuFoldMemRefOpsPass : Pass<"amdgpu-fold-memrefs-ops"> {
   let summary = "Fold memref operations into their parent operations";
   let description = [{
     This pass identifies memref operations (subview, expand_shape, collapse_shape)
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
index 73923af1329db..a3fdc7ee385ed 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
@@ -12,21 +12,19 @@
 #include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/WalkPatternRewriteDriver.h"
 #include "llvm/ADT/TypeSwitch.h"
 
 namespace mlir::amdgpu {
-#define GEN_PASS_DEF_AMDGPUFOLDSUBVIEWOPSPASS
+#define GEN_PASS_DEF_AMDGPUFOLDMEMREFOPSPASS
 #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
 
-struct AmdgpuFoldMemRefOpsPass
-    : public amdgpu::impl::AmdgpuFoldSubviewOpsPassBase<
-          AmdgpuFoldMemRefOpsPass> {
+struct AmdgpuFoldMemRefOpsPass final
+    : amdgpu::impl::AmdgpuFoldMemRefOpsPassBase<AmdgpuFoldMemRefOpsPass> {
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
-    populateAmdgpuFoldSubviewOpsPatterns(patterns);
-    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
-      signalPassFailure();
+    populateAmdgpuFoldMemRefOpsPatterns(patterns);
+    walkAndApplyPatterns(getOperation(), std::move(patterns));
   }
 };
 
@@ -92,8 +90,8 @@ struct FoldMemRefOpsIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
   }
 };
 
-void populateAmdgpuFoldSubviewOpsPatterns(RewritePatternSet &patterns,
-                                          PatternBenefit benefit) {
+void populateAmdgpuFoldMemRefOpsPatterns(RewritePatternSet &patterns,
+                                         PatternBenefit benefit) {
   patterns.add<FoldMemRefOpsIntoGatherToLDSOp>(patterns.getContext(), benefit);
 }
 } // namespace mlir::amdgpu
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
index a751a4ac1158e..57afa127c9da8 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
@@ -5,10 +5,10 @@
 // CHECK: func @test_subview_folding
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @test_subview_folding(%offset_i: index, %offset_j: index) {
-  // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
   // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
-  // CHECK: amdgpu.gather_to_lds %[[MEM]][%arg0, %arg1], %[[LOCAL]][%[[C0]], %[[C0]]]
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[ARG0]], %[[ARG1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
   // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
 
   %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
@@ -30,14 +30,12 @@ func.func @test_subview_folding(%offset_i: index, %offset_j: index) {
 // CHECK: func @subview_folding_offset
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
-  // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
   // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
-
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]]
   //
CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]] - - // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]] + // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]] // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3> %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace> @@ -56,11 +54,11 @@ func.func @subview_folding_offset(%offset_i: index, %offset_j: index) { // CHECK: func @test_expand_shape // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index func.func @test_expand_shape(%offset_i: index, %offset_j: index) { - // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3> // CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16> + // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (64, 128) : index - // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX]]], %[[LOCAL]][%[[C0]], %[[C0]]] + // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX]]], %[[LOCAL]][%[[C0]], %[[C0]]] // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<64x64xf16, 3> %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace> @@ -79,11 +77,11 @@ func.func @test_expand_shape(%offset_i: index, %offset_j: index) { // CHECK: func @test_collapse_shape // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index func.func @test_collapse_shape(%offset_i: index, %offset_j: index) { - // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3> // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16> + // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index - // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES]]#0, %[[INDICES]]#1], %[[LOCAL]][%[[C0]], %[[C0]]] + // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES]]#0, %[[INDICES]]#1], %[[LOCAL]][%[[C0]], %[[C0]]] // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3> %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>