From 9f6afe18bceeca2b9d6e26368be2e06bbaf870a9 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 21 Jul 2025 16:33:54 +0000 Subject: [PATCH 1/9] [AMDGPU] fold memref.subview into amdgpu.gather_to_lds --- .../mlir/Dialect/AMDGPU/Transforms/Passes.h | 6 +- .../mlir/Dialect/AMDGPU/Transforms/Passes.td | 12 ++++ .../Dialect/AMDGPU/Transforms/CMakeLists.txt | 3 +- .../AMDGPU/Transforms/FoldSubviewOps.cpp | 65 +++++++++++++++++++ .../Dialect/AMDGPU/amdgpu-fold-subviews.mlir | 50 ++++++++++++++ 5 files changed, 134 insertions(+), 2 deletions(-) create mode 100644 mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp create mode 100644 mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h index cc2f543e79f69..a61903609aaff 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h @@ -22,8 +22,9 @@ class ConversionTarget; namespace amdgpu { #define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS -#define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS +#define GEN_PASS_DECL_AMDGPUFOLDSUBVIEWOPSPASS #define GEN_PASS_DECL_AMDGPUMASKEDLOADTOLOADPASS +#define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS #define GEN_PASS_REGISTRATION #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc" @@ -38,6 +39,9 @@ void populateAmdgpuResolveStridedMetadataPatterns(RewritePatternSet &patterns, void populateAmdgpuMaskedloadToLoadPatterns(RewritePatternSet &patterns, PatternBenefit benefit = 1); +void populateAmdgpuFoldSubviewOpsPatterns(RewritePatternSet &patterns, + PatternBenefit benefit = 1); + } // namespace amdgpu } // namespace mlir diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td index 8d0e6829ab0cc..7529511b0ea76 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td @@ -70,4 +70,16 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> { "memref::MemRefDialect" ]; } + +def AmdgpuFoldSubviewOpsPass : Pass<"amdgpu-fold-subview-ops"> { + let summary = "Fold subview operations into their parent operations"; + let description = [{ + This pass identifies `memref.subview` source of `GatherToLDSOp` and + attempts to fold the source op, potentially simplifying the overall + operation and improving performance. + }]; + let dependentDialects = [ + "memref::MemRefDialect" + ]; +} #endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_ diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt index 17bbe54ea6c0c..20621ec0d55a4 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt @@ -1,7 +1,8 @@ add_mlir_dialect_library(MLIRAMDGPUTransforms EmulateAtomics.cpp - ResolveStridedMetadata.cpp + FoldSubviewOps.cpp MaskedloadToLoad.cpp + ResolveStridedMetadata.cpp ADDITIONAL_HEADER_DIRS {$MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Transforms diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp new file mode 100644 index 0000000000000..a962f7a2526b2 --- /dev/null +++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp @@ -0,0 +1,65 @@ +//===- FoldSubviewOps.cpp - AMDGPU fold subview ops ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
+
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace mlir::amdgpu {
+#define GEN_PASS_DEF_AMDGPUFOLDSUBVIEWOPSPASS
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
+} // namespace mlir::amdgpu
+
+using namespace mlir;
+using namespace mlir::amdgpu;
+
+namespace {
+struct AmdgpuFoldSubviewOpsPass
+    : public amdgpu::impl::AmdgpuFoldSubviewOpsPassBase<
+          AmdgpuFoldSubviewOpsPass> {
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    populateAmdgpuFoldSubviewOpsPatterns(patterns);
+    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
+      signalPassFailure();
+  }
+};
+
+struct FoldSubviewIntoGatherToLDSOp : public OpRewritePattern<GatherToLDSOp> {
+  using OpRewritePattern<GatherToLDSOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(GatherToLDSOp op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+
+    // Check if the source is a subview operation:
+    auto subviewOp = dyn_cast<memref::SubViewOp>(op.getSrc().getDefiningOp());
+    if (!subviewOp)
+      return rewriter.notifyMatchFailure(
+          loc, "GatherToLDSOp can only be folded if the source is a SubviewOp");
+
+    SmallVector<Value> sourceIndices;
+    mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
+        rewriter, loc, subviewOp.getMixedOffsets(), subviewOp.getMixedStrides(),
+        subviewOp.getDroppedDims(), op.getSrcIndices(), sourceIndices);
+
+    rewriter.replaceOpWithNewOp<GatherToLDSOp>(
+        op, subviewOp.getSource(), sourceIndices, op.getDst(),
+        op.getDstIndices(), op.getTransferType());
+
+    return success();
+  }
+};
+} // namespace
+
+void mlir::amdgpu::populateAmdgpuFoldSubviewOpsPatterns(
+    RewritePatternSet &patterns, PatternBenefit benefit) {
+  patterns.add<FoldSubviewIntoGatherToLDSOp>(patterns.getContext(), benefit);
+}
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
new file mode 100644
index 0000000000000..d582991c3622f
--- /dev/null
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
@@ -0,0 +1,50 @@
+// RUN: mlir-opt -amdgpu-fold-subview-ops -split-input-file %s | FileCheck %s
+
+#gpu_lds_addrspace = 3
+
+// CHECK: func @test_memref
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_memref(%offset_i: index, %offset_j: index) {
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+  // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
+  // CHECK: %[[MEM]][%arg0, %arg1], %[[LOCAL]][%[[C0]], %[[C0]]]
+  // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+
+  %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+  %mem = memref.alloc() : memref<64x128xf16>
+  %subview = memref.subview %mem[0, 0][32, 64][1, 1] : memref<64x128xf16> to memref<32x64xf16, strided<[128, 1]>>
+  %c0 = arith.constant 0 : index
+  amdgpu.gather_to_lds %subview[%offset_i, %offset_j], %alloc[%c0, %c0]
+    : vector<8xf16>, memref<32x64xf16, strided<[128, 1]>>, memref<64x64xf16, #gpu_lds_addrspace>
+  func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+// CHECK: #[[MAP:.*]] = affine_map<()[s0] -> (s0 + 32)>
+// CHECK: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 64)>
+
+//
CHECK: func @subview_folding_offset +// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index +func.func @subview_folding_offset(%offset_i: index, %offset_j: index) { + // CHECK: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3> + // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16> + + // CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]] + // CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]] + + // CHECK: %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]] + // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3> + + %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace> + %mem = memref.alloc() : memref<64x128xf16> + %subview = memref.subview %mem[32, 64][32, 64][1, 1] : memref<64x128xf16> to memref<32x64xf16, strided<[128, 1], offset: 4160>> + %c0 = arith.constant 0 : index + amdgpu.gather_to_lds %subview[%offset_i, %offset_j], %alloc[%c0, %c0] + : vector<8xf16>, memref<32x64xf16, strided<[128, 1], offset: 4160>>, memref<64x64xf16, #gpu_lds_addrspace> + func.return +} From 71fe3aa49154184123546c40c72d695680be7133 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 21 Jul 2025 14:21:05 -0400 Subject: [PATCH 2/9] Update mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp index a962f7a2526b2..7b81800f07ab2 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp +++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp @@ -43,7 +43,7 @@ struct FoldSubviewIntoGatherToLDSOp : public OpRewritePattern { auto subviewOp = dyn_cast(op.getSrc().getDefiningOp()); if (!subviewOp) return rewriter.notifyMatchFailure( - loc, "GatherToLDSOp can only be folded if the source is a SubviewOp"); + loc, "GatherToLDSOp folding is currently supported only when the source is a SubviewOp. This is one specific pattern, and other scenarios may be added in the future."); SmallVector sourceIndices; mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides( From bd4ade5466db59f84e88dc62773c38a40bb05c77 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 21 Jul 2025 14:21:15 -0400 Subject: [PATCH 3/9] Update mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td index 7529511b0ea76..fad939ced9877 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td @@ -74,8 +74,8 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> { def AmdgpuFoldSubviewOpsPass : Pass<"amdgpu-fold-subview-ops"> { let summary = "Fold subview operations into their parent operations"; let description = [{ - This pass identifies `memref.subview` source of `GatherToLDSOp` and - attempts to fold the source op, potentially simplifying the overall + This pass identifies `memref.subview` sources of `GatherToLDSOp` and + attempts to fold the source ops, potentially simplifying the overall operation and improving performance. 
}]; let dependentDialects = [ From 9552f4ed9b2857c79fedb2faab32cdaddd8dfda1 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 21 Jul 2025 14:49:21 -0400 Subject: [PATCH 4/9] linting --- mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp index 7b81800f07ab2..adbdf4b856bd5 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp +++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp @@ -43,7 +43,9 @@ struct FoldSubviewIntoGatherToLDSOp : public OpRewritePattern { auto subviewOp = dyn_cast(op.getSrc().getDefiningOp()); if (!subviewOp) return rewriter.notifyMatchFailure( - loc, "GatherToLDSOp folding is currently supported only when the source is a SubviewOp. This is one specific pattern, and other scenarios may be added in the future."); + loc, "GatherToLDSOp folding is currently supported only when the " + "source is a SubviewOp. This is one specific pattern, and other " + "scenarios may be added in the future."); SmallVector sourceIndices; mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides( From 5d6483db146deb10fc8a769ab613cb6def3ca083 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 21 Jul 2025 19:59:35 +0000 Subject: [PATCH 5/9] updating tests --- mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir index d582991c3622f..a0f02a9bc9340 100644 --- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir +++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir @@ -8,7 +8,7 @@ func.func @test_memref(%offset_i: index, %offset_j: index) { // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3> // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16> - // CHECK: %[[MEM]][%arg0, %arg1], %[[LOCAL]][%[[C0]], %[[C0]]] + // CHECK: amdgpu.gather_to_lds %[[MEM]][%arg0, %arg1], %[[LOCAL]][%[[C0]], %[[C0]]] // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3> %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace> @@ -37,7 +37,7 @@ func.func @subview_folding_offset(%offset_i: index, %offset_j: index) { // CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]] // CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]] - // CHECK: %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]] + // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]] // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3> %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace> From cf50f5f16b797da30f90f843ac42b3aac2ff4f9c Mon Sep 17 00:00:00 2001 From: Alan Li Date: Tue, 22 Jul 2025 23:41:48 -0400 Subject: [PATCH 6/9] Support Expandshape and collapse shape. 
---
 .../mlir/Dialect/MemRef/Utils/MemRefUtils.h   | 37 ++++++++
 .../AMDGPU/Transforms/FoldSubviewOps.cpp      | 50 ++++++----
 .../MemRef/Transforms/FoldMemRefAliasOps.cpp  | 91 -------------------
 mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp | 66 ++++++++++++++
 4 files changed, 137 insertions(+), 107 deletions(-)

diff --git a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
index 34ad279a07a8b..dd3b3dea6ef26 100644
--- a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
+++ b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
@@ -116,6 +116,43 @@ inline bool isSameViewOrTrivialAlias(MemrefValue a, MemrefValue b) {
 /// the source memref (i.e. implements ViewLikeOpInterface).
 MemrefValue skipViewLikeOps(MemrefValue source);
 
+/// Given the 'indices' of a load/store operation where the memref is a result
+/// of a expand_shape op, returns the indices w.r.t to the source memref of the
+/// expand_shape op. For example
+///
+/// %0 = ... : memref<12x42xf32>
+/// %1 = memref.expand_shape %0 [[0, 1], [2]]
+///    : memref<12x42xf32> into memref<2x6x42xf32>
+/// %2 = load %1[%i1, %i2, %i3] : memref<2x6x42xf32
+///
+/// could be folded into
+///
+/// %2 = load %0[6 * i1 + i2, %i3] :
+///          memref<12x42xf32>
+LogicalResult resolveSourceIndicesExpandShape(
+    Location loc, PatternRewriter &rewriter,
+    memref::ExpandShapeOp expandShapeOp, ValueRange indices,
+    SmallVectorImpl<Value> &sourceIndices, bool startsInbounds);
+
+/// Given the 'indices' of a load/store operation where the memref is a result
+/// of a collapse_shape op, returns the indices w.r.t to the source memref of
+/// the collapse_shape op. For example
+///
+/// %0 = ... : memref<2x6x42xf32>
+/// %1 = memref.collapse_shape %0 [[0, 1], [2]]
+///    : memref<2x6x42xf32> into memref<12x42xf32>
+/// %2 = load %1[%i1, %i2] : memref<12x42xf32>
+///
+/// could be folded into
+///
+/// %2 = load %0[%i1 / 6, %i1 % 6, %i2] :
+///          memref<2x6x42xf32>
+LogicalResult
+resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter,
+                                  memref::CollapseShapeOp collapseShapeOp,
+                                  ValueRange indices,
+                                  SmallVectorImpl<Value> &sourceIndices);
+
 } // namespace memref
 } // namespace mlir
 
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
index adbdf4b856bd5..f005842d83306 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
@@ -11,7 +11,9 @@
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 namespace mlir::amdgpu {
 #define GEN_PASS_DEF_AMDGPUFOLDSUBVIEWOPSPASS
 #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
@@ -33,28 +35,44 @@ struct AmdgpuFoldSubviewOpsPass
   }
 };
 
-struct FoldSubviewIntoGatherToLDSOp : public OpRewritePattern<GatherToLDSOp> {
-  using OpRewritePattern<GatherToLDSOp>::OpRewritePattern;
+struct FoldSubviewIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
+  using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(GatherToLDSOp op,
                                 PatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
 
-    // Check if the source is a subview operation:
-    auto subviewOp = dyn_cast<memref::SubViewOp>(op.getSrc().getDefiningOp());
-    if (!subviewOp)
-      return rewriter.notifyMatchFailure(
-          loc, "GatherToLDSOp folding is currently supported only when the "
-               "source is a SubviewOp. This is one specific pattern, and other "
-               "scenarios may be added in the future.");
-
+    Value memrefSource;
     SmallVector<Value> sourceIndices;
-    mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
-        rewriter, loc, subviewOp.getMixedOffsets(), subviewOp.getMixedStrides(),
-        subviewOp.getDroppedDims(), op.getSrcIndices(), sourceIndices);
+    llvm::TypeSwitch<Operation *>(op.getSrc().getDefiningOp())
+        .Case<memref::SubViewOp>([&](memref::SubViewOp subviewOp) {
+          // If the source is a SubViewOp, we can directly rewrite the
+          // GatherToLDSOp.
+          mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
+              rewriter, loc, subviewOp.getMixedOffsets(),
+              subviewOp.getMixedStrides(), subviewOp.getDroppedDims(),
+              op.getSrcIndices(), sourceIndices);
+          memrefSource = subviewOp.getSource();
+        })
+        .Case<memref::ExpandShapeOp>([&](memref::ExpandShapeOp expandShapeOp) {
+          mlir::memref::resolveSourceIndicesExpandShape(
+              loc, rewriter, expandShapeOp, op.getSrcIndices(), sourceIndices,
+              false);
+          memrefSource = expandShapeOp.getViewSource();
+        })
+        .Case<memref::CollapseShapeOp>(
+            [&](memref::CollapseShapeOp collapseShapeOp) {
+              mlir::memref::resolveSourceIndicesCollapseShape(
+                  loc, rewriter, collapseShapeOp, op.getSrcIndices(),
+                  sourceIndices);
+              memrefSource = collapseShapeOp.getViewSource();
+            });
+
+    if (!memrefSource)
+      return failure();
 
-    rewriter.replaceOpWithNewOp<GatherToLDSOp>(
-        op, subviewOp.getSource(), sourceIndices, op.getDst(),
-        op.getDstIndices(), op.getTransferType());
+    rewriter.replaceOpWithNewOp<GatherToLDSOp>(op, memrefSource, sourceIndices,
+                                               op.getDst(), op.getDstIndices(),
+                                               op.getTransferType());
 
     return success();
   }
diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
index 89be188af9129..24da447ad7685 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
@@ -44,97 +44,6 @@ using namespace mlir;
 // Utility functions
 //===----------------------------------------------------------------------===//
 
-/// Given the 'indices' of a load/store operation where the memref is a result
-/// of a expand_shape op, returns the indices w.r.t to the source memref of the
-/// expand_shape op. For example
-///
-/// %0 = ... : memref<12x42xf32>
-/// %1 = memref.expand_shape %0 [[0, 1], [2]]
-///    : memref<12x42xf32> into memref<2x6x42xf32>
-/// %2 = load %1[%i1, %i2, %i3] : memref<2x6x42xf32
-///
-/// could be folded into
-///
-/// %2 = load %0[6 * i1 + i2, %i3] :
-///          memref<12x42xf32>
-static LogicalResult resolveSourceIndicesExpandShape(
-    Location loc, PatternRewriter &rewriter,
-    memref::ExpandShapeOp expandShapeOp, ValueRange indices,
-    SmallVectorImpl<Value> &sourceIndices, bool startsInbounds) {
-  SmallVector<OpFoldResult> destShape = expandShapeOp.getMixedOutputShape();
-
-  // Traverse all reassociation groups to determine the appropriate indices
-  // corresponding to each one of them post op folding.
- for (ArrayRef group : expandShapeOp.getReassociationIndices()) { - assert(!group.empty() && "association indices groups cannot be empty"); - int64_t groupSize = group.size(); - if (groupSize == 1) { - sourceIndices.push_back(indices[group[0]]); - continue; - } - SmallVector groupBasis = - llvm::map_to_vector(group, [&](int64_t d) { return destShape[d]; }); - SmallVector groupIndices = - llvm::map_to_vector(group, [&](int64_t d) { return indices[d]; }); - Value collapsedIndex = rewriter.create( - loc, groupIndices, groupBasis, /*disjoint=*/startsInbounds); - sourceIndices.push_back(collapsedIndex); - } - return success(); -} - -/// Given the 'indices' of a load/store operation where the memref is a result -/// of a collapse_shape op, returns the indices w.r.t to the source memref of -/// the collapse_shape op. For example -/// -/// %0 = ... : memref<2x6x42xf32> -/// %1 = memref.collapse_shape %0 [[0, 1], [2]] -/// : memref<2x6x42xf32> into memref<12x42xf32> -/// %2 = load %1[%i1, %i2] : memref<12x42xf32> -/// -/// could be folded into -/// -/// %2 = load %0[%i1 / 6, %i1 % 6, %i2] : -/// memref<2x6x42xf32> -static LogicalResult -resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter, - memref::CollapseShapeOp collapseShapeOp, - ValueRange indices, - SmallVectorImpl &sourceIndices) { - // Note: collapse_shape requires a strided memref, we can do this. - auto metadata = rewriter.create( - loc, collapseShapeOp.getSrc()); - SmallVector sourceSizes = metadata.getConstifiedMixedSizes(); - for (auto [index, group] : - llvm::zip(indices, collapseShapeOp.getReassociationIndices())) { - assert(!group.empty() && "association indices groups cannot be empty"); - int64_t groupSize = group.size(); - - if (groupSize == 1) { - sourceIndices.push_back(index); - continue; - } - - SmallVector basis = - llvm::map_to_vector(group, [&](int64_t d) { return sourceSizes[d]; }); - auto delinearize = rewriter.create( - loc, index, basis, /*hasOuterBound=*/true); - llvm::append_range(sourceIndices, delinearize.getResults()); - } - if (collapseShapeOp.getReassociationIndices().empty()) { - auto zeroAffineMap = rewriter.getConstantAffineMap(0); - int64_t srcRank = - cast(collapseShapeOp.getViewSource().getType()).getRank(); - OpFoldResult ofr = affine::makeComposedFoldedAffineApply( - rewriter, loc, zeroAffineMap, ArrayRef{}); - for (int64_t i = 0; i < srcRank; i++) { - sourceIndices.push_back( - getValueOrCreateConstantIndexOp(rewriter, loc, ofr)); - } - } - return success(); -} - /// Helpers to access the memref operand for each op. 
template static Value getMemRefOperand(LoadOrStoreOpTy op) { diff --git a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp index a50b4cfc74708..97fe3cb5b4705 100644 --- a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp +++ b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp @@ -12,6 +12,7 @@ #include "mlir/Dialect/MemRef/Utils/MemRefUtils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Interfaces/ViewLikeInterface.h" #include "llvm/ADT/STLExtras.h" @@ -217,5 +218,70 @@ MemrefValue skipViewLikeOps(MemrefValue source) { return source; } +LogicalResult resolveSourceIndicesExpandShape( + Location loc, PatternRewriter &rewriter, + memref::ExpandShapeOp expandShapeOp, ValueRange indices, + SmallVectorImpl &sourceIndices, bool startsInbounds) { + SmallVector destShape = expandShapeOp.getMixedOutputShape(); + + // Traverse all reassociation groups to determine the appropriate indices + // corresponding to each one of them post op folding. + for (ArrayRef group : expandShapeOp.getReassociationIndices()) { + assert(!group.empty() && "association indices groups cannot be empty"); + int64_t groupSize = group.size(); + if (groupSize == 1) { + sourceIndices.push_back(indices[group[0]]); + continue; + } + SmallVector groupBasis = + llvm::map_to_vector(group, [&](int64_t d) { return destShape[d]; }); + SmallVector groupIndices = + llvm::map_to_vector(group, [&](int64_t d) { return indices[d]; }); + Value collapsedIndex = rewriter.create( + loc, groupIndices, groupBasis, /*disjoint=*/startsInbounds); + sourceIndices.push_back(collapsedIndex); + } + return success(); +} + +LogicalResult +resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter, + memref::CollapseShapeOp collapseShapeOp, + ValueRange indices, + SmallVectorImpl &sourceIndices) { + // Note: collapse_shape requires a strided memref, we can do this. + auto metadata = rewriter.create( + loc, collapseShapeOp.getSrc()); + SmallVector sourceSizes = metadata.getConstifiedMixedSizes(); + for (auto [index, group] : + llvm::zip(indices, collapseShapeOp.getReassociationIndices())) { + assert(!group.empty() && "association indices groups cannot be empty"); + int64_t groupSize = group.size(); + + if (groupSize == 1) { + sourceIndices.push_back(index); + continue; + } + + SmallVector basis = + llvm::map_to_vector(group, [&](int64_t d) { return sourceSizes[d]; }); + auto delinearize = rewriter.create( + loc, index, basis, /*hasOuterBound=*/true); + llvm::append_range(sourceIndices, delinearize.getResults()); + } + if (collapseShapeOp.getReassociationIndices().empty()) { + auto zeroAffineMap = rewriter.getConstantAffineMap(0); + int64_t srcRank = + cast(collapseShapeOp.getViewSource().getType()).getRank(); + OpFoldResult ofr = affine::makeComposedFoldedAffineApply( + rewriter, loc, zeroAffineMap, ArrayRef{}); + for (int64_t i = 0; i < srcRank; i++) { + sourceIndices.push_back( + getValueOrCreateConstantIndexOp(rewriter, loc, ofr)); + } + } + return success(); +} + } // namespace memref } // namespace mlir From 3db555db0fa24b54662bf449609479bba8933ea1 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Tue, 22 Jul 2025 23:52:39 -0400 Subject: [PATCH 7/9] update tests. 
---
 .../AMDGPU/Transforms/FoldSubviewOps.cpp      | 76 +++++++++++--------
 .../Dialect/AMDGPU/amdgpu-fold-subviews.mlir  | 52 ++++++++++++-
 2 files changed, 94 insertions(+), 34 deletions(-)

diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
index f005842d83306..95ba0a76d4510 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
@@ -18,12 +18,7 @@
 namespace mlir::amdgpu {
 #define GEN_PASS_DEF_AMDGPUFOLDSUBVIEWOPSPASS
 #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
-} // namespace mlir::amdgpu
-
-using namespace mlir;
-using namespace mlir::amdgpu;
 
-namespace {
 struct AmdgpuFoldSubviewOpsPass
     : public amdgpu::impl::AmdgpuFoldSubviewOpsPassBase<
           AmdgpuFoldSubviewOpsPass> {
@@ -43,32 +38,51 @@ struct FoldSubviewIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
 
     Value memrefSource;
     SmallVector<Value> sourceIndices;
-    llvm::TypeSwitch<Operation *>(op.getSrc().getDefiningOp())
-        .Case<memref::SubViewOp>([&](memref::SubViewOp subviewOp) {
-          // If the source is a SubViewOp, we can directly rewrite the
-          // GatherToLDSOp.
-          mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
-              rewriter, loc, subviewOp.getMixedOffsets(),
-              subviewOp.getMixedStrides(), subviewOp.getDroppedDims(),
-              op.getSrcIndices(), sourceIndices);
-          memrefSource = subviewOp.getSource();
-        })
-        .Case<memref::ExpandShapeOp>([&](memref::ExpandShapeOp expandShapeOp) {
-          mlir::memref::resolveSourceIndicesExpandShape(
-              loc, rewriter, expandShapeOp, op.getSrcIndices(), sourceIndices,
-              false);
-          memrefSource = expandShapeOp.getViewSource();
-        })
-        .Case<memref::CollapseShapeOp>(
-            [&](memref::CollapseShapeOp collapseShapeOp) {
-              mlir::memref::resolveSourceIndicesCollapseShape(
-                  loc, rewriter, collapseShapeOp, op.getSrcIndices(),
-                  sourceIndices);
-              memrefSource = collapseShapeOp.getViewSource();
+    auto foldResult =
+        llvm::TypeSwitch<Operation *, LogicalResult>(
+            op.getSrc().getDefiningOp())
+            .Case<memref::SubViewOp>([&](memref::SubViewOp subviewOp) {
+              // If the source is a SubViewOp, we can directly rewrite the
+              // GatherToLDSOp.
+              mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
+                  rewriter, loc, subviewOp.getMixedOffsets(),
+                  subviewOp.getMixedStrides(), subviewOp.getDroppedDims(),
+                  op.getSrcIndices(), sourceIndices);
+              memrefSource = subviewOp.getSource();
+              return success();
+            })
+            .Case<memref::ExpandShapeOp>(
+                [&](memref::ExpandShapeOp expandShapeOp) {
+                  if (failed(mlir::memref::resolveSourceIndicesExpandShape(
+                          loc, rewriter, expandShapeOp, op.getSrcIndices(),
+                          sourceIndices, false))) {
+                    return failure();
+                  }
+                  memrefSource = expandShapeOp.getViewSource();
+                  return success();
+                })
+            .Case<memref::CollapseShapeOp>(
+                [&](memref::CollapseShapeOp collapseShapeOp) {
+                  if (failed(mlir::memref::resolveSourceIndicesCollapseShape(
+                          loc, rewriter, collapseShapeOp, op.getSrcIndices(),
+                          sourceIndices))) {
+                    return failure();
+                  }
+                  memrefSource = collapseShapeOp.getViewSource();
+                  return success();
+                })
+            .Default([&](Operation *op) {
+              // If the source is not a SubViewOp, ExpandShapeOp, or
+              // CollapseShapeOp, we cannot fold the GatherToLDSOp.
+              return rewriter.notifyMatchFailure(
+                  op,
+                  "source producer is not one of SubViewOp, ExpandShapeOp, or "
+                  "CollapseShapeOp");
             });
 
-    if (!memrefSource)
+    if (failed(foldResult)) {
       return failure();
+    }
 
     rewriter.replaceOpWithNewOp<GatherToLDSOp>(op, memrefSource, sourceIndices,
                                                op.getDst(), op.getDstIndices(),
@@ -77,9 +91,9 @@ struct FoldSubviewIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
     return success();
   }
 };
-} // namespace
 
-void mlir::amdgpu::populateAmdgpuFoldSubviewOpsPatterns(
-    RewritePatternSet &patterns, PatternBenefit benefit) {
+void populateAmdgpuFoldSubviewOpsPatterns(RewritePatternSet &patterns,
+                                          PatternBenefit benefit) {
   patterns.add<FoldSubviewIntoGatherToLDSOp>(patterns.getContext(), benefit);
 }
+} // namespace mlir::amdgpu
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
index a0f02a9bc9340..2c1b1a652fe1e 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
@@ -1,10 +1,10 @@
-// RUN: mlir-opt -amdgpu-fold-subview-ops -split-input-file %s | FileCheck %s
+// RUN: mlir-opt --amdgpu-fold-subview-ops --split-input-file %s | FileCheck %s
 
 #gpu_lds_addrspace = 3
 
-// CHECK: func @test_memref
+// CHECK: func @test_subview_folding
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
-func.func @test_memref(%offset_i: index, %offset_j: index) {
+func.func @test_subview_folding(%offset_i: index, %offset_j: index) {
   // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
   // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
@@ -48,3 +48,49 @@ func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
     : vector<8xf16>, memref<32x64xf16, strided<[128, 1], offset: 4160>>, memref<64x64xf16, #gpu_lds_addrspace>
   func.return
 }
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+// CHECK: func @test_expand_shape
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_expand_shape(%offset_i: index, %offset_j: index) {
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+  // CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
+  // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (64, 128) : index
+  // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX]]], %[[LOCAL]][%[[C0]], %[[C0]]]
+  // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<64x64xf16, 3>
+
+  %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+  %mem = memref.alloc() : memref<8192xf16>
+  %expand = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16> into memref<64x128xf16>
+  %c0 = arith.constant 0 : index
+  amdgpu.gather_to_lds %expand[%offset_i, %offset_j], %alloc[%c0, %c0]
+    : vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu_lds_addrspace>
+  func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+// CHECK: func @test_collapse_shape
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_collapse_shape(%offset_i: index, %offset_j: index) {
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+  // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
+  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index
+  // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES]]#0, %[[INDICES]]#1], %[[LOCAL]][%[[C0]], %[[C0]]]
+  // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+
+  %alloc = memref.alloc() :
memref<64x64xf16, #gpu_lds_addrspace> + %mem = memref.alloc() : memref<64x128xf16> + %collapse = memref.collapse_shape %mem [[0, 1]] : memref<64x128xf16> into memref<8192xf16> + %c0 = arith.constant 0 : index + amdgpu.gather_to_lds %collapse[%offset_i], %alloc[%c0, %c0] + : vector<8xf16>, memref<8192xf16>, memref<64x64xf16, #gpu_lds_addrspace> + func.return +} From d6746b955d060549a2fb91105f3422a2f9996b03 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Wed, 23 Jul 2025 08:36:32 -0400 Subject: [PATCH 8/9] Rename and update --- mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td | 10 +++++----- mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt | 2 +- .../{FoldSubviewOps.cpp => FoldMemRefsOps.cpp} | 10 +++++----- ...gpu-fold-subviews.mlir => amdgpu-fold-memrefs.mlir} | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) rename mlir/lib/Dialect/AMDGPU/Transforms/{FoldSubviewOps.cpp => FoldMemRefsOps.cpp} (94%) rename mlir/test/Dialect/AMDGPU/{amdgpu-fold-subviews.mlir => amdgpu-fold-memrefs.mlir} (98%) diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td index fad939ced9877..76b8c825ac272 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td @@ -71,12 +71,12 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> { ]; } -def AmdgpuFoldSubviewOpsPass : Pass<"amdgpu-fold-subview-ops"> { - let summary = "Fold subview operations into their parent operations"; +def AmdgpuFoldSubviewOpsPass : Pass<"amdgpu-fold-memrefs-ops"> { + let summary = "Fold memref operations into their parent operations"; let description = [{ - This pass identifies `memref.subview` sources of `GatherToLDSOp` and - attempts to fold the source ops, potentially simplifying the overall - operation and improving performance. + This pass identifies memref operations (subview, expand_shape, collapse_shape) + that are sources of `GatherToLDSOp` and attempts to fold the source ops, + potentially simplifying the overall operation and improving performance. }]; let dependentDialects = [ "memref::MemRefDialect" diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt index 20621ec0d55a4..3b0c072ed1217 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt @@ -1,6 +1,6 @@ add_mlir_dialect_library(MLIRAMDGPUTransforms EmulateAtomics.cpp - FoldSubviewOps.cpp + FoldMemRefsOps.cpp MaskedloadToLoad.cpp ResolveStridedMetadata.cpp diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp similarity index 94% rename from mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp rename to mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp index 95ba0a76d4510..73923af1329db 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp +++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp @@ -1,4 +1,4 @@ -//===- FoldSubviewOps.cpp - AMDGPU fold subview ops ---------------------===// +//===- FoldSubviewOps.cpp - AMDGPU fold subview ops -----------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -19,9 +19,9 @@ namespace mlir::amdgpu {
 #define GEN_PASS_DEF_AMDGPUFOLDSUBVIEWOPSPASS
 #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
 
-struct AmdgpuFoldSubviewOpsPass
+struct AmdgpuFoldMemRefOpsPass
     : public amdgpu::impl::AmdgpuFoldSubviewOpsPassBase<
-          AmdgpuFoldSubviewOpsPass> {
+          AmdgpuFoldMemRefOpsPass> {
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
     populateAmdgpuFoldSubviewOpsPatterns(patterns);
@@ -30,7 +30,7 @@ struct AmdgpuFoldSubviewOpsPass
   }
 };
 
-struct FoldSubviewIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
+struct FoldMemRefOpsIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
   using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(GatherToLDSOp op,
                                 PatternRewriter &rewriter) const override {
@@ -94,6 +94,6 @@ struct FoldSubviewIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
 
 void populateAmdgpuFoldSubviewOpsPatterns(RewritePatternSet &patterns,
                                           PatternBenefit benefit) {
-  patterns.add<FoldSubviewIntoGatherToLDSOp>(patterns.getContext(), benefit);
+  patterns.add<FoldMemRefOpsIntoGatherToLDSOp>(patterns.getContext(), benefit);
 }
 } // namespace mlir::amdgpu
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
similarity index 98%
rename from mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
rename to mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
index 2c1b1a652fe1e..a751a4ac1158e 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt --amdgpu-fold-subview-ops --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --amdgpu-fold-memrefs-ops --split-input-file %s | FileCheck %s
 
 #gpu_lds_addrspace = 3
 
From 2d653d05755e960a0f9345cf73530256d8a4bc23 Mon Sep 17 00:00:00 2001
From: Alan Li
Date: Wed, 23 Jul 2025 10:20:03 -0400
Subject: [PATCH 9/9] Fix according to comments

---
 .../mlir/Dialect/AMDGPU/Transforms/Passes.h   |  6 +++---
 .../mlir/Dialect/AMDGPU/Transforms/Passes.td  |  2 +-
 .../AMDGPU/Transforms/FoldMemRefsOps.cpp      | 18 ++++++++----------
 .../Dialect/AMDGPU/amdgpu-fold-memrefs.mlir   | 18 ++++++++----------
 4 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
index a61903609aaff..58b9c74b2f8e0 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
@@ -22,7 +22,7 @@ class ConversionTarget;
 namespace amdgpu {
 
 #define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS
-#define GEN_PASS_DECL_AMDGPUFOLDSUBVIEWOPSPASS
+#define GEN_PASS_DECL_AMDGPUFOLDMEMREFOPSPASS
 #define GEN_PASS_DECL_AMDGPUMASKEDLOADTOLOADPASS
 #define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS
 #define GEN_PASS_REGISTRATION
@@ -39,8 +39,8 @@ void populateAmdgpuResolveStridedMetadataPatterns(RewritePatternSet &patterns,
 void populateAmdgpuMaskedloadToLoadPatterns(RewritePatternSet &patterns,
                                             PatternBenefit benefit = 1);
 
-void populateAmdgpuFoldSubviewOpsPatterns(RewritePatternSet &patterns,
-                                          PatternBenefit benefit = 1);
+void populateAmdgpuFoldMemRefOpsPatterns(RewritePatternSet &patterns,
+                                         PatternBenefit benefit = 1);
 
 } // namespace amdgpu
 } // namespace mlir
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
index 76b8c825ac272..8664f971cabde 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
@@ -71,7 +71,7 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> {
   ];
 }
 
-def AmdgpuFoldSubviewOpsPass : Pass<"amdgpu-fold-memrefs-ops"> {
+def AmdgpuFoldMemRefOpsPass : Pass<"amdgpu-fold-memrefs-ops"> {
   let summary = "Fold memref operations into their parent operations";
   let description = [{
     This pass identifies memref operations (subview, expand_shape, collapse_shape)
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
index 73923af1329db..a3fdc7ee385ed 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
@@ -12,21 +12,19 @@
 #include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/WalkPatternRewriteDriver.h"
 #include "llvm/ADT/TypeSwitch.h"
 
 namespace mlir::amdgpu {
-#define GEN_PASS_DEF_AMDGPUFOLDSUBVIEWOPSPASS
+#define GEN_PASS_DEF_AMDGPUFOLDMEMREFOPSPASS
 #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
 
-struct AmdgpuFoldMemRefOpsPass
-    : public amdgpu::impl::AmdgpuFoldSubviewOpsPassBase<
-          AmdgpuFoldMemRefOpsPass> {
+struct AmdgpuFoldMemRefOpsPass final
+    : amdgpu::impl::AmdgpuFoldMemRefOpsPassBase<AmdgpuFoldMemRefOpsPass> {
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
-    populateAmdgpuFoldSubviewOpsPatterns(patterns);
-    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
-      signalPassFailure();
+    populateAmdgpuFoldMemRefOpsPatterns(patterns);
+    walkAndApplyPatterns(getOperation(), std::move(patterns));
   }
 };
 
@@ -92,8 +90,8 @@ struct FoldMemRefOpsIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
   }
 };
 
-void populateAmdgpuFoldSubviewOpsPatterns(RewritePatternSet &patterns,
-                                          PatternBenefit benefit) {
+void populateAmdgpuFoldMemRefOpsPatterns(RewritePatternSet &patterns,
+                                         PatternBenefit benefit) {
   patterns.add<FoldMemRefOpsIntoGatherToLDSOp>(patterns.getContext(), benefit);
 }
 } // namespace mlir::amdgpu
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
index a751a4ac1158e..57afa127c9da8 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
@@ -5,10 +5,10 @@
 // CHECK: func @test_subview_folding
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @test_subview_folding(%offset_i: index, %offset_j: index) {
-  // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
   // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
-  // CHECK: amdgpu.gather_to_lds %[[MEM]][%arg0, %arg1], %[[LOCAL]][%[[C0]], %[[C0]]]
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[ARG0]], %[[ARG1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
   // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
 
   %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
@@ -30,14 +30,12 @@ func.func @test_subview_folding(%offset_i: index, %offset_j: index) {
 // CHECK: func @subview_folding_offset
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
-  // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
   // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
-
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]]
   //
CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]] - - // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]] + // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]] // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3> %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace> @@ -56,11 +54,11 @@ func.func @subview_folding_offset(%offset_i: index, %offset_j: index) { // CHECK: func @test_expand_shape // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index func.func @test_expand_shape(%offset_i: index, %offset_j: index) { - // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3> // CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16> + // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (64, 128) : index - // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX]]], %[[LOCAL]][%[[C0]], %[[C0]]] + // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX]]], %[[LOCAL]][%[[C0]], %[[C0]]] // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<64x64xf16, 3> %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace> @@ -79,11 +77,11 @@ func.func @test_expand_shape(%offset_i: index, %offset_j: index) { // CHECK: func @test_collapse_shape // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index func.func @test_collapse_shape(%offset_i: index, %offset_j: index) { - // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3> // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16> + // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index - // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES]]#0, %[[INDICES]]#1], %[[LOCAL]][%[[C0]], %[[C0]]] + // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES]]#0, %[[INDICES]]#1], %[[LOCAL]][%[[C0]], %[[C0]]] // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3> %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>