6 changes: 5 additions & 1 deletion mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
@@ -22,8 +22,9 @@ class ConversionTarget;
namespace amdgpu {

#define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS
#define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS
#define GEN_PASS_DECL_AMDGPUFOLDSUBVIEWOPSPASS
#define GEN_PASS_DECL_AMDGPUMASKEDLOADTOLOADPASS
#define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS
#define GEN_PASS_REGISTRATION
#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"

@@ -38,6 +39,9 @@ void populateAmdgpuResolveStridedMetadataPatterns(RewritePatternSet &patterns,
void populateAmdgpuMaskedloadToLoadPatterns(RewritePatternSet &patterns,
PatternBenefit benefit = 1);

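/// Collects patterns that fold `memref.subview` ops feeding the source of
/// `GatherToLDSOp` into the gather's source indices.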
void populateAmdgpuFoldSubviewOpsPatterns(RewritePatternSet &patterns,
PatternBenefit benefit = 1);

} // namespace amdgpu
} // namespace mlir

12 changes: 12 additions & 0 deletions mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
@@ -70,4 +70,16 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> {
"memref::MemRefDialect"
];
}

def AmdgpuFoldSubviewOpsPass : Pass<"amdgpu-fold-subview-ops"> {
let summary = "Fold subview operations into their parent operations";
let description = [{
This pass rewrites `GatherToLDSOp` operations whose source is produced by a
`memref.subview` so that they index the subview's source memref directly,
folding the subview's offsets and strides into the gather's source indices
and leaving the intermediate view dead, which simplifies the resulting IR.
}];
let dependentDialects = [
"memref::MemRefDialect"
];
}
#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_
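For reference, a minimal before/after sketch of the rewrite this pass performs (it mirrors the second test case added below; SSA value names are illustrative):

Before the fold:

%subview = memref.subview %mem[32, 64][32, 64][1, 1]
: memref<64x128xf16> to memref<32x64xf16, strided<[128, 1], offset: 4160>>
amdgpu.gather_to_lds %subview[%i, %j], %lds[%c0, %c0]
: vector<8xf16>, memref<32x64xf16, strided<[128, 1], offset: 4160>>, memref<64x64xf16, 3>

After the fold (the subview becomes dead and the gather indexes the original memref):

%i0 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%i]
%j0 = affine.apply affine_map<()[s0] -> (s0 + 64)>()[%j]
amdgpu.gather_to_lds %mem[%i0, %j0], %lds[%c0, %c0]
: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>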
3 changes: 2 additions & 1 deletion mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
@@ -1,7 +1,8 @@
add_mlir_dialect_library(MLIRAMDGPUTransforms
EmulateAtomics.cpp
ResolveStridedMetadata.cpp
FoldSubviewOps.cpp
MaskedloadToLoad.cpp
ResolveStridedMetadata.cpp

ADDITIONAL_HEADER_DIRS
{$MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Transforms
65 changes: 65 additions & 0 deletions mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
@@ -0,0 +1,65 @@
//===- FoldSubviewOps.cpp - AMDGPU fold subview ops ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"

#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

namespace mlir::amdgpu {
#define GEN_PASS_DEF_AMDGPUFOLDSUBVIEWOPSPASS
#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
} // namespace mlir::amdgpu

using namespace mlir;
using namespace mlir::amdgpu;

namespace {
struct AmdgpuFoldSubviewOpsPass
: public amdgpu::impl::AmdgpuFoldSubviewOpsPassBase<
AmdgpuFoldSubviewOpsPass> {
void runOnOperation() override {
RewritePatternSet patterns(&getContext());
populateAmdgpuFoldSubviewOpsPatterns(patterns);
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
signalPassFailure();
}
};

struct FoldSubviewIntoGatherToLDSOp : public OpRewritePattern<GatherToLDSOp> {
using OpRewritePattern<GatherToLDSOp>::OpRewritePattern;
LogicalResult matchAndRewrite(GatherToLDSOp op,
PatternRewriter &rewriter) const override {
Location loc = op.getLoc();

// Check if the source is a subview operation:
auto subviewOp = op.getSrc().getDefiningOp<memref::SubViewOp>();
if (!subviewOp)
return rewriter.notifyMatchFailure(
loc, "only memref.subview sources are currently supported; other "
"source ops may be handled in the future");

SmallVector<Value> sourceIndices;
mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
rewriter, loc, subviewOp.getMixedOffsets(), subviewOp.getMixedStrides(),
subviewOp.getDroppedDims(), op.getSrcIndices(), sourceIndices);
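// At this point `sourceIndices` addresses the original source memref
// directly (roughly offset[d] + stride[d] * srcIndex[d] for each retained
// dimension, with the subview's dropped dims accounted for), so the op can
// be rebuilt on the unfolded source.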

rewriter.replaceOpWithNewOp<GatherToLDSOp>(
op, subviewOp.getSource(), sourceIndices, op.getDst(),
op.getDstIndices(), op.getTransferType());

return success();
}
};
} // namespace

void mlir::amdgpu::populateAmdgpuFoldSubviewOpsPatterns(
RewritePatternSet &patterns, PatternBenefit benefit) {
patterns.add<FoldSubviewIntoGatherToLDSOp>(patterns.getContext(), benefit);
}
50 changes: 50 additions & 0 deletions mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
@@ -0,0 +1,50 @@
// RUN: mlir-opt -amdgpu-fold-subview-ops -split-input-file %s | FileCheck %s

#gpu_lds_addrspace = 3

// CHECK: func @test_memref
// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
func.func @test_memref(%offset_i: index, %offset_j: index) {
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
// CHECK: amdgpu.gather_to_lds %[[MEM]][%[[ARG0]], %[[ARG1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
// CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>

%alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
%mem = memref.alloc() : memref<64x128xf16>
%subview = memref.subview %mem[0, 0][32, 64][1, 1] : memref<64x128xf16> to memref<32x64xf16, strided<[128, 1]>>
%c0 = arith.constant 0 : index
amdgpu.gather_to_lds %subview[%offset_i, %offset_j], %alloc[%c0, %c0]
: vector<8xf16>, memref<32x64xf16, strided<[128, 1]>>, memref<64x64xf16, #gpu_lds_addrspace>
func.return
}

// -----

#gpu_lds_addrspace = 3

// CHECK: #[[MAP:.*]] = affine_map<()[s0] -> (s0 + 32)>
// CHECK: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 64)>

// CHECK: func @subview_folding_offset
// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>

// CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]]
// CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]]

// CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
// CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>

%alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
%mem = memref.alloc() : memref<64x128xf16>
%subview = memref.subview %mem[32, 64][32, 64][1, 1] : memref<64x128xf16> to memref<32x64xf16, strided<[128, 1], offset: 4160>>
%c0 = arith.constant 0 : index
amdgpu.gather_to_lds %subview[%offset_i, %offset_j], %alloc[%c0, %c0]
: vector<8xf16>, memref<32x64xf16, strided<[128, 1], offset: 4160>>, memref<64x64xf16, #gpu_lds_addrspace>
func.return
}