ROCm
diff --git a/‎mlir/include/mlir/Dialect/Rock/IR/AccelEmitter.h‎
Lines changed: 8 additions & 16 deletions b/‎mlir/include/mlir/Dialect/Rock/IR/AccelEmitter.h‎
Lines changed: 8 additions & 16 deletions
diff --git a/‎mlir/include/mlir/Dialect/Rock/IR/RockAttrDefs.td‎
Lines changed: 25 additions & 0 deletions b/‎mlir/include/mlir/Dialect/Rock/IR/RockAttrDefs.td‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎mlir/include/mlir/Dialect/Rock/IR/RockOps.td‎
Lines changed: 18 additions & 20 deletions b/‎mlir/include/mlir/Dialect/Rock/IR/RockOps.td‎
Lines changed: 18 additions & 20 deletions
diff --git a/‎mlir/lib/Dialect/Rock/IR/RockDialect.cpp‎
Lines changed: 25 additions & 22 deletions b/‎mlir/lib/Dialect/Rock/IR/RockDialect.cpp‎
Lines changed: 25 additions & 22 deletions
diff --git a/‎mlir/lib/Dialect/Rock/Transforms/BlockwiseGemmToThreadwise.cpp‎
Lines changed: 20 additions & 19 deletions b/‎mlir/lib/Dialect/Rock/Transforms/BlockwiseGemmToThreadwise.cpp‎
Lines changed: 20 additions & 19 deletions
@@ -100,10 +100,8 @@ struct AccelEmitter {
   /// is dependent on the type of accelerator we are targeting
   virtual Value
   wrapLDSBufferForLoad(OpBuilder &b, Location loc, Value buffer,
-                       int64_t blockSize, int64_t dInCopyPerThread,
-                       StringRef dName, bool rotateDWithK, bool directToLds,
-                       bool ldsLayoutDxK,
-                       bool doSplitKAcrossThreadsFirst = false) const = 0;
+                       const BlockwiseMatrixParamsAttr &matrixParams,
+                       int64_t blockSize, StringRef dName) const = 0;
 
   /// This functions creates the subtile views that is :
   /// 1) gridSubTileView :
@@ -177,12 +175,9 @@ struct MfmaEmitter : public AccelEmitter {
   void emitThreadwiseLoop(OpBuilder &b, Location loc, Value argA, Value argB,
                           Value bufferC, ValueRange regCOffset) override;
 
-  Value
-  wrapLDSBufferForLoad(OpBuilder &b, Location loc, Value buffer,
-                       int64_t blockSize, int64_t dInCopyPerThread,
-                       StringRef dName, bool rotateDWithK, bool directToLds,
-                       bool ldsLayoutDxK,
-                       bool doSplitKAcrossThreadsFirst = false) const override;
+  Value wrapLDSBufferForLoad(OpBuilder &b, Location loc, Value buffer,
+                             const BlockwiseMatrixParamsAttr &matrixParams,
+                             int64_t blockSize, StringRef dName) const override;
 
   FailureOr<RegsAsMatrixSubTiles> createAccelGemmOperandTransforms(
       OpBuilder &b, Location loc, int64_t kIters,
@@ -225,12 +220,9 @@ struct WmmaEmitter : public AccelEmitter {
   void emitThreadwiseLoop(OpBuilder &b, Location loc, Value argA, Value argB,
                           Value bufferC, ValueRange regCOffset) override;
 
-  Value
-  wrapLDSBufferForLoad(OpBuilder &b, Location loc, Value buffer,
-                       int64_t blockSize, int64_t dInCopyPerThread,
-                       StringRef dName, bool rotateDWithK, bool directToLds,
-                       bool ldsLayoutDxK,
-                       bool doSplitKAcrossThreadsFirst = false) const override;
+  Value wrapLDSBufferForLoad(OpBuilder &b, Location loc, Value buffer,
+                             const BlockwiseMatrixParamsAttr &matrixParams,
+                             int64_t blockSize, StringRef dName) const override;
 
   FailureOr<RegsAsMatrixSubTiles> createAccelGemmOperandTransforms(
       OpBuilder &b, Location loc, int64_t kIters,
 
@@ -556,4 +556,29 @@ def Rock_PrefillAttr : Rock_Attr<"Prefill"> {
   let assemblyFormat = "`<` params `>`";
 }
 
+def Rock_BlockwiseMatrixParamsAttr : Rock_Attr<"BlockwiseMatrixParams", []> {
+  let mnemonic = "blockwise_matrix_params";
+  let description = [{
+    Encapsulates rock.blockwise_load_tile and rock.blockwise_gemm_accel parameters.
+    - elementType: Element type of the matrix operation.
+    - elementTypeLoad: Element type that was actually loaded from memory (before any input fusion).
+    - rotateDWithK: Trick to reduce LDS bank conflicts (see more info here: https://github.com/ROCm/rocMLIR/pull/1209)
+    - swapThreadIterSubDims: Trick to reduce LDS bank conflicts (see more info here: https://github.com/ROCm/rocMLIR/pull/1209)
+    - LDSLayoutDxK: Whether the layout in LDS is DxK
+    - directToLDS: Whether direct to LDS is enabled
+    - splitKAcrossThreadsFirst: Used for attention, when bypassing LDS for the result of the first GEMM, explanation here: https://github.com/ROCm/rocMLIR-internal/issues/1201#issuecomment-1898925539
+    - g: gemm parameter G
+    - d: gemm parameter D (could be M or N)
+    - inDPerThread: How many elements of D (M or N) each thread is going to load from memory.
+  }];
+  let parameters = (ins "Type":$elementType, "Type":$elementTypeLoad,
+      "bool":$rotateDWithK, "bool":$swapThreadIterSubDims, "bool":$LDSLayoutDxK,
+      "bool":$directToLDS, "bool":$splitKAcrossThreadsFirst, "int64_t":$g,
+      "int64_t":$d, "int64_t":$inDPerThread);
+
+  let assemblyFormat = [{
+    `<` struct(params) `>`
+  }];
+}
+
 #endif
@@ -1365,26 +1365,22 @@ def Rock_BlockwiseGemmAccelOp
               [AttrSizedOperandSegments,
                DeclareOpInterfaceMethods<RockGemmFeaturesInterface>,
                DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]>,
-      Arguments<(ins Optional<MemRefOf<LdsBufferTypes>>:$matrixA,
+      Arguments<(ins MemRefOf<LdsBufferTypes>:$bufferA,
+          MemRefOf<LdsBufferTypes>:$bufferB, MemRefOf<AccelResTypes>:$matrixC,
+          Rock_BlockwiseMatrixParamsAttr:$matrixParamsA,
+          Rock_BlockwiseMatrixParamsAttr:$matrixParamsB,
+          Optional<MemRefOf<LdsBufferTypes>>:$matrixA,
           Optional<MemRefOf<LdsBufferTypes>>:$matrixB,
           Optional<MemRefOf<LdsBufferTypes>>:$scaleA,
-          Optional<MemRefOf<LdsBufferTypes>>:$scaleB, I32Attr:$inMPerThread,
-          I32Attr:$inNPerThread, UnitAttr:$rotateMWithK, UnitAttr:$rotateNWithK,
-          UnitAttr:$loadAfromLDS, UnitAttr:$loadBfromLDS,
-          UnitAttr:$splitKAcrossThreadsFirstA,
-          UnitAttr:$splitKAcrossThreadsFirstB, UnitAttr:$directToLDS,
-          UnitAttr:$ldsLayoutMxK, UnitAttr:$ldsLayoutNxK,
-          MemRefOf<LdsBufferTypes>:$bufferA, MemRefOf<LdsBufferTypes>:$bufferB,
-          MemRefOf<AccelResTypes>:$matrixC,
+          Optional<MemRefOf<LdsBufferTypes>>:$scaleB,
           Optional<MemRefOf<LdsBufferTypes>>:$bufferScaleA,
           Optional<MemRefOf<LdsBufferTypes>>:$bufferScaleB,
-          TypeAttr:$elementTypeA, TypeAttr:$elementTypeB,
           OptionalAttr<Rock_GemmFeaturesAttr>:$features, I32Attr:$blockSize,
           RockAccelTuningParamAttrInterface:$params)> {
   let summary = "Blockwise GEMM accelerated version";
   let description = [{
     The `rock.blockwise_gemm_accel` op does GEMM at workgroup (block) level.
-    - Matrix A and Matrix B shall reside on LDS or registers (depending on loadAfromLDS and loadBfromLDS).
+    - Matrix A and Matrix B shall reside in registers (bufferA/bufferB); if the optional matrixA or matrixB LDS operand is passed, it is first loaded from LDS into the corresponding buffer.
     - Matrix C shall be vectors.
 
     The elements of matrices A and B should be vectors of length kpack, or
@@ -1410,11 +1406,11 @@ def Rock_BlockwiseLoadTileOp
           Arg<Optional<MemRefOf<LdsBufferTypes>>, "destination LDS">:$destLDS,
           Arg<Optional<MemRefOf<NativeMemoryOpTypes>>,
               "destination registers">:$destRegisters,
-          Rock_GemmLoadTileTypeAttr:$loadType, UnitAttr:$isA,
-          TypeAttr:$elementTypeA, TypeAttr:$elementTypeB, TypeAttr:$elementType,
-          TypeAttr:$elementLoadType, UnitAttr:$rotateWithK,
-          UnitAttr:$swapThreadIterSubDims, UnitAttr:$LDSLayoutDxK,
-          Variadic<Index>:$sourceIndices, I64Attr:$G, I64Attr:$M, I64Attr:$N,
+          Rock_GemmLoadTileTypeAttr:$loadType, TypeAttr:$elementType,
+          TypeAttr:$elementLoadType,
+          Rock_BlockwiseMatrixParamsAttr:$matrixParamsA,
+          Rock_BlockwiseMatrixParamsAttr:$matrixParamsB, UnitAttr:$isA,
+          Variadic<Index>:$sourceIndices,
           OptionalAttr<Rock_GemmFeaturesAttr>:$features, I32Attr:$blockSize,
           RockAccelTuningParamAttrInterface:$params)> {
   let summary =
@@ -1427,6 +1423,8 @@ def Rock_BlockwiseLoadTileOp
     - Default: Creates two stages, (1) load from memory, (2) write to LDS.
     - BypassLDS: Bypasses LDS and loads from device memory to registers directly (only one stage).
     - DoubleBuffer: Creates three stages, (1) load from memory, (2) write to LDS, (3) load to registers.
+    - DirectToLDSDefault: Same as Default, but a single stage loads from memory and writes to LDS.
+    - DirectToLDSDoubleBuffer: Same as DoubleBuffer, but a single stage loads from memory and writes to LDS.
 
     `isA` determines if we are loading an A matrix or B matrix. `G`, `M` and `N` are the GEMM sizes.
     `elementTypeA` and `elementTypeB` are used to construct AccelEmitter. They are data types for the Matrix A & B of the GEMMs. 
@@ -1464,9 +1462,9 @@ def Rock_ThreadwiseGemmOp
   let hasVerifier = 1;
 }
 
-// threadwise_accel_gemm
-def Rock_ThreadwiseAccelGemmOp
-    : Rock_Op<"threadwise_accel_gemm",
+// threadwise_gemm_accel
+def Rock_ThreadwiseGemmAccelOp
+    : Rock_Op<"threadwise_gemm_accel",
               [DeclareOpInterfaceMethods<RockGemmFeaturesInterface>,
                DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
                AttrSizedOperandSegments]>,
@@ -1481,7 +1479,7 @@ def Rock_ThreadwiseAccelGemmOp
           RockAccelTuningParamAttrInterface:$params)> {
   let summary = "Accelerated GEMM";
   let description = [{
-    The `rock.accel_gemm` op is an abstraction of doing GEMM based on an accelerator.
+    The `rock.threadwise_gemm_accel` op is an abstraction of doing GEMM based on an accelerator.
     It would employ a series of accelerator (e.g., mfma or wmma) operations.
 
     Matrices A and B reside in LDS, the buffers live in registers, C is a vector
 
@@ -2214,6 +2214,14 @@ LogicalResult BlockwiseLoadTileOp::verify() {
   GemmLoadTileType loadType = getLoadType();
   bool singleBuffer = loadType == GemmLoadTileType::Default ||
                       loadType == GemmLoadTileType::DirectToLDSDefault;
+  bool directToLDS = loadType == GemmLoadTileType::DirectToLDSDefault ||
+                     loadType == GemmLoadTileType::DirectToLDSDoubleBuffer;
+
+  bool paramsDirectToLDS = getIsA() ? getMatrixParamsA().getDirectToLDS()
+                                    : getMatrixParamsB().getDirectToLDS();
+
+  if (paramsDirectToLDS != directToLDS)
+    return emitOpError("Inconsistency between params and load type");
 
   if (!destLDS && loadType != GemmLoadTileType::BypassLDS)
     return emitOpError("destLDS must be set unless loadType is BypassLDS");
@@ -2261,20 +2269,17 @@ void BlockwiseGemmOp::getEffects(
 //===----------------------------------------------------------------------===//
 
 LogicalResult BlockwiseGemmAccelOp::verify() {
-  bool loadAFromLDS = getLoadAfromLDS();
-  bool loadBFromLDS = getLoadBfromLDS();
   bool hasA = getMatrixA() != nullptr;
   bool hasB = getMatrixB() != nullptr;
+  bool directToLDS = getMatrixParamsA().getDirectToLDS() ||
+                     getMatrixParamsB().getDirectToLDS();
 
-  if (loadAFromLDS && !hasA)
-    return emitOpError("If loadAFromLDS is enabled, matrixA must be non-null.");
-  if (loadBFromLDS && !hasB)
-    return emitOpError("If loadBFromLDS is enabled, matrixB must be non-null.");
-
-  if (hasA && getElementTypeOrSelfRecursive(getMatrixA()) != getElementTypeA())
+  if (hasA && getElementTypeOrSelfRecursive(getMatrixA()) !=
+                  getMatrixParamsA().getElementType())
     return emitOpError("ElementTypeA and matrixA element type don't match");
 
-  if (hasB && getElementTypeOrSelfRecursive(getMatrixB()) != getElementTypeB())
+  if (hasB && getElementTypeOrSelfRecursive(getMatrixB()) !=
+                  getMatrixParamsB().getElementType())
-    return emitOpError("ElementTypeA and matrixA element type don't match");
+    return emitOpError("ElementTypeB and matrixB element type don't match");
 
   bool hasScaleABuffer = getBufferScaleA() != nullptr;
@@ -2289,9 +2294,9 @@ LogicalResult BlockwiseGemmAccelOp::verify() {
   StringAttr archAttr = rock::getArch(*this).value_or(
       StringAttr::get(this->getContext(), "gfx00"));
 
-  if (loadAFromLDS && loadBFromLDS)
+  if (hasA && hasB)
     if (failed(verifyGemmTypes(*this, rock::getFeatures(*this), archAttr, aType,
-                               bType, cType)))
+                               bType, directToLDS ? nullptr : cType)))
       return failure();
   auto verifyMatrixAndScale = [&](bool loadFromLds, Value matrix, Value lds,
                                   Value bufferScale, ShapedType bufferType,
@@ -2363,12 +2368,12 @@ LogicalResult BlockwiseGemmAccelOp::verify() {
   };
 
   // Verify matrix A and its scales
-  if (failed(verifyMatrixAndScale(loadAFromLDS, getMatrixA(), getScaleA(),
+  if (failed(verifyMatrixAndScale(hasA, getMatrixA(), getScaleA(),
                                   getBufferScaleA(), aBufferType, "A")))
     return failure();
 
   // Verify matrix B and its scales
-  if (failed(verifyMatrixAndScale(loadBFromLDS, getMatrixB(), getScaleB(),
+  if (failed(verifyMatrixAndScale(hasB, getMatrixB(), getScaleB(),
                                   getBufferScaleB(), bBufferType, "B")))
     return failure();
 
@@ -2380,7 +2385,7 @@ LogicalResult BlockwiseGemmAccelOp::verify() {
 }
 
 SmallVector<mlir::Type> BlockwiseGemmAccelOp::getTypesForFeature() {
-  return {getMatrixA().getType()};
+  return {getMatrixParamsA().getElementType()};
 }
 
 void BlockwiseGemmAccelOp::getEffects(
@@ -2398,17 +2403,15 @@ void BlockwiseGemmAccelOp::getEffects(
     effects.emplace_back(read, &getBufferScaleBMutable()[0]);
   }
   // if we load from LDS, we need to write to registers
-  if (getLoadAfromLDS()) {
-    assert(getMatrixA() != nullptr);
+  if (getMatrixA() != nullptr) {
     effects.emplace_back(read, &getMatrixAMutable()[0]);
     effects.emplace_back(write, &getBufferAMutable());
     if (getScaleA()) {
       effects.emplace_back(read, &getScaleAMutable()[0]);
       effects.emplace_back(write, &getBufferScaleAMutable()[0]);
     }
   }
-  if (getLoadBfromLDS()) {
-    assert(getMatrixB() != nullptr);
+  if (getMatrixB() != nullptr) {
     effects.emplace_back(read, &getMatrixBMutable()[0]);
     effects.emplace_back(write, &getBufferBMutable());
     if (getScaleB()) {
@@ -2443,13 +2446,13 @@ void ThreadwiseGemmOp::getEffects(
 }
 
 //===----------------------------------------------------------------------===//
-// ThreadwiseAccelGemmOp
+// ThreadwiseGemmAccelOp
 //===----------------------------------------------------------------------===//
-SmallVector<mlir::Type> ThreadwiseAccelGemmOp::getTypesForFeature() {
+SmallVector<mlir::Type> ThreadwiseGemmAccelOp::getTypesForFeature() {
   return {getMatrixA().getType()};
 }
 
-LogicalResult ThreadwiseAccelGemmOp::verify() {
+LogicalResult ThreadwiseGemmAccelOp::verify() {
   ShapedType aType = cast<ShapedType>(getMatrixA().getType());
   ShapedType bType = cast<ShapedType>(getMatrixB().getType());
 
@@ -2489,7 +2492,7 @@ LogicalResult ThreadwiseAccelGemmOp::verify() {
   return success();
 }
 
-void ThreadwiseAccelGemmOp::getEffects(
+void ThreadwiseGemmAccelOp::getEffects(
     SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
   if (getScaleA()) {
     auto *read = MemoryEffects::Read::get();
 
@@ -411,11 +411,13 @@ struct BlockwiseGemmAccelRewritePattern
     int64_t kpackPerBlock = tuningParams.getKpackPerBlock();
     int64_t mPerWave = tuningParams.getMPerWave();
     int64_t nPerWave = tuningParams.getNPerWave();
-    bool loadAFromLDS = adaptor.getLoadAfromLDS();
-    bool loadBFromLDS = adaptor.getLoadBfromLDS();
+    bool loadAFromLDS = adaptor.getMatrixA() != nullptr;
+    bool loadBFromLDS = adaptor.getMatrixB() != nullptr;
+    BlockwiseMatrixParamsAttr matrixParamsA = op.getMatrixParamsA();
+    BlockwiseMatrixParamsAttr matrixParamsB = op.getMatrixParamsB();
 
-    Type dataTypeA = adaptor.getElementTypeA();
-    Type dataTypeB = adaptor.getElementTypeB();
+    Type dataTypeA = matrixParamsA.getElementType();
+    Type dataTypeB = matrixParamsB.getElementType();
 
     auto features = rock::getFeatures(op);
     auto accelEmitterPtr = rock::accel::AccelEmitter::select(
@@ -447,7 +449,8 @@ struct BlockwiseGemmAccelRewritePattern
                << "kpackPerBlock: " << kpackPerBlock << "\n"
                << "loadAFromLDS: " << loadAFromLDS << "\n"
                << "loadBFromLDS: " << loadBFromLDS << "\n"
-               << "rotateMWithK: " << op.getRotateMWithK() << "\n"
+               << "rotateMWithK: " << matrixParamsA.getRotateDWithK() << "\n"
+               << "rotateNWithK: " << matrixParamsB.getRotateDWithK() << "\n"
                << "bufferA type: " << adaptor.getBufferA().getType() << "\n"
                << "bufferB type: " << adaptor.getBufferB().getType() << "\n");
 
@@ -466,24 +469,20 @@ struct BlockwiseGemmAccelRewritePattern
     // considered a temporary hack until we have a proper way of "searching"
     // through different schedules (either heuristically or automatically)
 
-    bool directToLDS = op.getDirectToLDS();
     Value wrappedLDSBufferForLoadA, wrappedLDSBufferForLoadB;
     if (loadAFromLDS) {
       wrappedLDSBufferForLoadA = accelEmitterPtr->wrapLDSBufferForLoad(
-          b, loc, op.getMatrixA(), op.getBlockSize(), op.getInMPerThread(), "m",
-          op.getRotateMWithK(), directToLDS, op.getLdsLayoutMxK(),
-          op.getSplitKAcrossThreadsFirstA());
+          b, loc, op.getMatrixA(), matrixParamsA, op.getBlockSize(), "m");
     }
     if (loadBFromLDS) {
       wrappedLDSBufferForLoadB = accelEmitterPtr->wrapLDSBufferForLoad(
-          b, loc, op.getMatrixB(), op.getBlockSize(), op.getInNPerThread(), "n",
-          op.getRotateNWithK(), directToLDS, op.getLdsLayoutNxK(),
-          op.getSplitKAcrossThreadsFirstB());
+          b, loc, op.getMatrixB(), matrixParamsB, op.getBlockSize(), "n");
     }
 
     auto loadBuffer = [&](Value buffer, Value wrappedLDSBufferForLoad,
                           Value loopVar, Type argType, int64_t repeats,
-                          bool loadFromLDS, bool isA) -> Value {
+                          bool loadFromLDS, bool directToLDS,
+                          bool isA) -> Value {
       Value inputBuffer = buffer;
       SmallVector<int64_t> shape;
       if (directToLDS) {
@@ -544,8 +543,9 @@ struct BlockwiseGemmAccelRewritePattern
       Value i = mLoop.getInductionVar();
 
       Value bufferA = adaptor.getBufferA();
-      bufferA = loadBuffer(bufferA, wrappedLDSBufferForLoadA, i, argTypeA,
-                           mRepeats, loadAFromLDS, true);
+      bufferA =
+          loadBuffer(bufferA, wrappedLDSBufferForLoadA, i, argTypeA, mRepeats,
+                     loadAFromLDS, matrixParamsA.getDirectToLDS(), true);
       Value viewA =
           accelEmitterPtr->generateThreadwiseViewBufferA(b, loc, bufferA);
 
@@ -556,8 +556,9 @@ struct BlockwiseGemmAccelRewritePattern
         Value j = nLoop.getInductionVar();
 
         Value bufferB = adaptor.getBufferB();
-        bufferB = loadBuffer(bufferB, wrappedLDSBufferForLoadB, j, argTypeB,
-                             nRepeats, loadBFromLDS, false);
+        bufferB =
+            loadBuffer(bufferB, wrappedLDSBufferForLoadB, j, argTypeB, nRepeats,
+                       loadBFromLDS, matrixParamsB.getDirectToLDS(), false);
         Value viewB =
             accelEmitterPtr->generateThreadwiseViewBufferB(b, loc, bufferB);
 
@@ -569,8 +570,8 @@ struct BlockwiseGemmAccelRewritePattern
           Value viewC = accelEmitterPtr->generateThreadwiseViewBufferC(
               b, loc, adaptor.getMatrixC());
           Value k = kLoop.getInductionVar();
-          ThreadwiseAccelGemmOp::create(b, loc, viewA, viewB, viewC,
-                                        /*aScale=*/nullptr, /*bScale=*/nullptr,
+          ThreadwiseGemmAccelOp::create(b, loc, viewA, viewB, viewC,
+                                        /*scaleA=*/nullptr, /*scaleB=*/nullptr,
                                         ValueRange{i, j, k},
                                         op.getFeaturesAttr(), tuningParams);
         }