[mlir][xegpu] Bug fix in UpdateNdOffset distribution. #150545
Conversation
@llvm/pr-subscribers-mlir

Author: Charitha Saumya (charithaintc)

Full diff: https://github.com/llvm/llvm-project/pull/150545.diff (2 files affected):
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 8957ea5399ea2..2088c3c7fc5ec 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -277,22 +277,13 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
descOp, "the tensor descriptor lacks layout attribute");
SmallVector<size_t> newRetIndices;
- SmallVector<Value> newYieldValues;
- SmallVector<Type> newYieldTypes;
-
- for (Value operand : descOp->getOperands()) {
- newYieldValues.push_back(operand);
- newYieldTypes.push_back(operand.getType());
- }
rewriter.setInsertionPoint(warpOp);
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, /* new yieled values = */ newYieldValues,
- /* new yielded types = */ newYieldTypes, newRetIndices);
+ rewriter, warpOp, /* new yieled values = */ descOp->getOperands(),
+ /* new yielded types = */ descOp.getOperandTypes(), newRetIndices);
- SmallVector<Value> newDescOperands;
- for (size_t i : newRetIndices) {
- newDescOperands.push_back(newWarpOp.getResult(i));
- }
+ SmallVector<Value> newDescOperands = llvm::map_to_vector(
+ newRetIndices, [&](size_t i) { return newWarpOp.getResult(i); });
rewriter.setInsertionPointAfter(newWarpOp);
xegpu::TensorDescType distributedTensorDescTy =
descOp.getType().dropLayouts(); // Distributed tensor descriptor type
@@ -696,39 +687,30 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
warpOp, "warp result is not a xegpu::UpdateNdOffset op");
auto updateOp = operand->get().getDefiningOp<xegpu::UpdateNdOffsetOp>();
unsigned operandIdx = operand->getOperandNumber();
- // new update op does not have layout attribute.
- xegpu::TensorDescType newTensorDescTy =
- updateOp.getTensorDescType().dropLayouts();
- SmallVector<Value, 3> newYieldValues;
- SmallVector<Type, 3> newYieldTypes;
- for (Value operand : updateOp->getOperands()) {
- newYieldValues.push_back(operand);
- if (isa<xegpu::TensorDescType>(operand.getType())) {
- newYieldTypes.push_back(newTensorDescTy);
- } else {
- newYieldTypes.push_back(operand.getType());
- }
- }
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
+ rewriter, warpOp, updateOp->getOperands(), updateOp.getOperandTypes(),
+ newRetIndices);
rewriter.setInsertionPointAfter(newWarpOp);
- SmallVector<Value> newUpdateOperands;
- for (size_t i : newRetIndices) {
- // For the tensor descriptor operand, the layout attribute is dropped
- // after distribution. Types needs to be resolved in this case.
- if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
- newUpdateOperands.push_back(resolveDistributedTy(
- newWarpOp.getResult(i), newTensorDescTy, rewriter));
- } else {
- newUpdateOperands.push_back(newWarpOp.getResult(i));
- }
- }
+ // new update op does not have layout attribute.
+ xegpu::TensorDescType distributedTensorDescTy =
+ updateOp.getTensorDescType().dropLayouts();
+ SmallVector<Value> newUpdateOperands =
+ llvm::map_to_vector(newRetIndices, [&](size_t i) {
+ // For the tensor descriptor operand, the layout attribute is
+ // dropped after distribution. Types needs to be resolved in this
+ // case.
+ if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
+ return resolveDistributedTy(newWarpOp.getResult(i),
+ distributedTensorDescTy, rewriter);
+ }
+ return newWarpOp.getResult(i);
+ });
// Create a new update op outside the warp op.
auto newUpdateOp = xegpu::UpdateNdOffsetOp::create(
- rewriter, newWarpOp.getLoc(), newTensorDescTy, newUpdateOperands,
- updateOp->getAttrs());
+ rewriter, newWarpOp.getLoc(), distributedTensorDescTy,
+ newUpdateOperands, updateOp->getAttrs());
xegpu::removeLayoutAttrs(newUpdateOp);
Value distributedVal = newWarpOp.getResult(operandIdx);
// Resolve the distributed type with the original type.
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index e78ae4a17710b..4bfa797e2a9b3 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -xegpu-subgroup-distribute -canonicalize -cse -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -xegpu-subgroup-distribute -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s
// CHECK-LABEL: gpu.func @store_nd_1d
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) {
@@ -265,6 +265,28 @@ gpu.module @test {
}
}
+// -----
+// Explicitly check that update_nd_offset distributed tensor descriptor retains the layouts.
+// CHECK-LABEL: gpu.func @check_update_nd_offset_distributed_tensor_desc
+// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] ->
+// CHECK-SAME: (!xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
+// CHECK: %[[T0:.*]] = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: gpu.yield %[[T0]] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: }
+// CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]] :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf32> {resolve_simt_type_mismatch}
+// CHECK: xegpu.update_nd_offset %[[T1]], [%{{.*}}] : !xegpu.tensor_desc<16x16xf32>
+gpu.module @test {
+ gpu.func @check_update_nd_offset_distributed_tensor_desc() {
+ %c32 = arith.constant 32 : index
+ %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<1.000000e+00> : vector<16x16xf32>
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ xegpu.store_nd %cst, %1 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.return
+ }
+}
+
// -----
// CHECK-LABEL: gpu.func @prefetch_1d
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
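For context, the refactor above replaces explicit loop-plus-push_back accumulation with llvm::map_to_vector from llvm/ADT/STLExtras.h, which maps a range through a callable and collects the results into a SmallVector. A minimal standalone sketch of the idiom (the function doubled is illustrative only, not part of this patch):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"

  // Maps each element of xs through the lambda and collects the results,
  // replacing a manual for-loop with push_back calls.
  llvm::SmallVector<int> doubled(llvm::ArrayRef<int> xs) {
    return llvm::map_to_vector(xs, [](int x) { return 2 * x; });
  }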
Hi @adam-smnk, can you please take a look?
Can you add a short description including:
Added a description.
Nit on the description: you can skip the code snippets. The reproducer is now a test, and honestly the produced error is confusing.
Fix itself looks fine 👍
AFAIK, the issue here was pretty generic, in that a type mismatch occurred during partial distribution. Won't this problem occur for any other ops?
Thanks for the description.
Other ops seem to be handled correctly. I made a mistake when constructing the UpdateNdOffset op.
This example fails in subgroup distribution with a type-mismatch error (the reproducer is now included as a test above). The reason is that the UpdateNdOffset source operand does not retain its layout when it is yielded by the warp op. The warp_execute_on_lane_0 op expects that the TensorDesc type is unchanged during distribution out of its region; we use unrealized casts to reconcile this mismatch outside the warp op (via resolveDistributedTy).
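Concretely, the reconciliation matches the new test above: the warp op yields the tensor descriptor with its layout intact, and an unrealized cast bridges to the layout-free distributed type expected outside. A minimal sketch adapted from the test ("some_op" is a placeholder producer; %lane and %c32 are assumed to be defined earlier):

  %w = gpu.warp_execute_on_lane_0(%lane)[16]
      -> (!xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
    %d = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
    gpu.yield %d : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
  }
  // The cast reconciles the layout-carrying type produced inside the region
  // with the layout-free type the distributed update op consumes.
  %t = builtin.unrealized_conversion_cast %w
      : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
        to !xegpu.tensor_desc<16x16xf32> {resolve_simt_type_mismatch}
  %u = xegpu.update_nd_offset %t, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32>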