Skip to content

Commit 2fa3b3c

Browse files
Merge branch 'develop' into quick_tune_code_review
2 parents 70b50ac + 58c991b commit 2fa3b3c

File tree

12 files changed

+975
-245
lines changed

12 files changed

+975
-245
lines changed

mlir/lib/Conversion/MIGraphXToTosa/MIGraphXToTosa.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -584,7 +584,7 @@ BroadcastConverter::matchAndRewrite(migraphx::BroadcastOp op, OpAdaptor adaptor,
584584
// because tosa does not have an explicit broadcast op
585585
auto oneTensor = rock::tosa::getOneTensor(rewriter, loc, outType);
586586
auto mulWithOne = rock::tosa::getMulOp(rewriter, loc, sameRankReshapedOp,
587-
oneTensor, elemType);
587+
oneTensor, newOutElementTy);
588588
rewriter.replaceOp(op, mulWithOne);
589589
return success();
590590
}

mlir/lib/Conversion/TosaToRock/TosaToRock.cpp

Lines changed: 398 additions & 139 deletions
Large diffs are not rendered by default.

mlir/test/Conversion/TosaToRock/tosa-to-rock-attention-causal.mlir

Lines changed: 125 additions & 0 deletions
Large diffs are not rendered by default.

mlir/test/Conversion/TosaToRock/tosa-to-rock-attention-lse.mlir

Lines changed: 43 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -345,3 +345,46 @@ func.func @mlir_attention_single_token(%arg0: tensor<128xf32>, %arg1: tensor<256
345345
%collapsed_7 = tensor.collapse_shape %20 [[0, 1, 2]] : tensor<8x1x32xf32> into tensor<256xf32>
346346
return %collapsed_7, %collapsed_4 : tensor<256xf32>, tensor<8xf32>
347347
}
348+
349+
// CHECK-LABEL: @mlir_attention_lse_unfolded
350+
// CHECK: %[[lseBuffer:.+]] = bufferization.alloc_tensor() : tensor<8x1xf32>
351+
// CHECK: %{{.*}}, %[[lseOut:.*]] = rock.attention
352+
// CHECK: lse = %[[lseBuffer]] : tensor<8x1xf32>
353+
// CHECK: %[[lseExpanded:.*]] = tensor.expand_shape %[[lseOut]]
354+
// CHECK: %[[lseCollapsed:.*]] = tensor.collapse_shape %[[lseExpanded]]
355+
// CHECK: return %{{.*}}, %[[lseCollapsed]] : tensor<256xf32>, tensor<8xf32>
356+
// Test input for the @mlir_attention_lse_unfolded CHECK-LABEL. It takes three
// flat f32 tensors and returns two values: a tensor<256xf32> produced by the
// second tosa.matmul and a tensor<8xf32> produced by the tosa.log + tosa.add
// chain. The CHECK lines for this label expect one rock.attention op whose
// `lse` operand is a tensor<8x1xf32> buffer.
// NOTE(review): "lse" presumably stands for log-sum-exp -- confirm.
func.func private @mlir_attention_lse_unfolded(%arg0: tensor<128xf32>, %arg1: tensor<256xf32>, %arg2: tensor<128xf32>) -> (tensor<256xf32>, tensor<8xf32>) attributes {arch = "##TOKEN_ARCH##", kernel} {
357+
%0 = tosa.const_shape {values = dense<256> : tensor<1xindex>} : () -> !tosa.shape<1>
358+
%1 = tosa.const_shape {values = dense<[8, 1, 1]> : tensor<3xindex>} : () -> !tosa.shape<3>
359+
%2 = tosa.const_shape {values = dense<8> : tensor<1xindex>} : () -> !tosa.shape<1>
360+
%3 = tosa.const_shape {values = dense<[2, 4, 1, 1]> : tensor<4xindex>} : () -> !tosa.shape<4>
361+
%4 = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
362+
%5 = tosa.const_shape {values = dense<[8, 32, 1]> : tensor<3xindex>} : () -> !tosa.shape<3>
363+
%6 = tosa.const_shape {values = dense<[8, 1, 32]> : tensor<3xindex>} : () -> !tosa.shape<3>
364+
%7 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
365+
%8 = "tosa.const"() <{values = dense<1.000000e+00> : tensor<2x2x2x1x32xf32>}> : () -> tensor<2x2x2x1x32xf32>
366+
%9 = tosa.const_shape {values = dense<[2, 2, 1, 1, 32]> : tensor<5xindex>} : () -> !tosa.shape<5>
367+
%10 = tosa.const_shape {values = dense<[2, 4, 1, 32]> : tensor<4xindex>} : () -> !tosa.shape<4>
368+
// %arg0 and %arg2 are each expanded to 2x2x1x1x32 and multiplied by the
// all-ones constant %8 (2x2x2x1x32), which broadcasts them along dimension 2.
%expanded = tensor.expand_shape %arg0 [[0, 1, 2, 3, 4]] output_shape [2, 2, 1, 1, 32] : tensor<128xf32> into tensor<2x2x1x1x32xf32>
369+
%11 = tosa.mul %expanded, %8, %7 : (tensor<2x2x1x1x32xf32>, tensor<2x2x2x1x32xf32>, tensor<1xi8>) -> tensor<2x2x2x1x32xf32>
370+
%expanded_0 = tensor.expand_shape %arg2 [[0, 1, 2, 3, 4]] output_shape [2, 2, 1, 1, 32] : tensor<128xf32> into tensor<2x2x1x1x32xf32>
371+
%12 = tosa.mul %expanded_0, %8, %7 : (tensor<2x2x1x1x32xf32>, tensor<2x2x2x1x32xf32>, tensor<1xi8>) -> tensor<2x2x2x1x32xf32>
372+
%collapsed = tensor.collapse_shape %12 [[0], [1, 2], [3], [4]] : tensor<2x2x2x1x32xf32> into tensor<2x4x1x32xf32>
373+
%13 = tosa.transpose %collapsed {perms = array<i32: 0, 1, 3, 2>} : (tensor<2x4x1x32xf32>) -> tensor<2x4x32x1xf32>
374+
%expanded_1 = tensor.expand_shape %arg1 [[0, 1, 2]] output_shape [8, 1, 32] : tensor<256xf32> into tensor<8x1x32xf32>
375+
%collapsed_2 = tensor.collapse_shape %13 [[0, 1], [2], [3]] : tensor<2x4x32x1xf32> into tensor<8x32x1xf32>
376+
// First matmul: %arg1 viewed as 8x1x32 times the transposed, %arg2-derived
// 8x32x1 tensor, giving 8x1x1.
%14 = tosa.matmul %expanded_1, %collapsed_2, %4, %4 {acc_type = f32} : (tensor<8x1x32xf32>, tensor<8x32x1xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<8x1x1xf32>
377+
%expanded_3 = tensor.expand_shape %14 [[0, 1], [2], [3]] output_shape [2, 4, 1, 1] : tensor<8x1x1xf32> into tensor<2x4x1x1xf32>
378+
// The next four ops compute x - x, exp, reciprocal, and exp * reciprocal.
// NOTE(review): presumably a softmax over an axis of length 1 whose max/sum
// reductions were already folded away -- confirm against the test's origin.
%15 = tosa.sub %expanded_3, %expanded_3 : (tensor<2x4x1x1xf32>, tensor<2x4x1x1xf32>) -> tensor<2x4x1x1xf32>
379+
%16 = tosa.exp %15 : (tensor<2x4x1x1xf32>) -> tensor<2x4x1x1xf32>
380+
%17 = tosa.reciprocal %16 : (tensor<2x4x1x1xf32>) -> tensor<2x4x1x1xf32>
381+
%18 = tosa.mul %16, %17, %7 : (tensor<2x4x1x1xf32>, tensor<2x4x1x1xf32>, tensor<1xi8>) -> tensor<2x4x1x1xf32>
382+
// log(exp(x - x)) + x; collapsed below to tensor<8xf32> and returned as the
// second result.
%19 = tosa.log %16 : (tensor<2x4x1x1xf32>) -> tensor<2x4x1x1xf32>
383+
%20 = tosa.add %19, %expanded_3 : (tensor<2x4x1x1xf32>, tensor<2x4x1x1xf32>) -> tensor<2x4x1x1xf32>
384+
%collapsed_4 = tensor.collapse_shape %20 [[0, 1, 2, 3]] : tensor<2x4x1x1xf32> into tensor<8xf32>
385+
%collapsed_5 = tensor.collapse_shape %18 [[0, 1], [2], [3]] : tensor<2x4x1x1xf32> into tensor<8x1x1xf32>
386+
%collapsed_6 = tensor.collapse_shape %11 [[0, 1, 2], [3], [4]] : tensor<2x2x2x1x32xf32> into tensor<8x1x32xf32>
387+
// Second matmul: the 8x1x1 tensor derived from %18 times the %arg0-derived
// 8x1x32 tensor; flattened to tensor<256xf32> and returned as the first result.
%21 = tosa.matmul %collapsed_5, %collapsed_6, %4, %4 {acc_type = f32} : (tensor<8x1x1xf32>, tensor<8x1x32xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<8x1x32xf32>
388+
%collapsed_7 = tensor.collapse_shape %21 [[0, 1, 2]] : tensor<8x1x32xf32> into tensor<256xf32>
389+
return %collapsed_7, %collapsed_4 : tensor<256xf32>, tensor<8xf32>
390+
}
Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,38 @@
1+
// RUN: rocmlir-gen -fut mlir_attention --arch %arch --clone-harness %s | rocmlir-driver -kernel-pipeline=migraphx,highlevel -host-pipeline=migraphx,highlevel | rocmlir-gen -ph -rand_min_int 0 -rand_max_int 1024 -rand_type_int_for_inputs=3 -rand 1 -rand_type float -fut mlir_attention_wrapper -RMS_threshold 0.01 --verifier clone - | rocmlir-driver -host-pipeline mhal -kernel-pipeline full -targets %arch | xmir-runner --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext,%conv_validation_wrapper_library_dir/libconv-validation-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext,%linalg_test_lib_dir/libmlir_float16_utils%shlibext,%linalg_test_lib_dir/libmlir_c_runner_utils%shlibext,%linalg_test_lib_dir/libmlir_async_runtime%shlibext --entry-point-result=void | FileCheck %s
2+
// CHECK: [1 1 1]
3+
4+
// MIGraphX-dialect attention graph: dot -> scale -> masked select ->
// softmax along axis 3 -> dot, with the mask derived from the scalar si32
// argument %arg0. The result is reshaped to 1x1x4096 f16.
module {
5+
func.func @mlir_attention(%arg0: !migraphx.shaped<1x1x1xsi32, 1x1x1>, %arg1: !migraphx.shaped<1x96x1x128xf16, 12288x128x128x1>, %arg2: !migraphx.shaped<1x32x256x128xf16, 1048576x32768x128x1>, %arg3: !migraphx.shaped<1x32x256x128xf16, 1048576x32768x128x1>) -> !migraphx.shaped<1x1x4096xf16, 4096x4096x1> {
6+
// NOTE(review): this dense literal is malformed as shown -- the hex string
// that starts with 0x is never terminated and the closing of dense<...> is
// missing. The payload looks truncated; restore it from the original test.
%0 = migraphx.literal(dense<"0xtensor<256xsi32>) : <256xsi32, 1>
7+
// 0xFC00 is the IEEE half-precision bit pattern for -infinity; its broadcast
// (%12) is the first data operand of the migraphx.where below.
%1 = migraphx.literal(dense<0xFC00> : tensor<1xf16>) : <1xf16, 1>
8+
// 8.837890e-02 is approximately 1/sqrt(128); its broadcast (%4) multiplies the
// result of the first dot.
%2 = migraphx.literal(dense<8.837890e-02> : tensor<1xf16>) : <1xf16, 1>
9+
%3 = migraphx.multibroadcast %0 {out_dyn_dims = [], out_lens = [1, 1, 1, 256]} : <256xsi32, 1> -> <1x1x1x256xsi32, 0x0x0x1>
10+
%4 = migraphx.multibroadcast %2 {out_dyn_dims = [], out_lens = [1, 32, 1, 256]} : <1xf16, 1> -> <1x32x1x256xf16, 0x0x0x0>
11+
// Mask: the literal %0 is compared (greater) against the broadcast scalar
// %arg0, converted to si8, and broadcast to 1x32x1x256.
%5 = migraphx.broadcast %arg0 {axis = 0 : i64, out_lens = [1, 1, 1, 256]} : <1x1x1xsi32, 1x1x1> -> <1x1x1x256xsi32, 1x1x1x0>
12+
%6 = migraphx.greater %3, %5 : <1x1x1x256xsi32, 0x0x0x1>, <1x1x1x256xsi32, 1x1x1x0> -> <1x1x1x256xsi32, 0x0x0x1>
13+
%7 = migraphx.convert %6 {target_type = 0 : i64} : <1x1x1x256xsi32, 0x0x0x1> to <1x1x1x256xsi8, 0x0x0x1>
14+
%8 = migraphx.multibroadcast %7 {out_dyn_dims = [], out_lens = [1, 32, 1, 256]} : <1x1x1x256xsi8, 0x0x0x1> -> <1x32x1x256xsi8, 0x0x0x1>
15+
// First dot: the first 32 slices of %arg1 along axis 1 times the transposed
// %arg2.
%9 = migraphx.slice %arg1 {axes = [1], ends = [32], starts = [0]} : <1x96x1x128xf16, 12288x128x128x1> -> <1x32x1x128xf16, 12288x128x128x1>
16+
%10 = migraphx.transpose %arg2 {permutation = [0, 1, 3, 2]} : <1x32x256x128xf16, 1048576x32768x128x1> -> <1x32x128x256xf16, 1048576x32768x1x128>
17+
%11 = migraphx.dot %9, %10 : <1x32x1x128xf16, 12288x128x128x1>, <1x32x128x256xf16, 1048576x32768x1x128> -> <1x32x1x256xf16, 8192x256x256x1>
18+
%12 = migraphx.multibroadcast %1 {out_dyn_dims = [], out_lens = [1, 32, 1, 256]} : <1xf16, 1> -> <1x32x1x256xf16, 0x0x0x0>
19+
%13 = migraphx.mul %11, %4 : <1x32x1x256xf16, 8192x256x256x1>, <1x32x1x256xf16, 0x0x0x0> -> <1x32x1x256xf16, 8192x256x256x1>
20+
%14 = migraphx.where %8, %12, %13 : <1x32x1x256xsi8, 0x0x0x1>, <1x32x1x256xf16, 0x0x0x0>, <1x32x1x256xf16, 8192x256x256x1> -> <1x32x1x256xf16, 8192x256x256x1>
21+
// Softmax along axis 3: reduce_max, sub, exp, reduce_sum, div. The reshapes
// in between keep the same dims.
%15 = migraphx.reshape %14 {dims = [1, 32, 1, 256]} : <1x32x1x256xf16, 8192x256x256x1> -> <1x32x1x256xf16, 8192x256x256x1>
22+
%16 = migraphx.reduce_max %15 {axes = [3]} : <1x32x1x256xf16, 8192x256x256x1> -> <1x32x1x1xf16, 32x1x1x1>
23+
%17 = migraphx.reshape %16 {dims = [1, 32, 1, 1]} : <1x32x1x1xf16, 32x1x1x1> -> <1x32x1x1xf16, 32x1x1x1>
24+
%18 = migraphx.multibroadcast %17 {out_dyn_dims = [], out_lens = [1, 32, 1, 256]} : <1x32x1x1xf16, 32x1x1x1> -> <1x32x1x256xf16, 32x1x1x0>
25+
%19 = migraphx.sub %14, %18 : <1x32x1x256xf16, 8192x256x256x1>, <1x32x1x256xf16, 32x1x1x0> -> <1x32x1x256xf16, 8192x256x256x1>
26+
%20 = migraphx.exp %19 : <1x32x1x256xf16, 8192x256x256x1> -> <1x32x1x256xf16, 8192x256x256x1>
27+
%21 = migraphx.reshape %20 {dims = [1, 32, 1, 256]} : <1x32x1x256xf16, 8192x256x256x1> -> <1x32x1x256xf16, 8192x256x256x1>
28+
%22 = migraphx.reduce_sum %21 {axes = [3]} : <1x32x1x256xf16, 8192x256x256x1> -> <1x32x1x1xf16, 32x1x1x1>
29+
%23 = migraphx.reshape %22 {dims = [1, 32, 1, 1]} : <1x32x1x1xf16, 32x1x1x1> -> <1x32x1x1xf16, 32x1x1x1>
30+
%24 = migraphx.multibroadcast %23 {out_dyn_dims = [], out_lens = [1, 32, 1, 256]} : <1x32x1x1xf16, 32x1x1x1> -> <1x32x1x256xf16, 32x1x1x0>
31+
%25 = migraphx.div %20, %24 : <1x32x1x256xf16, 8192x256x256x1>, <1x32x1x256xf16, 32x1x1x0> -> <1x32x1x256xf16, 8192x256x256x1>
32+
// Second dot with %arg3, then transpose and reshape to the 1x1x4096 result.
%26 = migraphx.dot %25, %arg3 : <1x32x1x256xf16, 8192x256x256x1>, <1x32x256x128xf16, 1048576x32768x128x1> -> <1x32x1x128xf16, 4096x128x128x1>
33+
%27 = migraphx.transpose %26 {permutation = [0, 2, 1, 3]} : <1x32x1x128xf16, 4096x128x128x1> -> <1x1x32x128xf16, 4096x128x128x1>
34+
%28 = migraphx.reshape %27 {dims = [1, 1, 4096]} : <1x1x32x128xf16, 4096x128x128x1> -> <1x1x4096xf16, 4096x4096x1>
35+
return %28 : !migraphx.shaped<1x1x4096xf16, 4096x4096x1>
36+
}
37+
}
38+
Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,38 @@
1+
// RUN: rocmlir-gen -fut mlir_attention --arch %arch --clone-harness %s | rocmlir-driver -kernel-pipeline=migraphx,highlevel -host-pipeline=migraphx,highlevel | rocmlir-gen -ph -rand_min_int 0 -rand_max_int 1024 -rand_type_int_for_inputs=3 -rand 1 -rand_type float -fut mlir_attention_wrapper -RMS_threshold 0.01 --verifier clone - | rocmlir-driver -host-pipeline mhal -kernel-pipeline full -targets %arch | xmir-runner --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext,%conv_validation_wrapper_library_dir/libconv-validation-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext,%linalg_test_lib_dir/libmlir_float16_utils%shlibext,%linalg_test_lib_dir/libmlir_c_runner_utils%shlibext,%linalg_test_lib_dir/libmlir_async_runtime%shlibext --entry-point-result=void | FileCheck %s
2+
// CHECK: [1 1 1]
3+
4+
// MIGraphX-dialect attention graph: dot -> scale -> masked select ->
// softmax along axis 3 -> dot. In this module the scalar si32 argument that
// feeds the mask is %arg2. The result is reshaped to 1x1x4096 f16.
module {
5+
func.func @mlir_attention(%arg0: !migraphx.shaped<1x96x1x128xf16, 12288x128x128x1>, %arg1: !migraphx.shaped<1x32x256x128xf16, 1048576x32768x128x1>, %arg2: !migraphx.shaped<1x1x1xsi32, 1x1x1>, %arg3: !migraphx.shaped<1x32x256x128xf16, 1048576x32768x128x1>) -> !migraphx.shaped<1x1x4096xf16, 4096x4096x1> {
6+
// NOTE(review): this dense literal is malformed as shown -- the hex string
// that starts with 0x is never terminated and the closing of dense<...> is
// missing. The payload looks truncated; restore it from the original test.
%0 = migraphx.literal(dense<"0xtensor<256xsi32>) : <256xsi32, 1>
7+
// 0xFC00 is the IEEE half-precision bit pattern for -infinity; its broadcast
// (%7) is the first data operand of the migraphx.where below.
%1 = migraphx.literal(dense<0xFC00> : tensor<1xf16>) : <1xf16, 1>
8+
// 8.837890e-02 is approximately 1/sqrt(128); its broadcast (%8) multiplies the
// result of the first dot.
%2 = migraphx.literal(dense<8.837890e-02> : tensor<1xf16>) : <1xf16, 1>
9+
%3 = migraphx.multibroadcast %0 {out_dyn_dims = [], out_lens = [1, 1, 1, 256]} : <256xsi32, 1> -> <1x1x1x256xsi32, 0x0x0x1>
10+
// First dot: the first 32 slices of %arg0 along axis 1 times the transposed
// %arg1.
%4 = migraphx.slice %arg0 {axes = [1], ends = [32], starts = [0]} : <1x96x1x128xf16, 12288x128x128x1> -> <1x32x1x128xf16, 12288x128x128x1>
11+
%5 = migraphx.transpose %arg1 {permutation = [0, 1, 3, 2]} : <1x32x256x128xf16, 1048576x32768x128x1> -> <1x32x128x256xf16, 1048576x32768x1x128>
12+
%6 = migraphx.dot %4, %5 : <1x32x1x128xf16, 12288x128x128x1>, <1x32x128x256xf16, 1048576x32768x1x128> -> <1x32x1x256xf16, 8192x256x256x1>
13+
%7 = migraphx.multibroadcast %1 {out_dyn_dims = [], out_lens = [1, 32, 1, 256]} : <1xf16, 1> -> <1x32x1x256xf16, 0x0x0x0>
14+
%8 = migraphx.multibroadcast %2 {out_dyn_dims = [], out_lens = [1, 32, 1, 256]} : <1xf16, 1> -> <1x32x1x256xf16, 0x0x0x0>
15+
%9 = migraphx.mul %6, %8 : <1x32x1x256xf16, 8192x256x256x1>, <1x32x1x256xf16, 0x0x0x0> -> <1x32x1x256xf16, 8192x256x256x1>
16+
// Mask: the literal %0 is compared (greater) against the broadcast scalar
// %arg2, converted to si8, and broadcast to 1x32x1x256.
%10 = migraphx.broadcast %arg2 {axis = 0 : i64, out_lens = [1, 1, 1, 256]} : <1x1x1xsi32, 1x1x1> -> <1x1x1x256xsi32, 1x1x1x0>
17+
%11 = migraphx.greater %3, %10 : <1x1x1x256xsi32, 0x0x0x1>, <1x1x1x256xsi32, 1x1x1x0> -> <1x1x1x256xsi32, 0x0x0x1>
18+
%12 = migraphx.convert %11 {target_type = 0 : i64} : <1x1x1x256xsi32, 0x0x0x1> to <1x1x1x256xsi8, 0x0x0x1>
19+
%13 = migraphx.multibroadcast %12 {out_dyn_dims = [], out_lens = [1, 32, 1, 256]} : <1x1x1x256xsi8, 0x0x0x1> -> <1x32x1x256xsi8, 0x0x0x1>
20+
%14 = migraphx.where %13, %7, %9 : <1x32x1x256xsi8, 0x0x0x1>, <1x32x1x256xf16, 0x0x0x0>, <1x32x1x256xf16, 8192x256x256x1> -> <1x32x1x256xf16, 8192x256x256x1>
21+
// Softmax along axis 3: reduce_max, sub, exp, reduce_sum, div. The reshapes
// in between keep the same dims.
%15 = migraphx.reshape %14 {dims = [1, 32, 1, 256]} : <1x32x1x256xf16, 8192x256x256x1> -> <1x32x1x256xf16, 8192x256x256x1>
22+
%16 = migraphx.reduce_max %15 {axes = [3]} : <1x32x1x256xf16, 8192x256x256x1> -> <1x32x1x1xf16, 32x1x1x1>
23+
%17 = migraphx.reshape %16 {dims = [1, 32, 1, 1]} : <1x32x1x1xf16, 32x1x1x1> -> <1x32x1x1xf16, 32x1x1x1>
24+
%18 = migraphx.multibroadcast %17 {out_dyn_dims = [], out_lens = [1, 32, 1, 256]} : <1x32x1x1xf16, 32x1x1x1> -> <1x32x1x256xf16, 32x1x1x0>
25+
%19 = migraphx.sub %14, %18 : <1x32x1x256xf16, 8192x256x256x1>, <1x32x1x256xf16, 32x1x1x0> -> <1x32x1x256xf16, 8192x256x256x1>
26+
%20 = migraphx.exp %19 : <1x32x1x256xf16, 8192x256x256x1> -> <1x32x1x256xf16, 8192x256x256x1>
27+
%21 = migraphx.reshape %20 {dims = [1, 32, 1, 256]} : <1x32x1x256xf16, 8192x256x256x1> -> <1x32x1x256xf16, 8192x256x256x1>
28+
%22 = migraphx.reduce_sum %21 {axes = [3]} : <1x32x1x256xf16, 8192x256x256x1> -> <1x32x1x1xf16, 32x1x1x1>
29+
%23 = migraphx.reshape %22 {dims = [1, 32, 1, 1]} : <1x32x1x1xf16, 32x1x1x1> -> <1x32x1x1xf16, 32x1x1x1>
30+
%24 = migraphx.multibroadcast %23 {out_dyn_dims = [], out_lens = [1, 32, 1, 256]} : <1x32x1x1xf16, 32x1x1x1> -> <1x32x1x256xf16, 32x1x1x0>
31+
%25 = migraphx.div %20, %24 : <1x32x1x256xf16, 8192x256x256x1>, <1x32x1x256xf16, 32x1x1x0> -> <1x32x1x256xf16, 8192x256x256x1>
32+
// Second dot with %arg3, then transpose and reshape to the 1x1x4096 result.
%26 = migraphx.dot %25, %arg3 : <1x32x1x256xf16, 8192x256x256x1>, <1x32x256x128xf16, 1048576x32768x128x1> -> <1x32x1x128xf16, 4096x128x128x1>
33+
%27 = migraphx.transpose %26 {permutation = [0, 2, 1, 3]} : <1x32x1x128xf16, 4096x128x128x1> -> <1x1x32x128xf16, 4096x128x128x1>
34+
%28 = migraphx.reshape %27 {dims = [1, 1, 4096]} : <1x1x32x128xf16, 4096x128x128x1> -> <1x1x4096xf16, 4096x4096x1>
35+
return %28 : !migraphx.shaped<1x1x4096xf16, 4096x4096x1>
36+
}
37+
}
38+

0 commit comments

Comments
 (0)