From 455430f8ee6a31a09396d4623c5762fe349142bf Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 25 Jun 2025 12:36:14 +0000 Subject: [PATCH 1/7] Source kernel version for 09 tutorial from ARL-H machine Signed-off-by: Anatoly Myachev --- python/test/unit/intel/test_regressions.py | 230 +++++++++++++++++++++ 1 file changed, 230 insertions(+) diff --git a/python/test/unit/intel/test_regressions.py b/python/test/unit/intel/test_regressions.py index 604977f449..c45d893a8c 100644 --- a/python/test/unit/intel/test_regressions.py +++ b/python/test/unit/intel/test_regressions.py @@ -53,3 +53,233 @@ def test_regression_4441(device, tmp_path: pathlib.Path): module, function, n_regs, n_spills, n_max_threads = driver.active.utils.load_binary( kernel.name, kernel.kernel, kernel.metadata.shared, kernel.metadata.build_flags, not kernel.metadata.generate_native_code, device) + +def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path): + # although the kernel is taken from the arl-h machine, the problem with it is also reproduced on pvc + ir = """ +#blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}> +#loc = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0) +#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}> +#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}> +#smem = #ttg.shared_memory +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32, ttig.min_sg_size = 8 : i32, ttig.support_bf16_conversion, ttig.support_dpas, ttig.target_arch = "spir64"} { + tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0), %arg1: !tt.ptr {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0), %arg2: !tt.ptr {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0), %arg3: i32 {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0), %arg4: i32 {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0), %arg5: i32 {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0), %arg6: i32 {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0), %arg7: i32 {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0), %arg8: i32 {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0)) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked> loc(#loc1) + %c63_i32 = arith.constant 63 : i32 loc(#loc1) + %c127_i32 = arith.constant 127 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xf16, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x64xf16, #blocked2> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc1) + %cst_4 = arith.constant dense<64> : tensor<128x64xi32, #blocked2> loc(#loc1) + %cst_5 = arith.constant dense<64> : tensor<64x128xi32, #blocked1> loc(#loc1) + %0 = tt.get_program_id x : i32 loc(#loc2) + %1 = arith.addi %arg3, %c127_i32 : i32 loc(#loc58) + %2 = arith.divsi %1, %c128_i32 : i32 loc(#loc59) + %3 = arith.addi %arg4, %c127_i32 : i32 loc(#loc60) + %4 = arith.divsi %3, %c128_i32 : i32 loc(#loc61) + %5 = arith.muli %4, %c8_i32 : i32 loc(#loc7) + %6 = arith.divsi %0, %5 : i32 loc(#loc8) + %7 = arith.muli %6, %c8_i32 : i32 loc(#loc9) + %8 = arith.subi %2, %7 : i32 loc(#loc10) + %9 = arith.minsi %8, %c8_i32 : i32 loc(#loc11) + %10 = arith.remsi %0, %9 : i32 loc(#loc12) + %11 = arith.addi %7, %10 : i32 loc(#loc13) + %12 = arith.remsi %0, %5 : i32 loc(#loc14) + %13 = arith.divsi %12, %9 : i32 loc(#loc15) + %14 = arith.muli %11, %c128_i32 : i32 loc(#loc16) + %15 = arith.muli %13, %c128_i32 : i32 loc(#loc17) + %16 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc18) + %17 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc18) + %18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc18) + %19 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> loc(#loc18) + %20 = tt.splat %14 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc19) + %21 = tt.splat %14 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc19) + %22 = arith.addi %20, %16 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc19) + %23 = arith.addi %21, %17 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc19) + %24 = tt.splat %15 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc20) + %25 = tt.splat %15 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> loc(#loc20) + %26 = arith.addi %24, %18 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc20) + %27 = arith.addi %25, %19 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> loc(#loc20) + %28 = tt.splat %arg3 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc21) + %29 = arith.cmpi slt, %22, %28 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc21) + %30 = arith.select %29, %22, %cst_2 {tt.contiguity = dense<128> : tensor<1xi32>, tt.divisibility = dense<128> : tensor<1xi32>} : tensor<128xi1, #ttg.slice<{dim = 1, parent = #blocked2}>>, tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc22) + %31 = tt.splat %arg4 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc23) + %32 = arith.cmpi slt, %26, %31 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc23) + %33 = arith.select %32, %26, %cst_3 {tt.contiguity = dense<128> : tensor<1xi32>, tt.divisibility = dense<128> : tensor<1xi32>} : tensor<128xi1, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc24) + %34 = tt.expand_dims %30 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<128x1xi32, #blocked2> loc(#loc25) + %35 = tt.splat %arg6 : i32 -> tensor<128x1xi32, #blocked2> loc(#loc26) + %36 = arith.muli %34, %35 : tensor<128x1xi32, #blocked2> loc(#loc26) + %37 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> loc(#loc27) + %38 = tt.expand_dims %37 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x64xi32, #blocked2> loc(#loc27) + %39 = tt.broadcast %36 : tensor<128x1xi32, #blocked2> -> tensor<128x64xi32, #blocked2> loc(#loc28) + %40 = tt.broadcast %38 : tensor<1x64xi32, #blocked2> -> tensor<128x64xi32, #blocked2> loc(#loc28) + %41 = arith.addi %39, %40 : tensor<128x64xi32, #blocked2> loc(#loc28) + %42 = tt.splat %arg0 : !tt.ptr -> tensor<128x64x!tt.ptr, #blocked2> loc(#loc29) + %43 = tt.addptr %42, %41 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> loc(#loc29) + %44 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc30) + %45 = tt.expand_dims %44 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc30) + %46 = tt.expand_dims %33 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> loc(#loc31) + %47 = tt.splat %arg7 : i32 -> tensor<1x128xi32, #blocked1> loc(#loc32) + %48 = arith.muli %46, %47 : tensor<1x128xi32, #blocked1> loc(#loc32) + %49 = tt.broadcast %45 : tensor<64x1xi32, #blocked1> -> tensor<64x128xi32, #blocked1> loc(#loc33) + %50 = tt.broadcast %48 : tensor<1x128xi32, #blocked1> -> tensor<64x128xi32, #blocked1> loc(#loc33) + %51 = arith.addi %49, %50 : tensor<64x128xi32, #blocked1> loc(#loc33) + %52 = tt.splat %arg1 : !tt.ptr -> tensor<64x128x!tt.ptr, #blocked1> loc(#loc34) + %53 = tt.addptr %52, %51 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> loc(#loc34) + %54 = arith.addi %arg5, %c63_i32 : i32 loc(#loc62) + %55 = arith.divsi %54, %c64_i32 : i32 loc(#loc63) + %56 = arith.remsi %arg5, %c64_i32 : i32 loc(#loc36) + %57 = arith.cmpi eq, %56, %c0_i32 : i32 loc(#loc36) + %58 = arith.cmpi sgt, %arg5, %c64_i32 : i32 loc(#loc36) + %59 = arith.andi %57, %58 : i1 loc(#loc36) + %60 = scf.if %59 -> (tensor<128x128xf32, #blocked>) { + %79:3 = scf.for %arg9 = %c0_i32 to %55 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %43, %arg12 = %53) -> (tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr, #blocked2>, tensor<64x128x!tt.ptr, #blocked1>) : i32 { + %80 = tt.load %arg11 : tensor<128x64x!tt.ptr, #blocked2> loc(#loc37) + %81 = tt.load %arg12 : tensor<64x128x!tt.ptr, #blocked1> loc(#loc38) + %82 = tt.fp_to_fp %80 : tensor<128x64xf16, #blocked2> -> tensor<128x64xf32, #blocked2> loc(#loc39) + %83 = ttg.local_alloc %82 : (tensor<128x64xf32, #blocked2>) -> !ttg.memdesc<128x64xf32, #shared, #smem> loc(#loc39) + %84 = ttg.local_load %83 : !ttg.memdesc<128x64xf32, #shared, #smem> -> tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> loc(#loc39) + %85 = tt.fp_to_fp %81 : tensor<64x128xf16, #blocked1> -> tensor<64x128xf32, #blocked1> loc(#loc39) + %86 = ttg.local_alloc %85 : (tensor<64x128xf32, #blocked1>) -> !ttg.memdesc<64x128xf32, #shared1, #smem> loc(#loc39) + %87 = ttg.local_load %86 : !ttg.memdesc<64x128xf32, #shared1, #smem> -> tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> loc(#loc39) + %88 = tt.dot %84, %87, %arg10, inputPrecision = tf32 : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked> loc(#loc39) + %89 = tt.addptr %arg11, %cst_4 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> loc(#loc40) + %90 = tt.addptr %arg12, %cst_5 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> loc(#loc41) + scf.yield %88, %89, %90 : tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr, #blocked2>, tensor<64x128x!tt.ptr, #blocked1> loc(#loc42) + } loc(#loc36) + scf.yield %79#0 : tensor<128x128xf32, #blocked> loc(#loc36) + } else { + %79:3 = scf.for %arg9 = %c0_i32 to %55 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %43, %arg12 = %53) -> (tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr, #blocked2>, tensor<64x128x!tt.ptr, #blocked1>) : i32 { + %80 = arith.muli %arg9, %c64_i32 : i32 loc(#loc43) + %81 = arith.subi %arg5, %80 : i32 loc(#loc44) + %82 = tt.splat %81 : i32 -> tensor<1x64xi32, #blocked2> loc(#loc45) + %83 = arith.cmpi slt, %38, %82 : tensor<1x64xi32, #blocked2> loc(#loc45) + %84 = tt.broadcast %83 : tensor<1x64xi1, #blocked2> -> tensor<128x64xi1, #blocked2> loc(#loc37) + %85 = tt.load %arg11, %84, %cst_1 : tensor<128x64x!tt.ptr, #blocked2> loc(#loc37) + %86 = tt.splat %81 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc46) + %87 = arith.cmpi slt, %45, %86 : tensor<64x1xi32, #blocked1> loc(#loc46) + %88 = tt.broadcast %87 : tensor<64x1xi1, #blocked1> -> tensor<64x128xi1, #blocked1> loc(#loc38) + %89 = tt.load %arg12, %88, %cst_0 : tensor<64x128x!tt.ptr, #blocked1> loc(#loc38) + %90 = tt.fp_to_fp %85 : tensor<128x64xf16, #blocked2> -> tensor<128x64xf32, #blocked2> loc(#loc39) + %91 = ttg.local_alloc %90 : (tensor<128x64xf32, #blocked2>) -> !ttg.memdesc<128x64xf32, #shared, #smem> loc(#loc39) + %92 = ttg.local_load %91 : !ttg.memdesc<128x64xf32, #shared, #smem> -> tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> loc(#loc39) + %93 = tt.fp_to_fp %89 : tensor<64x128xf16, #blocked1> -> tensor<64x128xf32, #blocked1> loc(#loc39) + %94 = ttg.local_alloc %93 : (tensor<64x128xf32, #blocked1>) -> !ttg.memdesc<64x128xf32, #shared1, #smem> loc(#loc39) + %95 = ttg.local_load %94 : !ttg.memdesc<64x128xf32, #shared1, #smem> -> tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> loc(#loc39) + %96 = tt.dot %92, %95, %arg10, inputPrecision = tf32 : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked> loc(#loc39) + %97 = tt.addptr %arg11, %cst_4 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> loc(#loc40) + %98 = tt.addptr %arg12, %cst_5 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> loc(#loc41) + scf.yield %96, %97, %98 : tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr, #blocked2>, tensor<64x128x!tt.ptr, #blocked1> loc(#loc42) + } loc(#loc36) + scf.yield %79#0 : tensor<128x128xf32, #blocked> loc(#loc36) + } loc(#loc36) + %61 = arith.truncf %60 : tensor<128x128xf32, #blocked> to tensor<128x128xf16, #blocked> loc(#loc47) + %62 = tt.expand_dims %23 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1xi32, #blocked3> loc(#loc48) + %63 = tt.splat %arg8 : i32 -> tensor<128x1xi32, #blocked3> loc(#loc49) + %64 = arith.muli %63, %62 : tensor<128x1xi32, #blocked3> loc(#loc49) + %65 = tt.splat %arg2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked3> loc(#loc50) + %66 = tt.addptr %65, %64 : tensor<128x1x!tt.ptr, #blocked3>, tensor<128x1xi32, #blocked3> loc(#loc50) + %67 = tt.expand_dims %27 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> -> tensor<1x128xi32, #blocked3> loc(#loc51) + %68 = tt.broadcast %66 : tensor<128x1x!tt.ptr, #blocked3> -> tensor<128x128x!tt.ptr, #blocked3> loc(#loc52) + %69 = tt.broadcast %67 : tensor<1x128xi32, #blocked3> -> tensor<128x128xi32, #blocked3> loc(#loc52) + %70 = tt.addptr %68, %69 : tensor<128x128x!tt.ptr, #blocked3>, tensor<128x128xi32, #blocked3> loc(#loc52) + %71 = tt.splat %arg3 : i32 -> tensor<128x1xi32, #blocked3> loc(#loc53) + %72 = arith.cmpi slt, %62, %71 : tensor<128x1xi32, #blocked3> loc(#loc53) + %73 = tt.splat %arg4 : i32 -> tensor<1x128xi32, #blocked3> loc(#loc54) + %74 = arith.cmpi slt, %67, %73 : tensor<1x128xi32, #blocked3> loc(#loc54) + %75 = tt.broadcast %72 : tensor<128x1xi1, #blocked3> -> tensor<128x128xi1, #blocked3> loc(#loc55) + %76 = tt.broadcast %74 : tensor<1x128xi1, #blocked3> -> tensor<128x128xi1, #blocked3> loc(#loc55) + %77 = arith.andi %75, %76 : tensor<128x128xi1, #blocked3> loc(#loc55) + %78 = ttg.convert_layout %61 : tensor<128x128xf16, #blocked> -> tensor<128x128xf16, #blocked3> loc(#loc56) + tt.store %70, %78, %77 : tensor<128x128x!tt.ptr, #blocked3> loc(#loc56) + tt.return loc(#loc57) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":136:24) +#loc3 = loc("/home/runner/intel-xpu-backend-for-triton/python/triton/language/standard.py":40:22) +#loc4 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":137:27) +#loc5 = loc("/home/runner/intel-xpu-backend-for-triton/python/triton/language/standard.py":40:28) +#loc6 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":138:27) +#loc7 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":139:38) +#loc8 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":140:22) +#loc9 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":141:29) +#loc10 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":142:35) +#loc11 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":142:48) +#loc12 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":143:33) +#loc13 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":143:27) +#loc14 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":144:19) +#loc15 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":144:40) +#loc16 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":146:22) +#loc17 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":147:22) +#loc18 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":149:37) +#loc19 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":149:24) +#loc20 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":150:24) +#loc21 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":151:33) +#loc22 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":151:45) +#loc23 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":152:33) +#loc24 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":152:45) +#loc25 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":157:30) +#loc26 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":157:41) +#loc27 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":157:60) +#loc28 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":157:53) +#loc29 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":157:22) +#loc30 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":158:29) +#loc31 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":158:60) +#loc32 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":158:71) +#loc33 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":158:52) +#loc34 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":158:22) +#loc35 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":162:33) +#loc36 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":162:22) +#loc37 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":163:20) +#loc38 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":164:20) +#loc39 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":165:35) +#loc40 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":166:18) +#loc41 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":167:18) +#loc42 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":167:8) +#loc43 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":163:59) +#loc44 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":163:55) +#loc45 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":163:51) +#loc46 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":164:51) +#loc47 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":172:27) +#loc48 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":176:41) +#loc49 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":176:33) +#loc50 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":176:21) +#loc51 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":176:72) +#loc52 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":176:52) +#loc53 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":177:33) +#loc54 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":177:58) +#loc55 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":177:39) +#loc56 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":178:21) +#loc57 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":178:4) +#loc58 = loc(callsite(#loc3 at #loc4)) +#loc59 = loc(callsite(#loc5 at #loc4)) +#loc60 = loc(callsite(#loc3 at #loc6)) +#loc61 = loc(callsite(#loc5 at #loc6)) +#loc62 = loc(callsite(#loc3 at #loc35)) +#loc63 = loc(callsite(#loc5 at #loc35)) + """ + + temp_file = tmp_path / "test_regression_4441.ttgir" + temp_file.write_text(ir) + kernel = triton.compile(str(temp_file)) + + from triton.runtime.driver import driver + device = driver.active.get_current_device() + + # try to catch: + # L0 build module failed. Log: IGC: Internal Compiler Error: Segmentation violation + # Error during Intel loadBinary: Triton Error [ZE]: 0x70000004 + # RuntimeError: Triton Error [ZE]: 0x70000004 + module, function, n_regs, n_spills, n_max_threads = driver.active.utils.load_binary( + kernel.name, kernel.kernel, kernel.metadata.shared, kernel.metadata.build_flags, + not kernel.metadata.generate_native_code, device) From b008c9bab425bbd06f8736fbc1052ca22f59b55d Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 25 Jun 2025 12:38:50 +0000 Subject: [PATCH 2/7] remove 'loc' Signed-off-by: Anatoly Myachev --- python/test/unit/intel/test_regressions.py | 335 +++++++++------------ 1 file changed, 136 insertions(+), 199 deletions(-) diff --git a/python/test/unit/intel/test_regressions.py b/python/test/unit/intel/test_regressions.py index c45d893a8c..36d8242ad4 100644 --- a/python/test/unit/intel/test_regressions.py +++ b/python/test/unit/intel/test_regressions.py @@ -54,6 +54,7 @@ def test_regression_4441(device, tmp_path: pathlib.Path): kernel.name, kernel.kernel, kernel.metadata.shared, kernel.metadata.build_flags, not kernel.metadata.generate_native_code, device) + def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path): # although the kernel is taken from the arl-h machine, the problem with it is also reproduced on pvc ir = """ @@ -61,215 +62,151 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path): #blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> #blocked2 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked3 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}> -#loc = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0) #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}> #shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}> #smem = #ttg.shared_memory module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32, ttig.min_sg_size = 8 : i32, ttig.support_bf16_conversion, ttig.support_dpas, ttig.target_arch = "spir64"} { - tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0), %arg1: !tt.ptr {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0), %arg2: !tt.ptr {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0), %arg3: i32 {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0), %arg4: i32 {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0), %arg5: i32 {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0), %arg6: i32 {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0), %arg7: i32 {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0), %arg8: i32 {tt.divisibility = 16 : i32} loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":126:0)) attributes {noinline = false} { - %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked> loc(#loc1) - %c63_i32 = arith.constant 63 : i32 loc(#loc1) - %c127_i32 = arith.constant 127 : i32 loc(#loc1) - %c1_i32 = arith.constant 1 : i32 loc(#loc1) - %c0_i32 = arith.constant 0 : i32 loc(#loc1) - %c64_i32 = arith.constant 64 : i32 loc(#loc1) - %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xf16, #blocked1> loc(#loc1) - %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x64xf16, #blocked2> loc(#loc1) - %c8_i32 = arith.constant 8 : i32 loc(#loc1) - %c128_i32 = arith.constant 128 : i32 loc(#loc1) - %cst_2 = arith.constant dense<0> : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc1) - %cst_3 = arith.constant dense<0> : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc1) - %cst_4 = arith.constant dense<64> : tensor<128x64xi32, #blocked2> loc(#loc1) - %cst_5 = arith.constant dense<64> : tensor<64x128xi32, #blocked1> loc(#loc1) - %0 = tt.get_program_id x : i32 loc(#loc2) - %1 = arith.addi %arg3, %c127_i32 : i32 loc(#loc58) - %2 = arith.divsi %1, %c128_i32 : i32 loc(#loc59) - %3 = arith.addi %arg4, %c127_i32 : i32 loc(#loc60) - %4 = arith.divsi %3, %c128_i32 : i32 loc(#loc61) - %5 = arith.muli %4, %c8_i32 : i32 loc(#loc7) - %6 = arith.divsi %0, %5 : i32 loc(#loc8) - %7 = arith.muli %6, %c8_i32 : i32 loc(#loc9) - %8 = arith.subi %2, %7 : i32 loc(#loc10) - %9 = arith.minsi %8, %c8_i32 : i32 loc(#loc11) - %10 = arith.remsi %0, %9 : i32 loc(#loc12) - %11 = arith.addi %7, %10 : i32 loc(#loc13) - %12 = arith.remsi %0, %5 : i32 loc(#loc14) - %13 = arith.divsi %12, %9 : i32 loc(#loc15) - %14 = arith.muli %11, %c128_i32 : i32 loc(#loc16) - %15 = arith.muli %13, %c128_i32 : i32 loc(#loc17) - %16 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc18) - %17 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc18) - %18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc18) - %19 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> loc(#loc18) - %20 = tt.splat %14 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc19) - %21 = tt.splat %14 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc19) - %22 = arith.addi %20, %16 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc19) - %23 = arith.addi %21, %17 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc19) - %24 = tt.splat %15 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc20) - %25 = tt.splat %15 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> loc(#loc20) - %26 = arith.addi %24, %18 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc20) - %27 = arith.addi %25, %19 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> loc(#loc20) - %28 = tt.splat %arg3 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc21) - %29 = arith.cmpi slt, %22, %28 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc21) - %30 = arith.select %29, %22, %cst_2 {tt.contiguity = dense<128> : tensor<1xi32>, tt.divisibility = dense<128> : tensor<1xi32>} : tensor<128xi1, #ttg.slice<{dim = 1, parent = #blocked2}>>, tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc22) - %31 = tt.splat %arg4 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc23) - %32 = arith.cmpi slt, %26, %31 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc23) - %33 = arith.select %32, %26, %cst_3 {tt.contiguity = dense<128> : tensor<1xi32>, tt.divisibility = dense<128> : tensor<1xi32>} : tensor<128xi1, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc24) - %34 = tt.expand_dims %30 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<128x1xi32, #blocked2> loc(#loc25) - %35 = tt.splat %arg6 : i32 -> tensor<128x1xi32, #blocked2> loc(#loc26) - %36 = arith.muli %34, %35 : tensor<128x1xi32, #blocked2> loc(#loc26) - %37 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> loc(#loc27) - %38 = tt.expand_dims %37 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x64xi32, #blocked2> loc(#loc27) - %39 = tt.broadcast %36 : tensor<128x1xi32, #blocked2> -> tensor<128x64xi32, #blocked2> loc(#loc28) - %40 = tt.broadcast %38 : tensor<1x64xi32, #blocked2> -> tensor<128x64xi32, #blocked2> loc(#loc28) - %41 = arith.addi %39, %40 : tensor<128x64xi32, #blocked2> loc(#loc28) - %42 = tt.splat %arg0 : !tt.ptr -> tensor<128x64x!tt.ptr, #blocked2> loc(#loc29) - %43 = tt.addptr %42, %41 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> loc(#loc29) - %44 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc30) - %45 = tt.expand_dims %44 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc30) - %46 = tt.expand_dims %33 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> loc(#loc31) - %47 = tt.splat %arg7 : i32 -> tensor<1x128xi32, #blocked1> loc(#loc32) - %48 = arith.muli %46, %47 : tensor<1x128xi32, #blocked1> loc(#loc32) - %49 = tt.broadcast %45 : tensor<64x1xi32, #blocked1> -> tensor<64x128xi32, #blocked1> loc(#loc33) - %50 = tt.broadcast %48 : tensor<1x128xi32, #blocked1> -> tensor<64x128xi32, #blocked1> loc(#loc33) - %51 = arith.addi %49, %50 : tensor<64x128xi32, #blocked1> loc(#loc33) - %52 = tt.splat %arg1 : !tt.ptr -> tensor<64x128x!tt.ptr, #blocked1> loc(#loc34) - %53 = tt.addptr %52, %51 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> loc(#loc34) - %54 = arith.addi %arg5, %c63_i32 : i32 loc(#loc62) - %55 = arith.divsi %54, %c64_i32 : i32 loc(#loc63) - %56 = arith.remsi %arg5, %c64_i32 : i32 loc(#loc36) - %57 = arith.cmpi eq, %56, %c0_i32 : i32 loc(#loc36) - %58 = arith.cmpi sgt, %arg5, %c64_i32 : i32 loc(#loc36) - %59 = arith.andi %57, %58 : i1 loc(#loc36) + tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32} ) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked> + %c63_i32 = arith.constant 63 : i32 + %c127_i32 = arith.constant 127 : i32 + %c1_i32 = arith.constant 1 : i32 + %c0_i32 = arith.constant 0 : i32 + %c64_i32 = arith.constant 64 : i32 + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xf16, #blocked1> + %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x64xf16, #blocked2> + %c8_i32 = arith.constant 8 : i32 + %c128_i32 = arith.constant 128 : i32 + %cst_2 = arith.constant dense<0> : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> + %cst_3 = arith.constant dense<0> : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> + %cst_4 = arith.constant dense<64> : tensor<128x64xi32, #blocked2> + %cst_5 = arith.constant dense<64> : tensor<64x128xi32, #blocked1> + %0 = tt.get_program_id x : i32 + %1 = arith.addi %arg3, %c127_i32 : i32 + %2 = arith.divsi %1, %c128_i32 : i32 + %3 = arith.addi %arg4, %c127_i32 : i32 + %4 = arith.divsi %3, %c128_i32 : i32 + %5 = arith.muli %4, %c8_i32 : i32 + %6 = arith.divsi %0, %5 : i32 + %7 = arith.muli %6, %c8_i32 : i32 + %8 = arith.subi %2, %7 : i32 + %9 = arith.minsi %8, %c8_i32 : i32 + %10 = arith.remsi %0, %9 : i32 + %11 = arith.addi %7, %10 : i32 + %12 = arith.remsi %0, %5 : i32 + %13 = arith.divsi %12, %9 : i32 + %14 = arith.muli %11, %c128_i32 : i32 + %15 = arith.muli %13, %c128_i32 : i32 + %16 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> + %17 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> + %18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> + %19 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> + %20 = tt.splat %14 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> + %21 = tt.splat %14 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> + %22 = arith.addi %20, %16 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> + %23 = arith.addi %21, %17 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> + %24 = tt.splat %15 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> + %25 = tt.splat %15 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> + %26 = arith.addi %24, %18 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> + %27 = arith.addi %25, %19 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> + %28 = tt.splat %arg3 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> + %29 = arith.cmpi slt, %22, %28 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> + %30 = arith.select %29, %22, %cst_2 {tt.contiguity = dense<128> : tensor<1xi32>, tt.divisibility = dense<128> : tensor<1xi32>} : tensor<128xi1, #ttg.slice<{dim = 1, parent = #blocked2}>>, tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> + %31 = tt.splat %arg4 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> + %32 = arith.cmpi slt, %26, %31 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> + %33 = arith.select %32, %26, %cst_3 {tt.contiguity = dense<128> : tensor<1xi32>, tt.divisibility = dense<128> : tensor<1xi32>} : tensor<128xi1, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> + %34 = tt.expand_dims %30 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<128x1xi32, #blocked2> + %35 = tt.splat %arg6 : i32 -> tensor<128x1xi32, #blocked2> + %36 = arith.muli %34, %35 : tensor<128x1xi32, #blocked2> + %37 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> + %38 = tt.expand_dims %37 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x64xi32, #blocked2> + %39 = tt.broadcast %36 : tensor<128x1xi32, #blocked2> -> tensor<128x64xi32, #blocked2> + %40 = tt.broadcast %38 : tensor<1x64xi32, #blocked2> -> tensor<128x64xi32, #blocked2> + %41 = arith.addi %39, %40 : tensor<128x64xi32, #blocked2> + %42 = tt.splat %arg0 : !tt.ptr -> tensor<128x64x!tt.ptr, #blocked2> + %43 = tt.addptr %42, %41 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> + %44 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> + %45 = tt.expand_dims %44 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> + %46 = tt.expand_dims %33 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> + %47 = tt.splat %arg7 : i32 -> tensor<1x128xi32, #blocked1> + %48 = arith.muli %46, %47 : tensor<1x128xi32, #blocked1> + %49 = tt.broadcast %45 : tensor<64x1xi32, #blocked1> -> tensor<64x128xi32, #blocked1> + %50 = tt.broadcast %48 : tensor<1x128xi32, #blocked1> -> tensor<64x128xi32, #blocked1> + %51 = arith.addi %49, %50 : tensor<64x128xi32, #blocked1> + %52 = tt.splat %arg1 : !tt.ptr -> tensor<64x128x!tt.ptr, #blocked1> + %53 = tt.addptr %52, %51 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> + %54 = arith.addi %arg5, %c63_i32 : i32 + %55 = arith.divsi %54, %c64_i32 : i32 + %56 = arith.remsi %arg5, %c64_i32 : i32 + %57 = arith.cmpi eq, %56, %c0_i32 : i32 + %58 = arith.cmpi sgt, %arg5, %c64_i32 : i32 + %59 = arith.andi %57, %58 : i1 %60 = scf.if %59 -> (tensor<128x128xf32, #blocked>) { %79:3 = scf.for %arg9 = %c0_i32 to %55 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %43, %arg12 = %53) -> (tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr, #blocked2>, tensor<64x128x!tt.ptr, #blocked1>) : i32 { - %80 = tt.load %arg11 : tensor<128x64x!tt.ptr, #blocked2> loc(#loc37) - %81 = tt.load %arg12 : tensor<64x128x!tt.ptr, #blocked1> loc(#loc38) - %82 = tt.fp_to_fp %80 : tensor<128x64xf16, #blocked2> -> tensor<128x64xf32, #blocked2> loc(#loc39) - %83 = ttg.local_alloc %82 : (tensor<128x64xf32, #blocked2>) -> !ttg.memdesc<128x64xf32, #shared, #smem> loc(#loc39) - %84 = ttg.local_load %83 : !ttg.memdesc<128x64xf32, #shared, #smem> -> tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> loc(#loc39) - %85 = tt.fp_to_fp %81 : tensor<64x128xf16, #blocked1> -> tensor<64x128xf32, #blocked1> loc(#loc39) - %86 = ttg.local_alloc %85 : (tensor<64x128xf32, #blocked1>) -> !ttg.memdesc<64x128xf32, #shared1, #smem> loc(#loc39) - %87 = ttg.local_load %86 : !ttg.memdesc<64x128xf32, #shared1, #smem> -> tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> loc(#loc39) - %88 = tt.dot %84, %87, %arg10, inputPrecision = tf32 : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked> loc(#loc39) - %89 = tt.addptr %arg11, %cst_4 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> loc(#loc40) - %90 = tt.addptr %arg12, %cst_5 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> loc(#loc41) - scf.yield %88, %89, %90 : tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr, #blocked2>, tensor<64x128x!tt.ptr, #blocked1> loc(#loc42) - } loc(#loc36) - scf.yield %79#0 : tensor<128x128xf32, #blocked> loc(#loc36) + %80 = tt.load %arg11 : tensor<128x64x!tt.ptr, #blocked2> + %81 = tt.load %arg12 : tensor<64x128x!tt.ptr, #blocked1> + %82 = tt.fp_to_fp %80 : tensor<128x64xf16, #blocked2> -> tensor<128x64xf32, #blocked2> + %83 = ttg.local_alloc %82 : (tensor<128x64xf32, #blocked2>) -> !ttg.memdesc<128x64xf32, #shared, #smem> + %84 = ttg.local_load %83 : !ttg.memdesc<128x64xf32, #shared, #smem> -> tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> + %85 = tt.fp_to_fp %81 : tensor<64x128xf16, #blocked1> -> tensor<64x128xf32, #blocked1> + %86 = ttg.local_alloc %85 : (tensor<64x128xf32, #blocked1>) -> !ttg.memdesc<64x128xf32, #shared1, #smem> + %87 = ttg.local_load %86 : !ttg.memdesc<64x128xf32, #shared1, #smem> -> tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> + %88 = tt.dot %84, %87, %arg10, inputPrecision = tf32 : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked> + %89 = tt.addptr %arg11, %cst_4 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> + %90 = tt.addptr %arg12, %cst_5 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> + scf.yield %88, %89, %90 : tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr, #blocked2>, tensor<64x128x!tt.ptr, #blocked1> + } + scf.yield %79#0 : tensor<128x128xf32, #blocked> } else { %79:3 = scf.for %arg9 = %c0_i32 to %55 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %43, %arg12 = %53) -> (tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr, #blocked2>, tensor<64x128x!tt.ptr, #blocked1>) : i32 { - %80 = arith.muli %arg9, %c64_i32 : i32 loc(#loc43) - %81 = arith.subi %arg5, %80 : i32 loc(#loc44) - %82 = tt.splat %81 : i32 -> tensor<1x64xi32, #blocked2> loc(#loc45) - %83 = arith.cmpi slt, %38, %82 : tensor<1x64xi32, #blocked2> loc(#loc45) - %84 = tt.broadcast %83 : tensor<1x64xi1, #blocked2> -> tensor<128x64xi1, #blocked2> loc(#loc37) - %85 = tt.load %arg11, %84, %cst_1 : tensor<128x64x!tt.ptr, #blocked2> loc(#loc37) - %86 = tt.splat %81 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc46) - %87 = arith.cmpi slt, %45, %86 : tensor<64x1xi32, #blocked1> loc(#loc46) - %88 = tt.broadcast %87 : tensor<64x1xi1, #blocked1> -> tensor<64x128xi1, #blocked1> loc(#loc38) - %89 = tt.load %arg12, %88, %cst_0 : tensor<64x128x!tt.ptr, #blocked1> loc(#loc38) - %90 = tt.fp_to_fp %85 : tensor<128x64xf16, #blocked2> -> tensor<128x64xf32, #blocked2> loc(#loc39) - %91 = ttg.local_alloc %90 : (tensor<128x64xf32, #blocked2>) -> !ttg.memdesc<128x64xf32, #shared, #smem> loc(#loc39) - %92 = ttg.local_load %91 : !ttg.memdesc<128x64xf32, #shared, #smem> -> tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> loc(#loc39) - %93 = tt.fp_to_fp %89 : tensor<64x128xf16, #blocked1> -> tensor<64x128xf32, #blocked1> loc(#loc39) - %94 = ttg.local_alloc %93 : (tensor<64x128xf32, #blocked1>) -> !ttg.memdesc<64x128xf32, #shared1, #smem> loc(#loc39) - %95 = ttg.local_load %94 : !ttg.memdesc<64x128xf32, #shared1, #smem> -> tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> loc(#loc39) - %96 = tt.dot %92, %95, %arg10, inputPrecision = tf32 : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked> loc(#loc39) - %97 = tt.addptr %arg11, %cst_4 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> loc(#loc40) - %98 = tt.addptr %arg12, %cst_5 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> loc(#loc41) - scf.yield %96, %97, %98 : tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr, #blocked2>, tensor<64x128x!tt.ptr, #blocked1> loc(#loc42) - } loc(#loc36) - scf.yield %79#0 : tensor<128x128xf32, #blocked> loc(#loc36) - } loc(#loc36) - %61 = arith.truncf %60 : tensor<128x128xf32, #blocked> to tensor<128x128xf16, #blocked> loc(#loc47) - %62 = tt.expand_dims %23 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1xi32, #blocked3> loc(#loc48) - %63 = tt.splat %arg8 : i32 -> tensor<128x1xi32, #blocked3> loc(#loc49) - %64 = arith.muli %63, %62 : tensor<128x1xi32, #blocked3> loc(#loc49) - %65 = tt.splat %arg2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked3> loc(#loc50) - %66 = tt.addptr %65, %64 : tensor<128x1x!tt.ptr, #blocked3>, tensor<128x1xi32, #blocked3> loc(#loc50) - %67 = tt.expand_dims %27 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> -> tensor<1x128xi32, #blocked3> loc(#loc51) - %68 = tt.broadcast %66 : tensor<128x1x!tt.ptr, #blocked3> -> tensor<128x128x!tt.ptr, #blocked3> loc(#loc52) - %69 = tt.broadcast %67 : tensor<1x128xi32, #blocked3> -> tensor<128x128xi32, #blocked3> loc(#loc52) - %70 = tt.addptr %68, %69 : tensor<128x128x!tt.ptr, #blocked3>, tensor<128x128xi32, #blocked3> loc(#loc52) - %71 = tt.splat %arg3 : i32 -> tensor<128x1xi32, #blocked3> loc(#loc53) - %72 = arith.cmpi slt, %62, %71 : tensor<128x1xi32, #blocked3> loc(#loc53) - %73 = tt.splat %arg4 : i32 -> tensor<1x128xi32, #blocked3> loc(#loc54) - %74 = arith.cmpi slt, %67, %73 : tensor<1x128xi32, #blocked3> loc(#loc54) - %75 = tt.broadcast %72 : tensor<128x1xi1, #blocked3> -> tensor<128x128xi1, #blocked3> loc(#loc55) - %76 = tt.broadcast %74 : tensor<1x128xi1, #blocked3> -> tensor<128x128xi1, #blocked3> loc(#loc55) - %77 = arith.andi %75, %76 : tensor<128x128xi1, #blocked3> loc(#loc55) - %78 = ttg.convert_layout %61 : tensor<128x128xf16, #blocked> -> tensor<128x128xf16, #blocked3> loc(#loc56) - tt.store %70, %78, %77 : tensor<128x128x!tt.ptr, #blocked3> loc(#loc56) - tt.return loc(#loc57) - } loc(#loc) -} loc(#loc) -#loc1 = loc(unknown) -#loc2 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":136:24) -#loc3 = loc("/home/runner/intel-xpu-backend-for-triton/python/triton/language/standard.py":40:22) -#loc4 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":137:27) -#loc5 = loc("/home/runner/intel-xpu-backend-for-triton/python/triton/language/standard.py":40:28) -#loc6 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":138:27) -#loc7 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":139:38) -#loc8 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":140:22) -#loc9 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":141:29) -#loc10 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":142:35) -#loc11 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":142:48) -#loc12 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":143:33) -#loc13 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":143:27) -#loc14 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":144:19) -#loc15 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":144:40) -#loc16 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":146:22) -#loc17 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":147:22) -#loc18 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":149:37) -#loc19 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":149:24) -#loc20 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":150:24) -#loc21 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":151:33) -#loc22 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":151:45) -#loc23 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":152:33) -#loc24 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":152:45) -#loc25 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":157:30) -#loc26 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":157:41) -#loc27 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":157:60) -#loc28 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":157:53) -#loc29 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":157:22) -#loc30 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":158:29) -#loc31 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":158:60) -#loc32 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":158:71) -#loc33 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":158:52) -#loc34 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":158:22) -#loc35 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":162:33) -#loc36 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":162:22) -#loc37 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":163:20) -#loc38 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":164:20) -#loc39 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":165:35) -#loc40 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":166:18) -#loc41 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":167:18) -#loc42 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":167:8) -#loc43 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":163:59) -#loc44 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":163:55) -#loc45 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":163:51) -#loc46 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":164:51) -#loc47 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":172:27) -#loc48 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":176:41) -#loc49 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":176:33) -#loc50 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":176:21) -#loc51 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":176:72) -#loc52 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":176:52) -#loc53 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":177:33) -#loc54 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":177:58) -#loc55 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":177:39) -#loc56 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":178:21) -#loc57 = loc("/home/runner/intel-xpu-backend-for-triton/python/tutorials/09-persistent-matmul.py":178:4) -#loc58 = loc(callsite(#loc3 at #loc4)) -#loc59 = loc(callsite(#loc5 at #loc4)) -#loc60 = loc(callsite(#loc3 at #loc6)) -#loc61 = loc(callsite(#loc5 at #loc6)) -#loc62 = loc(callsite(#loc3 at #loc35)) -#loc63 = loc(callsite(#loc5 at #loc35)) + %80 = arith.muli %arg9, %c64_i32 : i32 + %81 = arith.subi %arg5, %80 : i32 + %82 = tt.splat %81 : i32 -> tensor<1x64xi32, #blocked2> + %83 = arith.cmpi slt, %38, %82 : tensor<1x64xi32, #blocked2> + %84 = tt.broadcast %83 : tensor<1x64xi1, #blocked2> -> tensor<128x64xi1, #blocked2> + %85 = tt.load %arg11, %84, %cst_1 : tensor<128x64x!tt.ptr, #blocked2> + %86 = tt.splat %81 : i32 -> tensor<64x1xi32, #blocked1> + %87 = arith.cmpi slt, %45, %86 : tensor<64x1xi32, #blocked1> + %88 = tt.broadcast %87 : tensor<64x1xi1, #blocked1> -> tensor<64x128xi1, #blocked1> + %89 = tt.load %arg12, %88, %cst_0 : tensor<64x128x!tt.ptr, #blocked1> + %90 = tt.fp_to_fp %85 : tensor<128x64xf16, #blocked2> -> tensor<128x64xf32, #blocked2> + %91 = ttg.local_alloc %90 : (tensor<128x64xf32, #blocked2>) -> !ttg.memdesc<128x64xf32, #shared, #smem> + %92 = ttg.local_load %91 : !ttg.memdesc<128x64xf32, #shared, #smem> -> tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> + %93 = tt.fp_to_fp %89 : tensor<64x128xf16, #blocked1> -> tensor<64x128xf32, #blocked1> + %94 = ttg.local_alloc %93 : (tensor<64x128xf32, #blocked1>) -> !ttg.memdesc<64x128xf32, #shared1, #smem> + %95 = ttg.local_load %94 : !ttg.memdesc<64x128xf32, #shared1, #smem> -> tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> + %96 = tt.dot %92, %95, %arg10, inputPrecision = tf32 : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked> + %97 = tt.addptr %arg11, %cst_4 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> + %98 = tt.addptr %arg12, %cst_5 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> + scf.yield %96, %97, %98 : tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr, #blocked2>, tensor<64x128x!tt.ptr, #blocked1> + } + scf.yield %79#0 : tensor<128x128xf32, #blocked> + } + %61 = arith.truncf %60 : tensor<128x128xf32, #blocked> to tensor<128x128xf16, #blocked> + %62 = tt.expand_dims %23 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1xi32, #blocked3> + %63 = tt.splat %arg8 : i32 -> tensor<128x1xi32, #blocked3> + %64 = arith.muli %63, %62 : tensor<128x1xi32, #blocked3> + %65 = tt.splat %arg2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked3> + %66 = tt.addptr %65, %64 : tensor<128x1x!tt.ptr, #blocked3>, tensor<128x1xi32, #blocked3> + %67 = tt.expand_dims %27 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> -> tensor<1x128xi32, #blocked3> + %68 = tt.broadcast %66 : tensor<128x1x!tt.ptr, #blocked3> -> tensor<128x128x!tt.ptr, #blocked3> + %69 = tt.broadcast %67 : tensor<1x128xi32, #blocked3> -> tensor<128x128xi32, #blocked3> + %70 = tt.addptr %68, %69 : tensor<128x128x!tt.ptr, #blocked3>, tensor<128x128xi32, #blocked3> + %71 = tt.splat %arg3 : i32 -> tensor<128x1xi32, #blocked3> + %72 = arith.cmpi slt, %62, %71 : tensor<128x1xi32, #blocked3> + %73 = tt.splat %arg4 : i32 -> tensor<1x128xi32, #blocked3> + %74 = arith.cmpi slt, %67, %73 : tensor<1x128xi32, #blocked3> + %75 = tt.broadcast %72 : tensor<128x1xi1, #blocked3> -> tensor<128x128xi1, #blocked3> + %76 = tt.broadcast %74 : tensor<1x128xi1, #blocked3> -> tensor<128x128xi1, #blocked3> + %77 = arith.andi %75, %76 : tensor<128x128xi1, #blocked3> + %78 = ttg.convert_layout %61 : tensor<128x128xf16, #blocked> -> tensor<128x128xf16, #blocked3> + tt.store %70, %78, %77 : tensor<128x128x!tt.ptr, #blocked3> + tt.return + } +} """ - temp_file = tmp_path / "test_regression_4441.ttgir" + temp_file = tmp_path / "test_kernel_from_09_tutorial.ttgir" temp_file.write_text(ir) kernel = triton.compile(str(temp_file)) From 55a854092f43fb5868eab253ba1306081aef204c Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 25 Jun 2025 13:48:02 +0000 Subject: [PATCH 3/7] further reduction Signed-off-by: Anatoly Myachev --- python/test/unit/intel/test_regressions.py | 100 ++++++--------------- 1 file changed, 27 insertions(+), 73 deletions(-) diff --git a/python/test/unit/intel/test_regressions.py b/python/test/unit/intel/test_regressions.py index 36d8242ad4..acbb87e497 100644 --- a/python/test/unit/intel/test_regressions.py +++ b/python/test/unit/intel/test_regressions.py @@ -66,15 +66,15 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path): #shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}> #smem = #ttg.shared_memory module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32, ttig.min_sg_size = 8 : i32, ttig.support_bf16_conversion, ttig.support_dpas, ttig.target_arch = "spir64"} { - tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32} ) attributes {noinline = false} { + tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32} ) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked> %c63_i32 = arith.constant 63 : i32 %c127_i32 = arith.constant 127 : i32 %c1_i32 = arith.constant 1 : i32 %c0_i32 = arith.constant 0 : i32 %c64_i32 = arith.constant 64 : i32 - %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xf16, #blocked1> - %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x64xf16, #blocked2> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xf32, #blocked1> + %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #blocked2> %c8_i32 = arith.constant 8 : i32 %c128_i32 = arith.constant 128 : i32 %cst_2 = arith.constant dense<0> : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> @@ -98,17 +98,11 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path): %14 = arith.muli %11, %c128_i32 : i32 %15 = arith.muli %13, %c128_i32 : i32 %16 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> - %17 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> %18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> - %19 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> %20 = tt.splat %14 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> - %21 = tt.splat %14 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> %22 = arith.addi %20, %16 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> - %23 = arith.addi %21, %17 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> %24 = tt.splat %15 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> - %25 = tt.splat %15 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> %26 = arith.addi %24, %18 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> - %27 = arith.addi %25, %19 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> %28 = tt.splat %arg3 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> %29 = arith.cmpi slt, %22, %28 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> %30 = arith.select %29, %22, %cst_2 {tt.contiguity = dense<128> : tensor<1xi32>, tt.divisibility = dense<128> : tensor<1xi32>} : tensor<128xi1, #ttg.slice<{dim = 1, parent = #blocked2}>>, tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> @@ -123,8 +117,8 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path): %39 = tt.broadcast %36 : tensor<128x1xi32, #blocked2> -> tensor<128x64xi32, #blocked2> %40 = tt.broadcast %38 : tensor<1x64xi32, #blocked2> -> tensor<128x64xi32, #blocked2> %41 = arith.addi %39, %40 : tensor<128x64xi32, #blocked2> - %42 = tt.splat %arg0 : !tt.ptr -> tensor<128x64x!tt.ptr, #blocked2> - %43 = tt.addptr %42, %41 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> + %42 = tt.splat %arg0 : !tt.ptr -> tensor<128x64x!tt.ptr, #blocked2> + %43 = tt.addptr %42, %41 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> %44 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> %45 = tt.expand_dims %44 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> %46 = tt.expand_dims %33 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> @@ -133,74 +127,34 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path): %49 = tt.broadcast %45 : tensor<64x1xi32, #blocked1> -> tensor<64x128xi32, #blocked1> %50 = tt.broadcast %48 : tensor<1x128xi32, #blocked1> -> tensor<64x128xi32, #blocked1> %51 = arith.addi %49, %50 : tensor<64x128xi32, #blocked1> - %52 = tt.splat %arg1 : !tt.ptr -> tensor<64x128x!tt.ptr, #blocked1> - %53 = tt.addptr %52, %51 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> + %52 = tt.splat %arg1 : !tt.ptr -> tensor<64x128x!tt.ptr, #blocked1> + %53 = tt.addptr %52, %51 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> %54 = arith.addi %arg5, %c63_i32 : i32 %55 = arith.divsi %54, %c64_i32 : i32 %56 = arith.remsi %arg5, %c64_i32 : i32 %57 = arith.cmpi eq, %56, %c0_i32 : i32 %58 = arith.cmpi sgt, %arg5, %c64_i32 : i32 %59 = arith.andi %57, %58 : i1 - %60 = scf.if %59 -> (tensor<128x128xf32, #blocked>) { - %79:3 = scf.for %arg9 = %c0_i32 to %55 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %43, %arg12 = %53) -> (tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr, #blocked2>, tensor<64x128x!tt.ptr, #blocked1>) : i32 { - %80 = tt.load %arg11 : tensor<128x64x!tt.ptr, #blocked2> - %81 = tt.load %arg12 : tensor<64x128x!tt.ptr, #blocked1> - %82 = tt.fp_to_fp %80 : tensor<128x64xf16, #blocked2> -> tensor<128x64xf32, #blocked2> - %83 = ttg.local_alloc %82 : (tensor<128x64xf32, #blocked2>) -> !ttg.memdesc<128x64xf32, #shared, #smem> - %84 = ttg.local_load %83 : !ttg.memdesc<128x64xf32, #shared, #smem> -> tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> - %85 = tt.fp_to_fp %81 : tensor<64x128xf16, #blocked1> -> tensor<64x128xf32, #blocked1> - %86 = ttg.local_alloc %85 : (tensor<64x128xf32, #blocked1>) -> !ttg.memdesc<64x128xf32, #shared1, #smem> - %87 = ttg.local_load %86 : !ttg.memdesc<64x128xf32, #shared1, #smem> -> tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> - %88 = tt.dot %84, %87, %arg10, inputPrecision = tf32 : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked> - %89 = tt.addptr %arg11, %cst_4 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> - %90 = tt.addptr %arg12, %cst_5 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> - scf.yield %88, %89, %90 : tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr, #blocked2>, tensor<64x128x!tt.ptr, #blocked1> - } - scf.yield %79#0 : tensor<128x128xf32, #blocked> - } else { - %79:3 = scf.for %arg9 = %c0_i32 to %55 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %43, %arg12 = %53) -> (tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr, #blocked2>, tensor<64x128x!tt.ptr, #blocked1>) : i32 { - %80 = arith.muli %arg9, %c64_i32 : i32 - %81 = arith.subi %arg5, %80 : i32 - %82 = tt.splat %81 : i32 -> tensor<1x64xi32, #blocked2> - %83 = arith.cmpi slt, %38, %82 : tensor<1x64xi32, #blocked2> - %84 = tt.broadcast %83 : tensor<1x64xi1, #blocked2> -> tensor<128x64xi1, #blocked2> - %85 = tt.load %arg11, %84, %cst_1 : tensor<128x64x!tt.ptr, #blocked2> - %86 = tt.splat %81 : i32 -> tensor<64x1xi32, #blocked1> - %87 = arith.cmpi slt, %45, %86 : tensor<64x1xi32, #blocked1> - %88 = tt.broadcast %87 : tensor<64x1xi1, #blocked1> -> tensor<64x128xi1, #blocked1> - %89 = tt.load %arg12, %88, %cst_0 : tensor<64x128x!tt.ptr, #blocked1> - %90 = tt.fp_to_fp %85 : tensor<128x64xf16, #blocked2> -> tensor<128x64xf32, #blocked2> - %91 = ttg.local_alloc %90 : (tensor<128x64xf32, #blocked2>) -> !ttg.memdesc<128x64xf32, #shared, #smem> - %92 = ttg.local_load %91 : !ttg.memdesc<128x64xf32, #shared, #smem> -> tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> - %93 = tt.fp_to_fp %89 : tensor<64x128xf16, #blocked1> -> tensor<64x128xf32, #blocked1> - %94 = ttg.local_alloc %93 : (tensor<64x128xf32, #blocked1>) -> !ttg.memdesc<64x128xf32, #shared1, #smem> - %95 = ttg.local_load %94 : !ttg.memdesc<64x128xf32, #shared1, #smem> -> tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> - %96 = tt.dot %92, %95, %arg10, inputPrecision = tf32 : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked> - %97 = tt.addptr %arg11, %cst_4 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> - %98 = tt.addptr %arg12, %cst_5 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> - scf.yield %96, %97, %98 : tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr, #blocked2>, tensor<64x128x!tt.ptr, #blocked1> - } - scf.yield %79#0 : tensor<128x128xf32, #blocked> - } - %61 = arith.truncf %60 : tensor<128x128xf32, #blocked> to tensor<128x128xf16, #blocked> - %62 = tt.expand_dims %23 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1xi32, #blocked3> - %63 = tt.splat %arg8 : i32 -> tensor<128x1xi32, #blocked3> - %64 = arith.muli %63, %62 : tensor<128x1xi32, #blocked3> - %65 = tt.splat %arg2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked3> - %66 = tt.addptr %65, %64 : tensor<128x1x!tt.ptr, #blocked3>, tensor<128x1xi32, #blocked3> - %67 = tt.expand_dims %27 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> -> tensor<1x128xi32, #blocked3> - %68 = tt.broadcast %66 : tensor<128x1x!tt.ptr, #blocked3> -> tensor<128x128x!tt.ptr, #blocked3> - %69 = tt.broadcast %67 : tensor<1x128xi32, #blocked3> -> tensor<128x128xi32, #blocked3> - %70 = tt.addptr %68, %69 : tensor<128x128x!tt.ptr, #blocked3>, tensor<128x128xi32, #blocked3> - %71 = tt.splat %arg3 : i32 -> tensor<128x1xi32, #blocked3> - %72 = arith.cmpi slt, %62, %71 : tensor<128x1xi32, #blocked3> - %73 = tt.splat %arg4 : i32 -> tensor<1x128xi32, #blocked3> - %74 = arith.cmpi slt, %67, %73 : tensor<1x128xi32, #blocked3> - %75 = tt.broadcast %72 : tensor<128x1xi1, #blocked3> -> tensor<128x128xi1, #blocked3> - %76 = tt.broadcast %74 : tensor<1x128xi1, #blocked3> -> tensor<128x128xi1, #blocked3> - %77 = arith.andi %75, %76 : tensor<128x128xi1, #blocked3> - %78 = ttg.convert_layout %61 : tensor<128x128xf16, #blocked> -> tensor<128x128xf16, #blocked3> - tt.store %70, %78, %77 : tensor<128x128x!tt.ptr, #blocked3> + + %80 = arith.muli %c0_i32, %c64_i32 : i32 + %81 = arith.subi %arg5, %80 : i32 + %82 = tt.splat %81 : i32 -> tensor<1x64xi32, #blocked2> + %83 = arith.cmpi slt, %38, %82 : tensor<1x64xi32, #blocked2> + %84 = tt.broadcast %83 : tensor<1x64xi1, #blocked2> -> tensor<128x64xi1, #blocked2> + %85 = tt.load %43, %84, %cst_1 : tensor<128x64x!tt.ptr, #blocked2> + %86 = tt.splat %81 : i32 -> tensor<64x1xi32, #blocked1> + %87 = arith.cmpi slt, %45, %86 : tensor<64x1xi32, #blocked1> + %88 = tt.broadcast %87 : tensor<64x1xi1, #blocked1> -> tensor<64x128xi1, #blocked1> + %89 = tt.load %53, %88, %cst_0 : tensor<64x128x!tt.ptr, #blocked1> + %91 = ttg.local_alloc %85 : (tensor<128x64xf32, #blocked2>) -> !ttg.memdesc<128x64xf32, #shared, #smem> + %92 = ttg.local_load %91 : !ttg.memdesc<128x64xf32, #shared, #smem> -> tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> + %94 = ttg.local_alloc %89 : (tensor<64x128xf32, #blocked1>) -> !ttg.memdesc<64x128xf32, #shared1, #smem> + %95 = ttg.local_load %94 : !ttg.memdesc<64x128xf32, #shared1, #smem> -> tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> + %96 = tt.dot %92, %95, %cst, inputPrecision = tf32 : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked> + %97 = tt.addptr %43, %cst_4 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> + %98 = tt.addptr %53, %cst_5 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> + + %78 = ttg.convert_layout %96 : tensor<128x128xf32, #blocked> -> tensor<128x128xf32, #blocked3> tt.return } } From a92234a17b7020e064587c372d98c8a14466a806 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 25 Jun 2025 15:05:50 +0000 Subject: [PATCH 4/7] further reduction Signed-off-by: Anatoly Myachev --- python/test/unit/intel/test_regressions.py | 55 +++++----------------- 1 file changed, 13 insertions(+), 42 deletions(-) diff --git a/python/test/unit/intel/test_regressions.py b/python/test/unit/intel/test_regressions.py index acbb87e497..689520e1fa 100644 --- a/python/test/unit/intel/test_regressions.py +++ b/python/test/unit/intel/test_regressions.py @@ -61,12 +61,11 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path): #blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> #blocked2 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#blocked3 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}> #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}> #shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}> #smem = #ttg.shared_memory module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32, ttig.min_sg_size = 8 : i32, ttig.support_bf16_conversion, ttig.support_dpas, ttig.target_arch = "spir64"} { - tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32} ) attributes {noinline = false} { + tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked> %c63_i32 = arith.constant 63 : i32 %c127_i32 = arith.constant 127 : i32 @@ -79,69 +78,42 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path): %c128_i32 = arith.constant 128 : i32 %cst_2 = arith.constant dense<0> : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> %cst_3 = arith.constant dense<0> : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> - %cst_4 = arith.constant dense<64> : tensor<128x64xi32, #blocked2> - %cst_5 = arith.constant dense<64> : tensor<64x128xi32, #blocked1> %0 = tt.get_program_id x : i32 %1 = arith.addi %arg3, %c127_i32 : i32 %2 = arith.divsi %1, %c128_i32 : i32 - %3 = arith.addi %arg4, %c127_i32 : i32 - %4 = arith.divsi %3, %c128_i32 : i32 - %5 = arith.muli %4, %c8_i32 : i32 + %5 = arith.muli %2, %c8_i32 : i32 %6 = arith.divsi %0, %5 : i32 %7 = arith.muli %6, %c8_i32 : i32 %8 = arith.subi %2, %7 : i32 %9 = arith.minsi %8, %c8_i32 : i32 - %10 = arith.remsi %0, %9 : i32 - %11 = arith.addi %7, %10 : i32 %12 = arith.remsi %0, %5 : i32 %13 = arith.divsi %12, %9 : i32 - %14 = arith.muli %11, %c128_i32 : i32 %15 = arith.muli %13, %c128_i32 : i32 - %16 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> %18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> - %20 = tt.splat %14 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> - %22 = arith.addi %20, %16 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> + %20 = tt.splat %c128_i32 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> %24 = tt.splat %15 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> %26 = arith.addi %24, %18 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> %28 = tt.splat %arg3 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> - %29 = arith.cmpi slt, %22, %28 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> - %30 = arith.select %29, %22, %cst_2 {tt.contiguity = dense<128> : tensor<1xi32>, tt.divisibility = dense<128> : tensor<1xi32>} : tensor<128xi1, #ttg.slice<{dim = 1, parent = #blocked2}>>, tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> - %31 = tt.splat %arg4 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> + %29 = arith.cmpi slt, %20, %28 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> + %31 = tt.splat %arg3 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> %32 = arith.cmpi slt, %26, %31 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> %33 = arith.select %32, %26, %cst_3 {tt.contiguity = dense<128> : tensor<1xi32>, tt.divisibility = dense<128> : tensor<1xi32>} : tensor<128xi1, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> - %34 = tt.expand_dims %30 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<128x1xi32, #blocked2> - %35 = tt.splat %arg6 : i32 -> tensor<128x1xi32, #blocked2> - %36 = arith.muli %34, %35 : tensor<128x1xi32, #blocked2> %37 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> %38 = tt.expand_dims %37 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x64xi32, #blocked2> - %39 = tt.broadcast %36 : tensor<128x1xi32, #blocked2> -> tensor<128x64xi32, #blocked2> - %40 = tt.broadcast %38 : tensor<1x64xi32, #blocked2> -> tensor<128x64xi32, #blocked2> - %41 = arith.addi %39, %40 : tensor<128x64xi32, #blocked2> %42 = tt.splat %arg0 : !tt.ptr -> tensor<128x64x!tt.ptr, #blocked2> - %43 = tt.addptr %42, %41 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> %44 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> %45 = tt.expand_dims %44 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> %46 = tt.expand_dims %33 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> - %47 = tt.splat %arg7 : i32 -> tensor<1x128xi32, #blocked1> - %48 = arith.muli %46, %47 : tensor<1x128xi32, #blocked1> - %49 = tt.broadcast %45 : tensor<64x1xi32, #blocked1> -> tensor<64x128xi32, #blocked1> - %50 = tt.broadcast %48 : tensor<1x128xi32, #blocked1> -> tensor<64x128xi32, #blocked1> - %51 = arith.addi %49, %50 : tensor<64x128xi32, #blocked1> - %52 = tt.splat %arg1 : !tt.ptr -> tensor<64x128x!tt.ptr, #blocked1> - %53 = tt.addptr %52, %51 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> - %54 = arith.addi %arg5, %c63_i32 : i32 - %55 = arith.divsi %54, %c64_i32 : i32 - %56 = arith.remsi %arg5, %c64_i32 : i32 - %57 = arith.cmpi eq, %56, %c0_i32 : i32 - %58 = arith.cmpi sgt, %arg5, %c64_i32 : i32 - %59 = arith.andi %57, %58 : i1 + %50 = tt.broadcast %46 : tensor<1x128xi32, #blocked1> -> tensor<64x128xi32, #blocked1> + %52 = tt.splat %arg0 : !tt.ptr -> tensor<64x128x!tt.ptr, #blocked1> + %53 = tt.addptr %52, %50 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> %80 = arith.muli %c0_i32, %c64_i32 : i32 %81 = arith.subi %arg5, %80 : i32 %82 = tt.splat %81 : i32 -> tensor<1x64xi32, #blocked2> %83 = arith.cmpi slt, %38, %82 : tensor<1x64xi32, #blocked2> %84 = tt.broadcast %83 : tensor<1x64xi1, #blocked2> -> tensor<128x64xi1, #blocked2> - %85 = tt.load %43, %84, %cst_1 : tensor<128x64x!tt.ptr, #blocked2> + %85 = tt.load %42, %84, %cst_1 : tensor<128x64x!tt.ptr, #blocked2> %86 = tt.splat %81 : i32 -> tensor<64x1xi32, #blocked1> %87 = arith.cmpi slt, %45, %86 : tensor<64x1xi32, #blocked1> %88 = tt.broadcast %87 : tensor<64x1xi1, #blocked1> -> tensor<64x128xi1, #blocked1> @@ -149,12 +121,11 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path): %91 = ttg.local_alloc %85 : (tensor<128x64xf32, #blocked2>) -> !ttg.memdesc<128x64xf32, #shared, #smem> %92 = ttg.local_load %91 : !ttg.memdesc<128x64xf32, #shared, #smem> -> tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> %94 = ttg.local_alloc %89 : (tensor<64x128xf32, #blocked1>) -> !ttg.memdesc<64x128xf32, #shared1, #smem> - %95 = ttg.local_load %94 : !ttg.memdesc<64x128xf32, #shared1, #smem> -> tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> - %96 = tt.dot %92, %95, %cst, inputPrecision = tf32 : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked> - %97 = tt.addptr %43, %cst_4 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> - %98 = tt.addptr %53, %cst_5 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> + %cst_test = arith.constant dense<1.11111116> : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> + %cst_test2 = arith.constant dense<1.11111116> : tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> + %96 = tt.dot %92, %cst_test2, %cst, inputPrecision = tf32 : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked> - %78 = ttg.convert_layout %96 : tensor<128x128xf32, #blocked> -> tensor<128x128xf32, #blocked3> + %78 = ttg.convert_layout %96 : tensor<128x128xf32, #blocked> -> tensor<128x128xf32, #blocked2> tt.return } } From 06d5701162e6119bd849b1c15d79b2a7fa7d62f6 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 25 Jun 2025 15:36:40 +0000 Subject: [PATCH 5/7] further reduction Signed-off-by: Anatoly Myachev --- python/test/unit/intel/test_regressions.py | 36 +++------------------- 1 file changed, 4 insertions(+), 32 deletions(-) diff --git a/python/test/unit/intel/test_regressions.py b/python/test/unit/intel/test_regressions.py index 689520e1fa..455173e44a 100644 --- a/python/test/unit/intel/test_regressions.py +++ b/python/test/unit/intel/test_regressions.py @@ -65,7 +65,7 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path): #shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}> #smem = #ttg.shared_memory module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32, ttig.min_sg_size = 8 : i32, ttig.support_bf16_conversion, ttig.support_dpas, ttig.target_arch = "spir64"} { - tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} { + tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked> %c63_i32 = arith.constant 63 : i32 %c127_i32 = arith.constant 127 : i32 @@ -73,47 +73,19 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path): %c0_i32 = arith.constant 0 : i32 %c64_i32 = arith.constant 64 : i32 %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xf32, #blocked1> - %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #blocked2> %c8_i32 = arith.constant 8 : i32 %c128_i32 = arith.constant 128 : i32 - %cst_2 = arith.constant dense<0> : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> - %cst_3 = arith.constant dense<0> : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> - %0 = tt.get_program_id x : i32 - %1 = arith.addi %arg3, %c127_i32 : i32 - %2 = arith.divsi %1, %c128_i32 : i32 - %5 = arith.muli %2, %c8_i32 : i32 - %6 = arith.divsi %0, %5 : i32 - %7 = arith.muli %6, %c8_i32 : i32 - %8 = arith.subi %2, %7 : i32 - %9 = arith.minsi %8, %c8_i32 : i32 - %12 = arith.remsi %0, %5 : i32 - %13 = arith.divsi %12, %9 : i32 - %15 = arith.muli %13, %c128_i32 : i32 %18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> - %20 = tt.splat %c128_i32 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> - %24 = tt.splat %15 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> - %26 = arith.addi %24, %18 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> - %28 = tt.splat %arg3 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> - %29 = arith.cmpi slt, %20, %28 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> - %31 = tt.splat %arg3 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> - %32 = arith.cmpi slt, %26, %31 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> - %33 = arith.select %32, %26, %cst_3 {tt.contiguity = dense<128> : tensor<1xi32>, tt.divisibility = dense<128> : tensor<1xi32>} : tensor<128xi1, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> - %37 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> - %38 = tt.expand_dims %37 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x64xi32, #blocked2> %42 = tt.splat %arg0 : !tt.ptr -> tensor<128x64x!tt.ptr, #blocked2> %44 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> %45 = tt.expand_dims %44 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> - %46 = tt.expand_dims %33 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> + %46 = tt.expand_dims %18 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> %50 = tt.broadcast %46 : tensor<1x128xi32, #blocked1> -> tensor<64x128xi32, #blocked1> %52 = tt.splat %arg0 : !tt.ptr -> tensor<64x128x!tt.ptr, #blocked1> %53 = tt.addptr %52, %50 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> - %80 = arith.muli %c0_i32, %c64_i32 : i32 - %81 = arith.subi %arg5, %80 : i32 - %82 = tt.splat %81 : i32 -> tensor<1x64xi32, #blocked2> - %83 = arith.cmpi slt, %38, %82 : tensor<1x64xi32, #blocked2> - %84 = tt.broadcast %83 : tensor<1x64xi1, #blocked2> -> tensor<128x64xi1, #blocked2> - %85 = tt.load %42, %84, %cst_1 : tensor<128x64x!tt.ptr, #blocked2> + %81 = arith.subi %arg5, %c64_i32 : i32 + %85 = tt.load %42: tensor<128x64x!tt.ptr, #blocked2> %86 = tt.splat %81 : i32 -> tensor<64x1xi32, #blocked1> %87 = arith.cmpi slt, %45, %86 : tensor<64x1xi32, #blocked1> %88 = tt.broadcast %87 : tensor<64x1xi1, #blocked1> -> tensor<64x128xi1, #blocked1> From 67ddd4fde70889aafecf24eb445bdb1d4fad9e0d Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 25 Jun 2025 16:01:05 +0000 Subject: [PATCH 6/7] further reduction Signed-off-by: Anatoly Myachev --- python/test/unit/intel/test_regressions.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/python/test/unit/intel/test_regressions.py b/python/test/unit/intel/test_regressions.py index 455173e44a..6d5fc04ef1 100644 --- a/python/test/unit/intel/test_regressions.py +++ b/python/test/unit/intel/test_regressions.py @@ -60,23 +60,15 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path): ir = """ #blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked2 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}> #shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}> #smem = #ttg.shared_memory module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32, ttig.min_sg_size = 8 : i32, ttig.support_bf16_conversion, ttig.support_dpas, ttig.target_arch = "spir64"} { - tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} { + tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) { %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked> - %c63_i32 = arith.constant 63 : i32 - %c127_i32 = arith.constant 127 : i32 - %c1_i32 = arith.constant 1 : i32 - %c0_i32 = arith.constant 0 : i32 - %c64_i32 = arith.constant 64 : i32 %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xf32, #blocked1> - %c8_i32 = arith.constant 8 : i32 - %c128_i32 = arith.constant 128 : i32 %18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> - %42 = tt.splat %arg0 : !tt.ptr -> tensor<128x64x!tt.ptr, #blocked2> + %42 = tt.splat %arg0 : !tt.ptr -> tensor<128x64x!tt.ptr, #blocked1> %44 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> %45 = tt.expand_dims %44 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> %46 = tt.expand_dims %18 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> @@ -84,20 +76,18 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path): %52 = tt.splat %arg0 : !tt.ptr -> tensor<64x128x!tt.ptr, #blocked1> %53 = tt.addptr %52, %50 : tensor<64x128x!tt.ptr, #blocked1>, tensor<64x128xi32, #blocked1> - %81 = arith.subi %arg5, %c64_i32 : i32 - %85 = tt.load %42: tensor<128x64x!tt.ptr, #blocked2> - %86 = tt.splat %81 : i32 -> tensor<64x1xi32, #blocked1> + %85 = tt.load %42: tensor<128x64x!tt.ptr, #blocked1> + %86 = tt.splat %arg5 : i32 -> tensor<64x1xi32, #blocked1> %87 = arith.cmpi slt, %45, %86 : tensor<64x1xi32, #blocked1> %88 = tt.broadcast %87 : tensor<64x1xi1, #blocked1> -> tensor<64x128xi1, #blocked1> %89 = tt.load %53, %88, %cst_0 : tensor<64x128x!tt.ptr, #blocked1> - %91 = ttg.local_alloc %85 : (tensor<128x64xf32, #blocked2>) -> !ttg.memdesc<128x64xf32, #shared, #smem> + %91 = ttg.local_alloc %85 : (tensor<128x64xf32, #blocked1>) -> !ttg.memdesc<128x64xf32, #shared, #smem> %92 = ttg.local_load %91 : !ttg.memdesc<128x64xf32, #shared, #smem> -> tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> %94 = ttg.local_alloc %89 : (tensor<64x128xf32, #blocked1>) -> !ttg.memdesc<64x128xf32, #shared1, #smem> - %cst_test = arith.constant dense<1.11111116> : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> %cst_test2 = arith.constant dense<1.11111116> : tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> %96 = tt.dot %92, %cst_test2, %cst, inputPrecision = tf32 : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked> - %78 = ttg.convert_layout %96 : tensor<128x128xf32, #blocked> -> tensor<128x128xf32, #blocked2> + %78 = ttg.convert_layout %96 : tensor<128x128xf32, #blocked> -> tensor<128x128xf32, #blocked1> tt.return } } From 06c86fc6406844f05f8905e040413e8fe2272ca8 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 25 Jun 2025 16:02:22 +0000 Subject: [PATCH 7/7] correct error code Signed-off-by: Anatoly Myachev --- python/test/unit/intel/test_regressions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/test/unit/intel/test_regressions.py b/python/test/unit/intel/test_regressions.py index 6d5fc04ef1..f62dccc98b 100644 --- a/python/test/unit/intel/test_regressions.py +++ b/python/test/unit/intel/test_regressions.py @@ -102,8 +102,8 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path): # try to catch: # L0 build module failed. Log: IGC: Internal Compiler Error: Segmentation violation - # Error during Intel loadBinary: Triton Error [ZE]: 0x70000004 - # RuntimeError: Triton Error [ZE]: 0x70000004 + # Error during Intel loadBinary: Triton Error [ZE]: 0x78000011 + # RuntimeError: Triton Error [ZE]: 0x78000011 module, function, n_regs, n_spills, n_max_threads = driver.active.utils.load_binary( kernel.name, kernel.kernel, kernel.metadata.shared, kernel.metadata.build_flags, not kernel.metadata.generate_native_code, device)