ROCm · mirza-halilcevic · Oct 29, 2025 · Nov 4, 2025 · Nov 4, 2025 · Nov 27, 2025
@@ -10,9 +10,9 @@
 // GRID-LABEL: rock_conv
 func.func @rock_conv(%filter : memref<1x128x8x3x3xf32>, %input : memref<128x1x8x32x32xf32>, %output : memref<128x1x128x30x30xf32>) attributes {arch = "amdgcn-amd-amdhsa:gfx906"} {
   // CHECK: rock.conv
-  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 128, kPerBlock = 8, mPerBlock = 128, nPerBlock = 128, kPerThread = 1, mPerThread = 2, nPerThread = 2, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
+  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 64, kPerBlock = 4, mPerBlock = 64, nPerBlock = 128, kPerThread = 1, mPerThread = 2, nPerThread = 4, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
   // GRID: rock.gridwise_gemm
-  // GRID-SAME: gridSize = 900
+  // GRID-SAME: gridSize = 1800
   rock.conv(%filter, %input, %output) features = none {
     filter_layout = ["g", "k", "c", "0", "1"],
     input_layout = ["ni", "gi", "ci", "0i", "1i"],
@@ -28,9 +28,9 @@ func.func @rock_conv(%filter : memref<1x128x8x3x3xf32>, %input : memref<128x1x8x
 // GRID-LABEL: rock_conv_schedulev2
 func.func @rock_conv_schedulev2(%filter : memref<1x128x8x3x3xf32>, %input : memref<128x1x8x32x32xf32>, %output : memref<128x1x128x30x30xf32>) attributes {schedule_version =  #rock.schedule_version<2>, arch = "amdgcn-amd-amdhsa:gfx906"} {
   // CHECK: rock.conv
-  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 128, kPerBlock = 8, mPerBlock = 128, nPerBlock = 128, kPerThread = 1, mPerThread = 2, nPerThread = 2, kpack = 1, splitKFactor = 1, scheduleVersion = 2, outputSwizzle = 2>
+  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 64, kPerBlock = 4, mPerBlock = 64, nPerBlock = 128, kPerThread = 1, mPerThread = 2, nPerThread = 4, kpack = 1, splitKFactor = 1, scheduleVersion = 2, outputSwizzle = 2>
   // GRID: rock.gridwise_gemm
-  // GRID-SAME: gridSize = 900
+  // GRID-SAME: gridSize = 1800
   rock.conv(%filter, %input, %output) features = none {
     filter_layout = ["g", "k", "c", "0", "1"],
     input_layout = ["ni", "gi", "ci", "0i", "1i"],
@@ -46,9 +46,9 @@ func.func @rock_conv_schedulev2(%filter : memref<1x128x8x3x3xf32>, %input : memr
 // GRID-LABEL: func.func @rock_conv_f16
 func.func @rock_conv_f16(%filter : memref<1x128x8x3x3xf16>, %input : memref<128x1x8x32x32xf16>, %output : memref<128x1x128x30x30xf16>) attributes {arch = "amdgcn-amd-amdhsa:gfx906"} {
   // CHECK: rock.conv
-  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 128, kPerBlock = 8, mPerBlock = 128, nPerBlock = 128, kPerThread = 1, mPerThread = 2, nPerThread = 2, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
+  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 64, kPerBlock = 4, mPerBlock = 64, nPerBlock = 128, kPerThread = 1, mPerThread = 2, nPerThread = 4, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
   // GRID: rock.gridwise_gemm
-  // GRID-SAME: gridSize = 900
+  // GRID-SAME: gridSize = 1800
   rock.conv(%filter, %input, %output) features = none {
     filter_layout = ["g", "k", "c", "0", "1"],
     input_layout = ["ni", "gi", "ci", "0i", "1i"],
@@ -125,7 +125,7 @@ func.func @rock_conv_bwd_data_f16(%filter: memref<1x1024x1024x1x1xf16>, %input:
 // GRID-LABEL: func.func @rock_conv_bwd_data_padMN
 func.func @rock_conv_bwd_data_padMN(%filter : memref<1x64x3x1x1xf32>, %input : memref<11x1x3x15x15xf32>, %output : memref<11x1x64x15x15xf32>) attributes {arch = "amdgcn-amd-amdhsa:gfx906"} {
   // CHECK: rock.conv_bwd_data
-  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 64, kPerBlock = 16, mPerBlock = 32, nPerBlock = 64, kPerThread = 1, mPerThread = 2, nPerThread = 2, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
+  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 64, kPerBlock = 8, mPerBlock = 32, nPerBlock = 64, kPerThread = 1, mPerThread = 2, nPerThread = 4, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
   // GRID: rock.gridwise_gemm
   // GRID-SAME: gridSize = 39
   rock.conv_bwd_data(%filter, %input, %output) features = none {
@@ -145,9 +145,9 @@ func.func @rock_conv_bwd_data_padMN(%filter : memref<1x64x3x1x1xf32>, %input : m
 // GRID-LABEL: @rock_conv_bwd_data_padMK
 func.func @rock_conv_bwd_data_padMK(%filter : memref<1x11x3x1x1xf32>, %input : memref<128x1x3x15x15xf32>, %output : memref<128x1x11x15x15xf32>) attributes {arch = "amdgcn-amd-amdhsa:gfx906"} {
   // CHECK: rock.conv_bwd_data
-  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 64, kPerBlock = 4, mPerBlock = 32, nPerBlock = 64, kPerThread = 1, mPerThread = 2, nPerThread = 2, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
+  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 64, kPerBlock = 4, mPerBlock = 32, nPerBlock = 128, kPerThread = 1, mPerThread = 2, nPerThread = 2, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
   // GRID: rock.gridwise_gemm
-  // GRID-SAME: gridSize = 450
+  // GRID-SAME: gridSize = 225
   rock.conv_bwd_data(%filter, %input, %output) features = none {
     filter_layout = ["g", "k", "c", "0", "1"],
     input_layout = ["ni", "gi", "ci", "0i", "1i"],
@@ -165,7 +165,7 @@ func.func @rock_conv_bwd_data_padMK(%filter : memref<1x11x3x1x1xf32>, %input : m
 // GRID-LABEL: @rock_conv_bwd_weight
 func.func @rock_conv_bwd_weight(%filter : memref<1x128x8x3x3xf32>, %input : memref<128x1x8x32x32xf32>, %output : memref<128x1x128x30x30xf32>) attributes {arch = "amdgcn-amd-amdhsa:gfx906", numCU = 64 : i32} {
   // CHECK: rock.conv_bwd_weight
-  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 64, kPerBlock = 16, mPerBlock = 64, nPerBlock = 32, kPerThread = 1, mPerThread = 2, nPerThread = 2, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
+  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 64, kPerBlock = 8, mPerBlock = 64, nPerBlock = 32, kPerThread = 1, mPerThread = 2, nPerThread = 2, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
   // GRID: rock.gridwise_gemm
   // GRID-SAME: gridSize = 6
   rock.conv_bwd_weight(%filter, %input, %output) features = none {
@@ -183,7 +183,7 @@ func.func @rock_conv_bwd_weight(%filter : memref<1x128x8x3x3xf32>, %input : memr
 // GRID-LABEL: @rock_conv_bwd_weight_f16
 func.func @rock_conv_bwd_weight_f16(%filter : memref<1x128x8x3x3xf16>, %input : memref<128x1x8x32x32xf16>, %output : memref<128x1x128x30x30xf16>) attributes {arch = "amdgcn-amd-amdhsa:gfx906", numCU = 64 : i32} {
   // CHECK: rock.conv_bwd_weight
-  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 64, kPerBlock = 16, mPerBlock = 64, nPerBlock = 32, kPerThread = 1, mPerThread = 2, nPerThread = 2, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
+  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 64, kPerBlock = 8, mPerBlock = 64, nPerBlock = 32, kPerThread = 1, mPerThread = 2, nPerThread = 2, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
   // GRID: rock.gridwise_gemm
   // GRID-SAME: gridSize = 6
   rock.conv_bwd_weight(%filter, %input, %output) features = none {
@@ -201,7 +201,7 @@ func.func @rock_conv_bwd_weight_f16(%filter : memref<1x128x8x3x3xf16>, %input :
 // GRID-LABEL: func.func @rock_conv_bwd_weight_padALL
 func.func @rock_conv_bwd_weight_padALL(%filter : memref<1x20x8x3x3xf32>, %input : memref<7x1x8x32x32xf32>, %output : memref<7x1x20x30x30xf32>) attributes {arch = "amdgcn-amd-amdhsa:gfx906", numCU = 64 : i32} {
   // CHECK: rock.conv_bwd_weight
-  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 64, kPerBlock = 16, mPerBlock = 32, nPerBlock = 32, kPerThread = 1, mPerThread = 2, nPerThread = 2, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
+  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 128, kPerBlock = 16, mPerBlock = 32, nPerBlock = 32, kPerThread = 1, mPerThread = 2, nPerThread = 4, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
   // GRID: rock.gridwise_gemm
   // GRID-SAME: gridSize = 3
   rock.conv_bwd_weight(%filter, %input, %output) features = none {
@@ -219,7 +219,7 @@ func.func @rock_conv_bwd_weight_padALL(%filter : memref<1x20x8x3x3xf32>, %input
 // GRID-LABEL: @rock_conv_bwd_weight_padALL_f16
 func.func @rock_conv_bwd_weight_padALL_f16(%filter : memref<1x20x8x3x3xf16>, %input : memref<7x1x8x32x32xf16>, %output : memref<7x1x20x30x30xf16>) attributes {arch = "amdgcn-amd-amdhsa:gfx906", numCU = 64 : i32} {
   // CHECK: rock.conv_bwd_weight
-  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 64, kPerBlock = 16, mPerBlock = 32, nPerBlock = 32, kPerThread = 1, mPerThread = 2, nPerThread = 2, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
+  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 128, kPerBlock = 16, mPerBlock = 32, nPerBlock = 32, kPerThread = 1, mPerThread = 2, nPerThread = 4, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
   // GRID: rock.gridwise_gemm
   // GRID-SAME: gridSize = 3
   rock.conv_bwd_weight(%filter, %input, %output) features = none {
@@ -340,7 +340,7 @@ func.func @rock_conv_bwd_data_7x7(%arg0: memref<1x64x3x7x7xf32>, %arg1: memref<2
 // GRID-LABEL: @rock_gemm_from_conv
 func.func @rock_gemm_from_conv(%a : memref<1x72x128xf32>, %b : memref<1x72x115200xf32>, %c : memref<1x128x115200xf32>) attributes {arch = "amdgcn-amd-amdhsa:gfx906", numCU = 64 : i32} {
   // CHECK: rock.gemm
-  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 128, kPerBlock = 8, mPerBlock = 128, nPerBlock = 128, kPerThread = 1, mPerThread = 4, nPerThread = 4, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
+  // CHECK-SAME: params = #rock.general_gemm_params<blockSize = 128, kPerBlock = 4, mPerBlock = 128, nPerBlock = 128, kPerThread = 1, mPerThread = 4, nPerThread = 2, kpack = 1, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2>
   // GRID: rock.gridwise_gemm
   // GRID-SAME: gridSize = 900
   rock.gemm %c = tr %a * %b features = none storeMethod = set  

@@ -7,7 +7,7 @@
 // EMITKEY: -t f16 -out_datatype f16 -transA false -transB false -g 2 -m 4096 -n 640 -k 320
 
 // VECTORIZATION: aVectorDim: GemmDimension::MorN
-// VECTORIZATION-NEXT: aVectorLen: 2
+// VECTORIZATION-NEXT: aVectorLen: 4
 // VECTORIZATION: bVectorDim: GemmDimension::MorN
 // VECTORIZATION-NEXT: bVectorLen: 2
 

@@ -7,7 +7,7 @@
 // EMITKEY: -t f16 -out_datatype f16 -transA false -transB true -g 2 -m 4096 -n 640 -k 320
 
 // VECTORIZATION: aVectorDim: GemmDimension::MorN
-// VECTORIZATION-NEXT: aVectorLen: 2
+// VECTORIZATION-NEXT: aVectorLen: 4
 // VECTORIZATION: bVectorDim: GemmDimension::K
 // VECTORIZATION-NEXT: bVectorLen: 8
 

@@ -6,7 +6,7 @@
 // EMITKEY: -t f16 -out_datatype f16 -transA true -transB false -g 2 -m 4096 -n 640 -k 320
 
 // VECTORIZATION: aVectorDim: GemmDimension::MorN
-// VECTORIZATION-NEXT: aVectorLen: 2
+// VECTORIZATION-NEXT: aVectorLen: 4
 // VECTORIZATION: bVectorDim: GemmDimension::MorN
 // VECTORIZATION-NEXT: bVectorLen: 2
 

@@ -7,7 +7,7 @@
 // EMITKEY: -t f16 -out_datatype f16 -transA true -transB false -g 2 -m 4096 -n 640 -k 320
 
 // VECTORIZATION: aVectorDim: GemmDimension::MorN
-// VECTORIZATION-NEXT: aVectorLen: 2
+// VECTORIZATION-NEXT: aVectorLen: 4
 // VECTORIZATION: bVectorDim: GemmDimension::MorN
 // VECTORIZATION-NEXT: bVectorLen: 2
 

@@ -7,7 +7,7 @@
 // EMITKEY: -t f16 -out_datatype f16 -transA true -transB true -g 2 -m 4096 -n 640 -k 320
 
 // VECTORIZATION: aVectorDim: GemmDimension::MorN
-// VECTORIZATION-NEXT: aVectorLen: 2
+// VECTORIZATION-NEXT: aVectorLen: 4
 // VECTORIZATION: bVectorDim: GemmDimension::K
 // VECTORIZATION-NEXT: bVectorLen: 8
 

@@ -7,7 +7,7 @@
 // EMITKEY: -t f16 -out_datatype f16 -transA true -transB true -g 2 -m 4096 -n 640 -k 320
 
 // VECTORIZATION: aVectorDim: GemmDimension::MorN
-// VECTORIZATION-NEXT: aVectorLen: 2
+// VECTORIZATION-NEXT: aVectorLen: 4
 // VECTORIZATION: bVectorDim: GemmDimension::K
 // VECTORIZATION-NEXT: bVectorLen: 8
 

@@ -7,7 +7,7 @@
 
 // CHECK_LINALG_ALIGN-COUNT-2: rock.threadwise_read_into {{.*}}
 // CHECK_LINALG_ALIGN: rock.threadwise_read_into {{.*}} -> [[lain:%.*]] :
-// CHECK_LINALG_ALIGN: linalg.generic{{.*}} ins({{.*}}, [[lain]] :{{.*}}) outs(%[[outBuf:.*]] : memref<32xf32, #gpu.address_space<private>>)
+// CHECK_LINALG_ALIGN: linalg.generic{{.*}} ins({{.*}}, [[lain]] :{{.*}}) outs(%[[outBuf:.*]] : memref<8xf32, #gpu.address_space<private>>)
 // CHECK_LINALG_ALIGN: rock.threadwise_write_all {{.*}} %[[outBuf]] ->
 // to test reshape is converted as transform and fused.
 

@@ -3,7 +3,7 @@
 // CHECK-DAG: #[[MAP2:.*]] = #rock.transform_map<{{.*}} by [<PassThrough ["dim0", "dim2", "dim3", "dim1"] at [0, 1, 2, 3] -> ["dim0", "dim2", "dim3", "dim1"] at [0, 2, 3, 1]>] bounds = [256, 28, 28, 64] -> [256, 64, 28, 28]>
 // CHECK-COUNT-2: rock.threadwise_read_into {{.*}}
 // CHECK: rock.threadwise_read_into {{.*}} -> [[lain:%.*]] :
-// CHECK: linalg.generic{{.*}} ins({{.*}}, [[lain]] :{{.*}}) outs(%[[outBuf:.*]] : memref<32xf32, #gpu.address_space<private>>)
+// CHECK: linalg.generic{{.*}} ins({{.*}}, [[lain]] :{{.*}}) outs(%[[outBuf:.*]] : memref<128xf32, #gpu.address_space<private>>)
 // CHECK: rock.threadwise_write_all {{.*}} %[[outBuf]] ->
 // to test transpose is converted as transform and fused.
 

@@ -3,7 +3,7 @@
 // CHECK-DAG: #[[MAP2:.*]] = #rock.transform_map<#map{{.*}} by [<PassThrough ["{{.*}}", "{{.*}}", "{{.*}}", "{{.*}}"] at [0, 1, 2, 3] -> ["{{.*}}", "{{.*}}", "{{.*}}", "{{.*}}"] at [0, 2, 3, 1]>] bounds = [256, 28, 28, 64] -> [256, 64, 28, 28]>
 // CHECK-COUNT-2: rock.threadwise_read_into {{.*}}
 // CHECK: rock.threadwise_read_into {{.*}} -> [[lain:%.*]] :
-// CHECK: linalg.generic{{.*}} ins({{.*}}, [[lain]] :{{.*}}) outs(%[[outBuf:.*]] : memref<32xf32, #gpu.address_space<private>>)
+// CHECK: linalg.generic{{.*}} ins({{.*}}, [[lain]] :{{.*}}) outs(%[[outBuf:.*]] : memref<128xf32, #gpu.address_space<private>>)
 // CHECK: rock.threadwise_write_all {{.*}} %[[outBuf]] ->
 // to test transpose is converted as transform and fused.
 

@@ -55,7 +55,7 @@
 
 // ZEROINIT_KERNELCOUNT: Kernel count=1
 // ZEROINIT_BIN: ELF
-// ZEROINIT_TUNING: globalSize=100352, localSize=128
+// ZEROINIT_TUNING: globalSize=100352, localSize=64
 // ZEROINIT_DRIVER: %arg1: memref<{{.*}}xf16> {rock.prefill = 0.000000e+00 : f16}
 // ZEROINIT_DRIVER-COUNT-3: rock.transform %{{.+}} by
 // ZEROINIT_DRIVER-NEXT: rock.conv_bwd_data(%{{.+}}, %{{.+}}, %{{.+}}) features = dot {dilations = [1 : index, 1 : index], filter_layout = ["g", "k", "c", "0", "1"], input_layout = ["ni", "gi", "ci", "0i", "1i"], kernelId = 0 : index, output_layout = ["no", "go", "ko", "0o", "1o"], padding = [0 : index, 0 : index, 0 : index, 0 : index], strides = [2 : index, 2 : index], usesV4R1 = true} : memref<1x2048x1024x1x1xf16>, memref<256x1x1024x14x14xf16>, memref<256x1x2048x7x7xf16>
@@ -25,8 +25,8 @@ TEST(FindFallbackTest, OldestRelative) {
 }
 
 TEST(FindFallbackTest, YoungestRelative) {
-  // gfx1200 is the youngest available relative for gfx1900
-  EXPECT_EQ("gfx1200_conv_f16", ParamLookupTable<InitParamsAccel>::findFallback(
+  // gfx1900 should fall back to gfx1201, its youngest available relative
+  EXPECT_EQ("gfx1201_conv_f16", ParamLookupTable<InitParamsAccel>::findFallback(
                                     "gfx1900_conv_f16"));
 }
 
@@ -63,6 +63,6 @@ TEST(FindFallbackTest, NoRelativesBySuffix) {
 TEST(FindFallbackTest, AnyGfxForNonAccel) {
   // Any gfx version is acceptable for non-accelerated operations
   EXPECT_EQ(
-      "gfx1200_gemm_f32",
+      "gfx1201_gemm_f32",
       ParamLookupTable<InitParamsNonAccel>::findFallback("gfx942_gemm_f32"));
 }