@@ -222,6 +222,7 @@ enum vk_device_architecture {
AMD_RDNA2,
AMD_RDNA3,
INTEL_XE2,
+ NVIDIA_PRE_TURING,
};

// HSK x HSV
@@ -315,10 +316,33 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
// https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
return vk_device_architecture::INTEL_XE2;
}
+ } else if (props.vendorID == VK_VENDOR_ID_NVIDIA) {
+ const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
+
+ bool cooperative_matrix = false;
+
+ // Detect "pre-turing" based on lack of coopmat support.
+ for (const auto& properties : ext_props) {
+ if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) {
+ cooperative_matrix = true;
+ break;
+ }
+ }
+
+ if (!cooperative_matrix) {
+ return vk_device_architecture::NVIDIA_PRE_TURING;
+ }
}
return vk_device_architecture::OTHER;
}

+ enum vk_conv_shapes {
+ CONV_SHAPE_128x128,
+ CONV_SHAPE_64x32,
+ CONV_SHAPE_32x256,
+ CONV_SHAPE_COUNT,
+ };
+
struct vk_device_struct {
std::recursive_mutex mutex;
@@ -483,8 +507,8 @@ struct vk_device_struct {
vk_pipeline pipeline_rwkv_wkv6_f32;
vk_pipeline pipeline_rwkv_wkv7_f32;
vk_pipeline pipeline_opt_step_adamw_f32;
- vk_pipeline pipeline_conv2d_f32;
- vk_pipeline pipeline_conv2d_f16_f32;
+ vk_pipeline pipeline_conv2d_f32[CONV_SHAPE_COUNT];
+ vk_pipeline pipeline_conv2d_f16_f32[CONV_SHAPE_COUNT];
vk_pipeline pipeline_conv2d_dw_whcn_f32;
vk_pipeline pipeline_conv2d_dw_cwhn_f32;
@@ -908,8 +932,22 @@ struct vk_op_conv2d_push_constants {
uint32_t nb1;
uint32_t nb2;
uint32_t nb3;
+
+ // init_fastdiv_values constants for dividing by KW, KW*KH, OW, OW*OH
+ uint32_t KWmp; uint32_t KWL;
+ uint32_t KWKHmp; uint32_t KWKHL;
+ uint32_t OWmp; uint32_t OWL;
+ uint32_t OWOHmp; uint32_t OWOHL;
};

+ template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) {
+ // Compute magic values to divide by KW, KW*KH, OW, OW*OH
+ init_fastdiv_values(p.KW, p.KWmp, p.KWL);
+ init_fastdiv_values(p.KW*p.KH, p.KWKHmp, p.KWKHL);
+ init_fastdiv_values(p.OW, p.OWmp, p.OWL);
+ init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL);
+ }
+
struct vk_op_conv2d_dw_push_constants {
uint32_t ne;
uint32_t batches;
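The KWmp/KWL-style pairs added above exist so the conv2d shader can replace integer division by KW, KW*KH, OW and OW*OH with a multiply-high and a shift. Below is a minimal host-side sketch of the usual round-up multiply-shift scheme these fields feed; the helper names, the 64-bit intermediate, and the main() driver are mine, and the real init_fastdiv_values in ggml-vulkan.cpp may differ in detail.

#include <cassert>
#include <cstdint>

// Assumed formulation: for a divisor d > 0, pick L = ceil(log2(d)) and a 32-bit
// magic multiplier mp such that  x / d == (((uint64_t) x * mp >> 32) + x) >> L
// holds for every 32-bit x ("division by invariant integers" trick).
static void init_fastdiv_values_sketch(uint32_t d, uint32_t &mp, uint32_t &L) {
    L = 0;
    while (L < 32 && (uint32_t{1} << L) < d) {
        L++;
    }
    mp = (uint32_t) (((uint64_t{1} << 32) * ((uint64_t{1} << L) - d)) / d + 1);
}

static uint32_t fastdiv_sketch(uint32_t x, uint32_t mp, uint32_t L) {
    // This is the expression a shader would evaluate per thread instead of x / d.
    return (uint32_t) ((((uint64_t) x * mp >> 32) + x) >> L);
}

int main() {
    uint32_t mp, L;
    init_fastdiv_values_sketch(7, mp, L); // e.g. a 7-element KW*KH
    for (uint32_t x = 0; x < 100000; ++x) {
        assert(fastdiv_sketch(x, mp, L) == x / 7); // matches true division
    }
    return 0;
}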
@@ -3048,48 +3086,89 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);

// conv2d
- uint32_t conv2d_WG_SIZE = 256;
- uint32_t conv2d_BS_K = 128;
- uint32_t conv2d_BS_CRS = 16;
- uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
- if (device->subgroup_shuffle &&
- device->vendor_id != VK_VENDOR_ID_INTEL) { // Do not enable collectives on Intel, see PR 14316
- use_collectives = 1;
- conv2d_BS_CRS = std::min(
- device->subgroup_size,
- conv2d_BS_CRS); // CRS block size should be capped at sugroup size for correctness when shuffle is used.
- }
- uint32_t conv2d_BS_NPQ = 128;
- uint32_t conv2d_TS_K = 8;
- uint32_t conv2d_shmem_req =
- (conv2d_BS_K * (conv2d_BS_CRS + 1) + conv2d_BS_CRS * (conv2d_BS_NPQ + 1)) * sizeof(float);
- if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
- conv2d_BS_CRS = 8;
- if (use_collectives) {
- conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
- }
- }
-
- if (use_collectives) {
- ggml_vk_create_pipeline(
- device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
- sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
- { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
- ggml_vk_create_pipeline(
- device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
- sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
- { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
- } else {
- ggml_vk_create_pipeline(
- device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
- sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
- { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
- false);
- ggml_vk_create_pipeline(
- device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
- sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
- { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
- false);
+ for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
+ uint32_t conv2d_WG_SIZE = 256;
+ uint32_t conv2d_BS_K = 128;
+ uint32_t conv2d_BS_CRS = 16;
+ uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
+ uint32_t conv2d_BS_NPQ = 128;
+ uint32_t conv2d_TS_K = 8;
+ uint32_t conv2d_SHMEM_PAD = 4;
+ bool conv2d_UNROLL = true;
+
+ if (device->vendor_id == VK_VENDOR_ID_INTEL) {
+ conv2d_SHMEM_PAD = 0;
+ conv2d_UNROLL = false;
+ } else if (device->vendor_id == VK_VENDOR_ID_AMD) {
+ conv2d_SHMEM_PAD = device->architecture == vk_device_architecture::AMD_GCN ? 1 : 4;
+ }
+
+ switch (s) {
+ default:
+ case CONV_SHAPE_128x128:
+ conv2d_BS_K = 128;
+ conv2d_BS_NPQ = 128;
+ conv2d_BS_CRS = 16;
+ if (device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != vk_device_architecture::AMD_GCN) {
+ conv2d_UNROLL = false;
+ }
+ break;
+ case CONV_SHAPE_64x32:
+ conv2d_BS_K = 64;
+ conv2d_BS_NPQ = 32;
+ conv2d_BS_CRS = 32;
+ conv2d_TS_K = 4;
+ break;
+ case CONV_SHAPE_32x256:
+ conv2d_BS_K = 32;
+ conv2d_BS_NPQ = 256;
+ conv2d_BS_CRS = 16;
+ break;
+ }
+
+ // Use collectives on pre-Turing NVIDIA GPUs and GCN AMD cards, which had slower integer math.
+ bool allow_collectives_nv = device->vendor_id != VK_VENDOR_ID_NVIDIA ||
+ device->architecture == vk_device_architecture::NVIDIA_PRE_TURING;
+ bool allow_collectives_amd = device->vendor_id != VK_VENDOR_ID_AMD ||
+ device->architecture == vk_device_architecture::AMD_GCN;
+
+ if (device->subgroup_shuffle &&
+ device->vendor_id != VK_VENDOR_ID_INTEL && // Do not enable collectives on Intel, see PR 14316.
+ allow_collectives_nv &&
+ allow_collectives_amd) {
+ use_collectives = 1;
+ conv2d_BS_CRS = std::min(
+ device->subgroup_size,
+ conv2d_BS_CRS); // CRS block size should be capped at subgroup size for correctness when shuffle is used.
+ }
+
+ uint32_t conv2d_shmem_req =
+ (conv2d_BS_K * (conv2d_BS_CRS + conv2d_SHMEM_PAD) + conv2d_BS_CRS * (conv2d_BS_NPQ + conv2d_SHMEM_PAD)) * sizeof(float);
+ if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
+ conv2d_BS_CRS = 8;
+ if (use_collectives) {
+ conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
+ }
+ }
+
+ std::array<uint32_t, 3> wg_denoms = { conv2d_BS_K, conv2d_BS_NPQ, 1 };
+ std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };
+
+ if (conv2d_UNROLL) {
+ ggml_vk_create_pipeline(
+ device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_unroll_len, conv2d_f32_unroll_data, "main", 3,
+ sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
+ ggml_vk_create_pipeline(
+ device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_unroll_len, conv2d_f16_f32_unroll_data, "main", 3,
+ sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
+ } else {
+ ggml_vk_create_pipeline(
+ device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
+ sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
+ ggml_vk_create_pipeline(
+ device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
+ sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
+ }
}

ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
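As a sanity check on the conv2d_shmem_req branch above, here is the same arithmetic as a standalone sketch. The helper name and the printed figures are mine; the values assume the default tile sizes from the switch with SHMEM_PAD = 4 and no collectives cap applied, so the numbers for a specific device may differ.

#include <cstdint>
#include <cstdio>

// Same formula as the conv2d_shmem_req computation above: two padded tiles
// (BS_K x BS_CRS of the kernel and BS_CRS x BS_NPQ of the input) held in
// shared memory as floats.
static constexpr uint32_t conv2d_shmem_bytes(uint32_t bs_k, uint32_t bs_crs,
                                             uint32_t bs_npq, uint32_t pad) {
    return (uint32_t) ((bs_k * (bs_crs + pad) + bs_crs * (bs_npq + pad)) * sizeof(float));
}

int main() {
    printf("128x128: %u bytes\n", (unsigned) conv2d_shmem_bytes(128, 16, 128, 4)); // 18688
    printf("64x32:   %u bytes\n", (unsigned) conv2d_shmem_bytes(64, 32, 32, 4));   // 13824
    printf("32x256:  %u bytes\n", (unsigned) conv2d_shmem_bytes(32, 16, 256, 4));  // 19200
    // All three fit comfortably in a 32 KiB maxComputeSharedMemorySize, so the
    // BS_CRS = 8 fallback only triggers on devices with much smaller limits.
    return 0;
}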
@@ -6641,6 +6720,34 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
}
}

+ static std::array<uint32_t, 3> ggml_vk_get_conv_elements(const ggml_tensor *dst) {
+ const ggml_tensor *src0 = dst->src[0];
+ const ggml_tensor *src1 = dst->src[1];
+
+ // src0 - kernel: [KW, KH, Cin, Cout]
+ // src1 - input: [W, H, Cin, N]
+ // dst - result: [OW, OH, Cout, N]
+
+ // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
+ auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
+ return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
+ };
+ // parallelize in {OW/BS_K, OH/BS_NPQ, 1}
+ int64_t W = src1->ne[0];
+ int64_t H = src1->ne[1];
+ int64_t KW = src0->ne[0];
+ int64_t KH = src0->ne[1];
+ int64_t Cout = src0->ne[3];
+ int64_t N = src1->ne[3];
+ int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
+ int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
+ int64_t NPQ = N * OW * OH;
+
+ // Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
+ std::array<uint32_t, 3> elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
+ return elements;
+ }
+
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
switch (op) {
case GGML_OP_GET_ROWS:
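ggml_vk_get_conv_elements reduces the convolution to a Cout x (N*OW*OH) output matrix, with OW and OH given by the standard convolution output-size formula. A small self-contained check with made-up layer shapes (224x224 input, 3x3 kernel, stride 2, padding 1, dilation 1, Cout = 64, N = 1) is below; conv_out_size simply mirrors the lambda in the hunk above, and the chosen shapes are illustrative, not from the PR.

#include <cassert>
#include <cstdint>

// Same formula as calc_conv_output_size above.
static int64_t conv_out_size(int64_t ins, int64_t ks, int s, int p, int d) {
    return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
}

int main() {
    // Hypothetical layer: 224x224 input, 3x3 kernel, stride 2, padding 1, dilation 1.
    const int64_t OW = conv_out_size(224, 3, 2, 1, 1); // (224 + 2 - 2 - 1)/2 + 1 = 112
    const int64_t OH = conv_out_size(224, 3, 2, 1, 1);
    assert(OW == 112 && OH == 112);

    // With Cout = 64 and N = 1, the dispatch covers a 64 x 12544 output matrix,
    // i.e. elements = { 64, 12544, 1 }; wg_denoms then decide the workgroup count.
    const int64_t NPQ = 1 * OW * OH;
    assert(NPQ == 12544);
    return 0;
}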
@@ -6970,10 +7077,30 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
case GGML_OP_CONV_2D:
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
+ auto elements = ggml_vk_get_conv_elements(dst);
+ vk_conv_shapes shape;
+
+ uint32_t tiles[CONV_SHAPE_COUNT];
+ for (uint32_t i = 0; i < CONV_SHAPE_COUNT; ++i) {
+ tiles[i] = CEIL_DIV(elements[0], ctx->device->pipeline_conv2d_f32[i]->wg_denoms[0]) * CEIL_DIV(elements[1], ctx->device->pipeline_conv2d_f32[i]->wg_denoms[1]);
+ }
+
+ // We can't query number of shader cores on Intel, use 32 as a placeholder
+ // so small convolutions will still choose a smaller tile.
+ const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32;
+
+ if (elements[0] > 64 && tiles[CONV_SHAPE_128x128] >= shader_core_count * 2) {
+ shape = CONV_SHAPE_128x128;
+ } else if (elements[0] <= 32 && tiles[CONV_SHAPE_32x256] >= shader_core_count * 2) {
+ shape = CONV_SHAPE_32x256;
+ } else {
+ shape = CONV_SHAPE_64x32;
+ }
+
if (src0->type == GGML_TYPE_F32) {
- return ctx->device->pipeline_conv2d_f32;
+ return ctx->device->pipeline_conv2d_f32[shape];
} else if (src0->type == GGML_TYPE_F16) {
- return ctx->device->pipeline_conv2d_f16_f32;
+ return ctx->device->pipeline_conv2d_f16_f32[shape];
}
}
return nullptr;
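The shape selection above counts how many BS_K x BS_NPQ tiles each candidate shape would produce and only picks the wide or narrow shapes when both the output-channel count and the tile count justify them. Here is a rerun of that decision with assumed numbers: the Cout = 64, NPQ = 12544 layer from the earlier sketch and a made-up shader_core_count of 40; ceil_div stands in for the backend's CEIL_DIV macro.

#include <cstdint>
#include <cstdio>

static uint32_t ceil_div(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

int main() {
    const uint32_t Cout = 64, NPQ = 12544, shader_core_count = 40;

    // Tile counts per shape, using the wg_denoms {BS_K, BS_NPQ} from the switch above.
    const uint32_t tiles_128x128 = ceil_div(Cout, 128) * ceil_div(NPQ, 128); // 1 * 98  = 98
    const uint32_t tiles_64x32   = ceil_div(Cout, 64)  * ceil_div(NPQ, 32);  // 1 * 392 = 392
    const uint32_t tiles_32x256  = ceil_div(Cout, 32)  * ceil_div(NPQ, 256); // 2 * 49  = 98

    // 98 tiles would be enough to cover 40 cores twice, but Cout is neither > 64
    // nor <= 32, so both special shapes are skipped and CONV_SHAPE_64x32 wins,
    // giving 392 workgroups to spread across the device.
    printf("tiles: %u %u %u -> CONV_SHAPE_64x32\n",
           (unsigned) tiles_128x128, (unsigned) tiles_64x32, (unsigned) tiles_32x256);
    return 0;
}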
@@ -7301,29 +7428,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
} break;
case GGML_OP_CONV_2D:
{
- // src0 - kernel: [KW, KH, Cin, Cout]
- // src1 - input: [W, H, Cin, N]
- // dst - result: [OW, OH, Cout, N]
-
- // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
- auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
- return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
- };
- // parallelize in {OW/BS_K, OH/BS_NPQ, 1}
- int64_t W = src1->ne[0];
- int64_t H = src1->ne[1];
- int64_t KW = src0->ne[0];
- int64_t KH = src0->ne[1];
- int64_t Cout = src0->ne[3];
- int64_t N = src1->ne[3];
- int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
- int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
- int64_t NPQ = N * OW * OH;
-
- // Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
- elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
- }
- break;
+ elements = ggml_vk_get_conv_elements(dst);
+ } break;
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_DIV: