@@ -222,6 +222,7 @@ enum vk_device_architecture {
222
222
AMD_RDNA2,
223
223
AMD_RDNA3,
224
224
INTEL_XE2,
225
+ NVIDIA_PRE_TURING,
225
226
};
226
227
227
228
// HSK x HSV
@@ -315,6 +316,22 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
315
316
// https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
316
317
return vk_device_architecture::INTEL_XE2;
317
318
}
319
+ } else if (props.vendorID == VK_VENDOR_ID_NVIDIA) {
320
+ const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
321
+
322
+ bool cooperative_matrix = false;
323
+
324
+ // Detect "pre-Turing" based on lack of cooperative matrix (VK_KHR_cooperative_matrix) support.
325
+ for (const auto& properties : ext_props) {
326
+ if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) {
327
+ cooperative_matrix = true;
328
+ break;
329
+ }
330
+ }
331
+
332
+ if (!cooperative_matrix) {
333
+ return vk_device_architecture::NVIDIA_PRE_TURING;
334
+ }
318
335
}
319
336
return vk_device_architecture::OTHER;
320
337
}
@@ -3098,9 +3115,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
3098
3115
break;
3099
3116
}
3100
3117
3118
+ // On NVIDIA, only use collectives on pre-Turing GPUs, which had slower integer math.
3119
+ bool allow_collectives_nv = device->vendor_id != VK_VENDOR_ID_NVIDIA ||
3120
+ device->architecture == vk_device_architecture::NVIDIA_PRE_TURING;
3121
+
3101
3122
if (device->subgroup_shuffle &&
3102
3123
device->vendor_id != VK_VENDOR_ID_INTEL && // Do not enable collectives on Intel, see PR 14316.
3103
- device->vendor_id != VK_VENDOR_ID_NVIDIA ) { // Collectives no faster on NVIDIA.
3124
+ allow_collectives_nv ) {
3104
3125
use_collectives = 1;
3105
3126
conv2d_BS_CRS = std::min(
3106
3127
device->subgroup_size,
0 commit comments