|
52 | 52 | #define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS) |
53 | 53 |
|
54 | 54 | // AMD |
55 | | -// GCN/CNDA, wave size is 64 |
| 55 | +// GCN/CDNA, wave size is 64 |
56 | 56 | #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16 |
57 | 57 | #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue |
58 | 58 | #define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a |
59 | 59 | #define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers |
60 | 60 | #define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing |
61 | 61 | #define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300 |
62 | 62 |
|
63 | | -// RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32 |
| 63 | +// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32 |
64 | 64 | #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000 |
65 | 65 | #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a |
66 | 66 | #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA |
| 67 | +#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000 |
67 | 68 |
|
68 | 69 | #define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD) |
69 | 70 | #define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1) |
70 | 71 | #define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2) |
71 | 72 | #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3) |
72 | | -#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3) |
| 73 | +#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4) |
| 74 | +#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4) |
73 | 75 | #define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA) |
74 | 76 | #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1) |
75 | 77 |
|
@@ -209,9 +211,9 @@ typedef float2 dfloat2; |
209 | 211 | #define FP16_MMA_AVAILABLE |
210 | 212 | #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA |
211 | 213 |
|
212 | | -#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3)) |
| 214 | +#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4)) |
213 | 215 | #define FP16_MMA_AVAILABLE |
214 | | -#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3)) |
| 216 | +#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4)) |
215 | 217 |
|
216 | 218 | #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING |
217 | 219 | #define NEW_MMA_AVAILABLE |
@@ -244,14 +246,14 @@ static bool fp16_mma_available(const int cc) { |
244 | 246 | return false; |
245 | 247 | #else |
246 | 248 | return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) || |
247 | | - GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc); |
| 249 | + GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc); |
248 | 250 | #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) |
249 | 251 | } |
250 | 252 |
|
251 | 253 | // To be used for feature selection of external libraries, e.g. cuBLAS. |
252 | 254 | static bool fp16_mma_hardware_available(const int cc) { |
253 | 255 | return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || |
254 | | - GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc); |
| 256 | + GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc); |
255 | 257 | } |
256 | 258 |
|
257 | 259 | // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later. |
@@ -409,7 +411,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i |
409 | 411 | #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) |
410 | 412 | #if defined(CDNA) || defined(RDNA2) || defined(__gfx906__) |
411 | 413 | c = __builtin_amdgcn_sdot4(a, b, c, false); |
412 | | -#elif defined(RDNA3) |
| 414 | +#elif defined(RDNA3) || defined(RDNA4) |
413 | 415 | c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); |
414 | 416 | #elif defined(RDNA1) || defined(__gfx900__) |
415 | 417 | int tmp1; |
|
0 commit comments