
Commit 2a20a7e

cuda : fix mask dim 2/3 (wip)

ggml-ci

1 parent 6036177

8 files changed, +46 -29 lines changed


ggml/src/ggml-cuda/fattn-common.cuh
Lines changed: 4 additions & 4 deletions

@@ -33,8 +33,10 @@ typedef void (* fattn_kernel_t)(
        const int ne13,
        const int ne31,
        const int ne32,
+        const int ne33,
        const int nb31,
        const int nb32,
+        const int nb33,
        const int nb01,
        const int nb02,
        const int nb03,
@@ -705,8 +707,6 @@ void launch_fattn(

    GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");

-    GGML_ASSERT(Q->ne[3] == 1);
-
    ggml_cuda_pool & pool = ctx.pool();
    cudaStream_t main_stream = ctx.stream();
    const int id = ggml_cuda_get_device();
@@ -853,8 +853,8 @@ void launch_fattn(
        scale, max_bias, m0, m1, n_head_log2, logit_softcap,
        Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
        K->ne[0], K->ne[1], K->ne[2], K->ne[3],
-        mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0,
-        mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0,
+        mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0, mask ? mask->ne[3] : 0,
+        mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0, mask ? mask->nb[3] : 0,
        Q->nb[1], Q->nb[2], Q->nb[3],
        nb11, nb12, nb13,
        nb21, nb22, nb23,
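
Forwarding the mask's dim-3 extent and stride (ne33/nb33) through launch_fattn lets every kernel broadcast the mask along Q's dims 2 and 3 instead of assuming a single sequence. As a minimal sketch of that addressing rule (a hypothetical standalone helper, not code from this commit), the byte offset into the mask is each index modulo the mask's extent, times the corresponding stride:

#include <cstddef>

// Hypothetical helper mirroring the kernels' mask addressing after this change:
// a mask dimension of size 1 broadcasts, so the modulo collapses its index to 0.
static size_t mask_byte_offset(int i02, int i03, int ic0,
                               int ne32, int ne33,
                               size_t nb31, size_t nb32, size_t nb33) {
    return nb33*(size_t)(i03 % ne33)   // batch (dim 3), wraps if broadcast
         + nb32*(size_t)(i02 % ne32)   // head  (dim 2), wraps if broadcast
         + nb31*(size_t)ic0;           // first Q column handled by this block
}

With ne32 == 1 and ne33 == 1 (one mask shared by all heads and sequences) both modulo terms vanish and only nb31*ic0 remains, matching the previous single-sequence behaviour.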

ggml/src/ggml-cuda/fattn-mma-f16.cuh
Lines changed: 2 additions & 0 deletions

@@ -1224,8 +1224,10 @@ static __global__ void flash_attn_ext_f16(
        const int ne13,
        const int ne31,
        const int ne32,
+        const int ne33,
        const int nb31,
        const int nb32,
+        const int nb33,
        const int nb01,
        const int nb02,
        const int nb03,

ggml/src/ggml-cuda/fattn-tile-f16.cu
Lines changed: 8 additions & 4 deletions

@@ -31,8 +31,10 @@ static __global__ void flash_attn_tile_ext_f16(
        const int ne13,
        const int ne31,
        const int ne32,
+        const int ne33,
        const int nb31,
        const int nb32,
+        const int nb33,
        const int nb01,
        const int nb02,
        const int nb03,
@@ -61,12 +63,14 @@ static __global__ void flash_attn_tile_ext_f16(
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

    const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
+    const int i02 = blockIdx.z % ne02;
+    const int i03 = blockIdx.z / ne02;

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.z + nb01*ic0);
-    const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.z / gqa_ratio));
-    const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
-    const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
+    const float2 * Q_f2 = (const float2 *) (Q + nb03*i03 + nb02*i02 + nb01*ic0);
+    const half2 * K_h2 = (const half2 *) (K + nb13*i03 + nb12*(i02 / gqa_ratio));
+    const half2 * V_h2 = (const half2 *) (V + nb23*i03 + nb22*(i02 / gqa_ratio)); // K and V have same shape
+    const half * maskh = (const half *) (mask + nb33*(i03 % ne33) + nb32*(i02 % ne32) + nb31*ic0);

    const int stride_KV2 = nb11 / sizeof(half2);
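
In all of the updated kernels the flattened gridDim.z is now split into a head index i02 and a batch index i03 (this assumes, as the indexing above suggests, that the launcher packs Q->ne[2]*Q->ne[3] blocks into the z dimension). K and V remain shared across gqa_ratio query heads, and the mask indices wrap for broadcasting. A minimal sketch of that index arithmetic, with made-up struct and function names for illustration:

// Sketch of the shared index decomposition; i02/i03/gqa_ratio follow the kernel
// code, fattn_indices/split_block_z are hypothetical names.
struct fattn_indices {
    int i02; // Q head index  (dim 2)
    int i03; // Q batch index (dim 3)
    int ik2; // K/V head index under grouped-query attention
};

static fattn_indices split_block_z(int block_z, int ne02, int ne12) {
    const int gqa_ratio = ne02 / ne12;  // > 1 Q heads per K/V head with GQA
    fattn_indices idx;
    idx.i02 = block_z % ne02;           // fastest-varying: head
    idx.i03 = block_z / ne02;           // slowest-varying: batch
    idx.ik2 = idx.i02 / gqa_ratio;      // which K/V head this Q head maps to
    return idx;
}

fattn-tile-f32.cu, the two fattn-vec kernels, and fattn-wmma-f16.cu below apply the same pattern; only the pointer types and variable names differ.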

ggml/src/ggml-cuda/fattn-tile-f32.cu
Lines changed: 8 additions & 4 deletions

@@ -31,8 +31,10 @@ static __global__ void flash_attn_tile_ext_f32(
        const int ne13,
        const int ne31,
        const int ne32,
+        const int ne33,
        const int nb31,
        const int nb32,
+        const int nb33,
        const int nb01,
        const int nb02,
        const int nb03,
@@ -73,12 +75,14 @@ static __global__ void flash_attn_tile_ext_f32(
    // In this kernel Q, K, V are matrices while i, j, k are matrix indices.

    const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
+    const int i02 = blockIdx.z % ne02;
+    const int i03 = blockIdx.z / ne02;

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.z + nb01*ic0);
-    const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.z / gqa_ratio));
-    const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
-    const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
+    const float2 * Q_f2 = (const float2 *) (Q + nb03*i03 + nb02*i02 + nb01*ic0);
+    const half2 * K_h2 = (const half2 *) (K + nb13*i03 + nb12*(i02 / gqa_ratio));
+    const half2 * V_h2 = (const half2 *) (V + nb23*i03 + nb22*(i02 / gqa_ratio)); // K and V have same shape
+    const half * maskh = (const half *) (mask + nb33*(i03 % ne33) + nb32*(i02 % ne32) + nb31*ic0);

    const int stride_KV2 = nb11 / sizeof(half2);

ggml/src/ggml-cuda/fattn-vec-f16.cuh
Lines changed: 8 additions & 4 deletions

@@ -28,8 +28,10 @@ static __global__ void flash_attn_vec_ext_f16(
        const int ne13,
        const int ne31,
        const int ne32,
+        const int ne33,
        const int nb31,
        const int nb32,
+        const int nb33,
        const int nb01,
        const int nb02,
        const int nb03,
@@ -64,13 +66,15 @@ static __global__ void flash_attn_vec_ext_f16(
    constexpr dequantize_1_f16_t dequantize_1_v = get_dequantize_1_f16(type_V);

    const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
+    const int i02 = blockIdx.z % ne02;
+    const int i03 = blockIdx.z / ne02;

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    Q += nb02* blockIdx.z + nb01*ic0;
-    K += nb12*(blockIdx.z / gqa_ratio);
-    V += nb22*(blockIdx.z / gqa_ratio);
+    Q += nb03*i03 + nb02*i02 + nb01*ic0;
+    K += nb13*i03 + nb12*(i02 / gqa_ratio);
+    V += nb23*i03 + nb22*(i02 / gqa_ratio);

-    const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
+    const half * maskh = (const half *) (mask + nb33*(i03 % ne33) + nb32*(i02 % ne32) + nb31*ic0);

    const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
    const half slopeh = __float2half(slopef);

ggml/src/ggml-cuda/fattn-vec-f32.cuh
Lines changed: 8 additions & 4 deletions

@@ -28,8 +28,10 @@ static __global__ void flash_attn_vec_ext_f32(
        const int ne13,
        const int ne31,
        const int ne32,
+        const int ne33,
        const int nb31,
        const int nb32,
+        const int nb33,
        const int nb01,
        const int nb02,
        const int nb03,
@@ -76,13 +78,15 @@ static __global__ void flash_attn_vec_ext_f32(
    constexpr dequantize_1_f32_t dequantize_1_v = get_dequantize_1_f32(type_V);

    const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
+    const int i02 = blockIdx.z % ne02;
+    const int i03 = blockIdx.z / ne02;

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    Q += nb02* blockIdx.z + nb01*ic0;
-    K += nb12*(blockIdx.z / gqa_ratio);
-    V += nb22*(blockIdx.z / gqa_ratio); // K and V have same shape
+    Q += nb03*i03 + nb02*i02 + nb01*ic0;
+    K += nb13*i03 + nb12*(i02 / gqa_ratio);
+    V += nb23*i03 + nb22*(i02 / gqa_ratio);

-    const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
+    const half * maskh = (const half *) (mask + nb33*(i03 % ne33) + nb32*(i02 % ne32) + nb31*ic0);

    const float slope = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);

ggml/src/ggml-cuda/fattn-wmma-f16.cu
Lines changed: 8 additions & 4 deletions

@@ -47,8 +47,10 @@ static __global__ void flash_attn_ext_f16(
        const int ne13,
        const int ne31,
        const int ne32,
+        const int ne33,
        const int nb31,
        const int nb32,
+        const int nb33,
        const int nb01,
        const int nb02,
        const int nb03,
@@ -74,6 +76,8 @@ static __global__ void flash_attn_ext_f16(
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    const int ic0 = ncols*blockIdx.x; // Index of the first Q/QKV column to work on.
+    const int i02 = blockIdx.z % ne02;
+    const int i03 = blockIdx.z / ne02;

    static_assert(D <= FATTN_KQ_STRIDE, "D must be <= FATTN_KQ_STRIDE.");
    static_assert(ncols == 8 || ncols % 16 == 0, "ncols must be 8 or a multiple of 16.");
@@ -96,10 +100,10 @@ static __global__ void flash_attn_ext_f16(
    constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half);

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float * Q_f = (const float *) (Q + nb02* blockIdx.z + nb01*ic0);
-    const half * K_h = (const half *) (K + nb12*(blockIdx.z / gqa_ratio));
-    const half * V_h = (const half *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
-    const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
+    const float * Q_f = (const float *) (Q + nb03*i03 + nb02*i02 + nb01*ic0);
+    const half * K_h = (const half *) (K + nb13*i03 + nb12*(i02 / gqa_ratio));
+    const half * V_h = (const half *) (V + nb23*i03 + nb22*(i02 / gqa_ratio)); // K and V have same shape
+    const half * maskh = (const half *) (mask + nb33*(i03 % ne33) + nb32*(i02 % ne32) + nb31*ic0);
    const half2 * mask2 = (const half2 *) maskh;

    const int stride_Q = nb01 / sizeof(float);

ggml/src/ggml-cuda/ggml-cuda.cu
Lines changed: 0 additions & 5 deletions

@@ -3376,11 +3376,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            if (op->src[0]->ne[0] == 192) {
                return false;
            }
-            // TODO: support broadcast
-            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
-            if (op->src[0]->ne[3] != 1) {
-                return false;
-            }
            if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) {
                return false;
            }
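
With the kernels now handling dim-3 offsets and mask broadcasting, the backend no longer rejects FLASH_ATTN_EXT ops whose Q has ne[3] > 1 (the broadcast TODO that referenced llama.cpp PR #14435). As a rough CPU-side reference of the semantics this enables (hypothetical layout and function, for illustration only; not how the CUDA kernels store data), the mask may have smaller extents than Q along dims 2/3, in which case its index wraps around:

#include <cstddef>
#include <vector>

// scores holds one slice of slice_size values per (batch i03, head i02);
// mask holds ne33*ne32 slices, where ne32/ne33 are either 1 (broadcast)
// or equal to ne02/ne03.
static void apply_mask(std::vector<float> & scores, const std::vector<float> & mask,
                       int ne02, int ne03, int ne32, int ne33, size_t slice_size) {
    for (int i03 = 0; i03 < ne03; ++i03) {
        for (int i02 = 0; i02 < ne02; ++i02) {
            const size_t m_base = ((size_t)(i03 % ne33)*ne32 + (i02 % ne32))*slice_size;
            const size_t s_base = ((size_t)i03*ne02 + i02)*slice_size;
            for (size_t r = 0; r < slice_size; ++r) {
                scores[s_base + r] += mask[m_base + r]; // additive attention mask
            }
        }
    }
}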
