
Commit baac713

Merge pull request #45 from menloresearch/update-dev-from-master-2025-04-07-00-08
Sync master with upstream release b5061
2 parents: 8e4f8cd + 916c83b

File tree (6 files changed, +46 −11 lines):

ggml/src/ggml-cuda/cpy.cu
ggml/src/ggml-cuda/fattn-common.cuh
ggml/src/ggml-cuda/fattn-tile-f32.cu
ggml/src/ggml-cuda/fattn-vec-f32.cuh
ggml/src/ggml-vulkan/ggml-vulkan.cpp
ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp

ggml/src/ggml-cuda/cpy.cu

Lines changed: 3 additions & 0 deletions
@@ -360,6 +360,9 @@ void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_des
     // copy destination pointers to GPU
     CUDA_CHECK(cudaMemcpyAsync(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice, stream));
     cuda_graph->graph_cpynode_index = 0; // reset index
+#else
+    GGML_UNUSED(cuda_graph); GGML_UNUSED(host_dest_ptrs);
+    GGML_UNUSED(host_dest_ptrs_size); GGML_UNUSED(stream);
 #endif
 }
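The new #else branch only exists to keep the build warning-free: when the CUDA-graphs path is compiled out, none of the function's parameters are referenced, so each one is cast to void through GGML_UNUSED. A minimal sketch of the pattern follows (plain C++, not the ggml sources; the macro expansion, function name and feature flag are assumptions for illustration). The same treatment is applied to the flash-attention kernels later in this commit.

// Sketch only: silence -Wunused-parameter in a configuration where the
// feature path is compiled out. GGML_UNUSED is assumed to expand to a
// (void) cast; upload_pointers and ENABLE_GRAPH_PATH are made-up names.
#define UNUSED_PARAM(x) (void)(x)

void upload_pointers(char ** ptrs, int n);   // stand-in for the real work

void copy_dest_ptrs(char ** host_dest_ptrs, int host_dest_ptrs_size) {
#ifdef ENABLE_GRAPH_PATH                     // hypothetical feature flag
    upload_pointers(host_dest_ptrs, host_dest_ptrs_size);
#else
    // Nothing references the parameters in this configuration,
    // so mark them as used to keep the compiler quiet.
    UNUSED_PARAM(host_dest_ptrs);
    UNUSED_PARAM(host_dest_ptrs_size);
#endif
}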

ggml/src/ggml-cuda/fattn-common.cuh

Lines changed: 5 additions & 5 deletions
@@ -62,7 +62,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
     T sum = 0.0f;
 
 #pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
+    for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
         const int k_KQ = k_KQ_0 + threadIdx.x;
 
         const int ib = k_KQ / QI8_1;
@@ -102,7 +102,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
     T sum = 0.0f;
 
 #pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
+    for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
         const int k_KQ = k_KQ_0 + threadIdx.x;
 
         const int ib = k_KQ / QI8_1;
@@ -146,7 +146,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
     T sum = 0.0f;
 
 #pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
+    for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
         const int k_KQ = k_KQ_0 + threadIdx.x;
 
         const int ib = k_KQ / QI8_1;
@@ -193,7 +193,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
     T sum = 0.0f;
 
 #pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
+    for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
         const int k_KQ = k_KQ_0 + threadIdx.x;
 
         const int ib = k_KQ / QI8_1;
@@ -244,7 +244,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
     T sum = 0.0f;
 
 #pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
+    for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
         const int k_KQ = k_KQ_0 + threadIdx.x;
 
         const int ib = k_KQ / QI8_0;
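All five hunks apply the same one-line fix: sizeof(int) has type size_t, so the loop bound D/sizeof(int) is unsigned and comparing it against the signed counter k_KQ_0 can trigger -Wsign-compare; wrapping the bound in int(...) keeps the comparison signed. A minimal sketch outside the ggml kernels (the template parameter and loop body are placeholders):

// Sketch only: "i < D/sizeof(int)" compares a signed int with an unsigned
// value; int(D/sizeof(int)) keeps the comparison signed and warning-free.
#include <cstdio>

template <int D>
void walk_ints() {
    for (int i = 0; i < int(D / sizeof(int)); ++i) {  // signed vs signed
        std::printf("%d ", i);
    }
    std::printf("\n");
}

int main() {
    walk_ints<128>();  // 128/4 = 32 iterations with a typical 4-byte int
}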

ggml/src/ggml-cuda/fattn-tile-f32.cu

Lines changed: 12 additions & 0 deletions
@@ -52,6 +52,18 @@ static __global__ void flash_attn_tile_ext_f32(
     return;
 #endif // FP16_MMA_AVAILABLE
     if (use_logit_softcap && !(D == 128 || D == 256)) {
+        GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+        GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+        GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+        GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+        GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+        GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+        GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+        GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+        GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+        GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+        GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+        GGML_UNUSED(ne2); GGML_UNUSED(ne3);
         NO_DEVICE_CODE;
         return;
     }

ggml/src/ggml-cuda/fattn-vec-f32.cuh

Lines changed: 15 additions & 3 deletions
@@ -45,6 +45,18 @@ static __global__ void flash_attn_vec_ext_f32(
 
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
+        GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+        GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+        GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+        GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+        GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+        GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+        GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+        GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+        GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+        GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+        GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+        GGML_UNUSED(ne2); GGML_UNUSED(ne3);
         NO_DEVICE_CODE;
         return;
     }
@@ -114,7 +126,7 @@ static __global__ void flash_attn_vec_ext_f32(
         // Set memory to zero if out of bounds:
         if (ncols > 2 && ic0 + j >= ne01) {
 #pragma unroll
-            for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
+            for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
                 const int i = i0 + threadIdx.x;
 
                 tmp_q_i32[i] = 0;
@@ -127,7 +139,7 @@ static __global__ void flash_attn_vec_ext_f32(
 
         const float * Q_f = (const float *) (Q + j*nb01);
 #pragma unroll
-        for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
+        for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
             quantize_q8_1_to_shared<float2>(Q_f + 4*i0, scale, tmp_q_i32, tmp_q_ds);
         }
     }
@@ -140,7 +152,7 @@ static __global__ void flash_attn_vec_ext_f32(
         float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));
 
 #pragma unroll
-        for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
+        for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
             const int i = i0 + threadIdx.x;
 
             Q_i32[j][i0/WARP_SIZE] = tmp_q_i32[i];

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 5 additions & 0 deletions
@@ -1833,6 +1833,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
         // can't use 256 for D==80.
         uint32_t wg_size = (small_rows && (D % 32) == 0) ? 256 : 128;
         auto rows_cols = fa_rows_cols(D, clamp, type, small_rows);
+        // mask dim1 is padded to 64, we rely on this to avoid clamping mask loads
+        GGML_ASSERT((GGML_KQ_MASK_PAD % rows_cols[0]) == 0);
         return {wg_size, rows_cols[0], rows_cols[1], (D), clamp};
     };
 
@@ -5511,6 +5513,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
         // the "aligned" shader variant will forcibly align strides, for performance
         (q_stride & 7) == 0 && (k_stride & 7) == 0 && (v_stride & 7) == 0;
 
+    // mask dim1 is padded to 64, we rely on this to avoid clamping mask loads
+    GGML_ASSERT((nem1 % GGML_KQ_MASK_PAD) == 0);
+
     vk_pipeline pipeline = pipelines[aligned];
     assert(pipeline);
 
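The two new asserts encode the assumption stated in the comment: the mask's dim1 (nem1) is padded to a multiple of GGML_KQ_MASK_PAD (64), and the row tile size divides that pad, so mask tiles can be loaded without clamping. A standalone sketch of the invariant follows (plain C++, not the ggml API; the pad constant, round_up helper and tile size are illustrative):

// Sketch only: a padded mask dimension satisfies both of the checks added
// in this file. KQ_MASK_PAD, round_up and rows_per_tile are assumed names.
#include <cassert>
#include <cstdint>

constexpr int64_t KQ_MASK_PAD = 64;   // mirrors the "padded to 64" comment

int64_t round_up(int64_t x, int64_t n) {
    return ((x + n - 1) / n) * n;
}

int main() {
    const int64_t n_tokens = 70;                           // arbitrary sequence length
    const int64_t nem1 = round_up(n_tokens, KQ_MASK_PAD);  // padded mask dim1 -> 128
    assert(nem1 % KQ_MASK_PAD == 0);                       // the ggml_vk_flash_attn check
    const int64_t rows_per_tile = 32;                      // a tile size dividing the pad
    assert(KQ_MASK_PAD % rows_per_tile == 0);              // the ggml_vk_load_shaders check
    return 0;
}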

ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp

Lines changed: 6 additions & 3 deletions
@@ -227,8 +227,11 @@ void main() {
 
     coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> L, M;
 
+    // Use -FLT_MAX/2 rather than -inf to reduce the possibility of NaNs, e.g. when computing Mold-M.
+    const float NEG_FLT_MAX_OVER_2 = uintBitsToFloat(0xFEFFFFFF);
+
     L = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
-    M = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(-1.0/0.0);
+    M = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(NEG_FLT_MAX_OVER_2);
 
     coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> slopeMat = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(1.0);
 
@@ -256,7 +259,7 @@ void main() {
         }
 
         if (p.mask != 0) {
-            tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutM = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
+            tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp);
             tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
             // When using grouped query attention, all rows use the same mask.
             if (p.gqa_ratio > 1) {
@@ -278,7 +281,7 @@ void main() {
         uint R = ((i + 1) * Br > N) ? (N % Br) : Br;
         uint C = ((j + 1) * Bc > KV) ? (KV % Bc) : Bc;
 
-        coopMatPerElementNV(S, S, replacePadding, ACC_TYPE(-1.0/0.0), R, C);
+        coopMatPerElementNV(S, S, replacePadding, ACC_TYPE(NEG_FLT_MAX_OVER_2), R, C);
     }
 
     coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> rowmax, P, rowsum, eM;
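The shader now seeds the running row maximum M, and the padding value fed to replacePadding, with -FLT_MAX/2 (bit pattern 0xFEFFFFFF) instead of -infinity: subtracting two infinite sentinels, as can happen in Mold-M for a row that never sees a finite score, yields NaN, whereas two finite sentinels subtract to 0. A plain C++ sketch of the arithmetic (variable names are illustrative, not taken from the shader):

// Sketch only: why a finite sentinel avoids NaN in a running-max update.
#include <cfloat>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    // Reinterpret 0xFEFFFFFF as a float; this is exactly -FLT_MAX/2.
    const uint32_t bits = 0xFEFFFFFFu;
    float neg_flt_max_over_2;
    std::memcpy(&neg_flt_max_over_2, &bits, sizeof(float));
    std::printf("sentinel = %g (== -FLT_MAX/2: %d)\n",
                neg_flt_max_over_2, neg_flt_max_over_2 == -FLT_MAX / 2);

    float m_old = -INFINITY, m = -INFINITY;
    std::printf("infinite sentinel: Mold - M = %f\n", m_old - m);  // NaN

    m_old = neg_flt_max_over_2; m = neg_flt_max_over_2;
    std::printf("finite sentinel:   Mold - M = %f\n", m_old - m);  // 0
}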
