Commit ac3992d

Switch CogVLM CLIP to use merged QKV
1 parent 76091ee commit ac3992d

File tree: 5 files changed (+77, -49 lines)

convert_hf_to_gguf.py

Lines changed: 0 additions & 9 deletions
@@ -8319,15 +8319,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if not name.startswith("model.vision."):
             return []

-        if "query_key_value" in name:
-            # Split tensor into three along first axis
-            q, k, v = data_torch.split(data_torch.shape[0] // 3, dim=0)
-            return [
-                (self.map_tensor_name(name.replace("query_key_value", "query")), q),
-                (self.map_tensor_name(name.replace("query_key_value", "key")), k),
-                (self.map_tensor_name(name.replace("query_key_value", "value")), v),
-            ]
-
         return [(self.map_tensor_name(name), data_torch)]
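Note on the conversion change above: the converter previously split CogVLM's fused query_key_value weight into separate query/key/value tensors along the first axis; after this commit it passes the fused tensor straight through, and the name mapping (below) sends it to the new attn_qkv tensor. The short PyTorch sketch here only illustrates what the removed split did; the sizes are made up and the snippet is not part of the repository.

import torch

n_embd = 8
qkv = torch.randn(3 * n_embd, n_embd)           # fused CogVLM projection weight, rows = [Q; K; V]

# what the deleted converter code did: three equal chunks along the first axis
q, k, v = qkv.split(qkv.shape[0] // 3, dim=0)
assert q.shape == k.shape == v.shape == (n_embd, n_embd)

# after this commit the tensor is kept fused and written out once
# (as "v.blk.{bid}.attn_qkv.weight") instead of as three separate tensors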

gguf-py/gguf/constants.py

Lines changed: 3 additions & 0 deletions
@@ -576,6 +576,7 @@ class MODEL_TENSOR(IntEnum):
     V_ENC_EMBD_PATCH = auto()
     V_ENC_EMBD_POS = auto()
     V_ENC_INPUT_NORM = auto()
+    V_ENC_ATTN_QKV = auto()
     V_ENC_ATTN_Q = auto()
     V_ENC_ATTN_Q_NORM = auto()
     V_ENC_ATTN_K = auto()

@@ -919,6 +920,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd",
     MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd",
     MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd",
+    MODEL_TENSOR.V_ENC_ATTN_QKV: "v.blk.{bid}.attn_qkv",
     MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q",
     MODEL_TENSOR.V_ENC_ATTN_Q_NORM: "v.blk.{bid}.attn_q_norm",
     MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k",

@@ -994,6 +996,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_ENC_EMBD_PATCH,
     MODEL_TENSOR.V_ENC_EMBD_POS,
     MODEL_TENSOR.V_ENC_INPUT_NORM,
+    MODEL_TENSOR.V_ENC_ATTN_QKV,
     MODEL_TENSOR.V_ENC_ATTN_Q,
     MODEL_TENSOR.V_ENC_ATTN_Q_NORM,
     MODEL_TENSOR.V_ENC_ATTN_K,
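For context, the three constants.py hunks register the merged tensor in the places gguf-py needs it: the MODEL_TENSOR enum, the name table, and the list of vision-encoder tensors. A minimal, self-contained mock of that registration (not gguf-py's real classes, which are much larger):

from enum import IntEnum, auto

class MODEL_TENSOR(IntEnum):          # trimmed-down stand-in for gguf-py's enum
    V_ENC_ATTN_QKV = auto()           # member added by this commit
    V_ENC_ATTN_Q   = auto()

TENSOR_NAMES = {
    MODEL_TENSOR.V_ENC_ATTN_QKV: "v.blk.{bid}.attn_qkv",
    MODEL_TENSOR.V_ENC_ATTN_Q:   "v.blk.{bid}.attn_q",
}

VISION_TENSORS = [MODEL_TENSOR.V_ENC_ATTN_QKV, MODEL_TENSOR.V_ENC_ATTN_Q]

print(TENSOR_NAMES[MODEL_TENSOR.V_ENC_ATTN_QKV].format(bid=0))   # v.blk.0.attn_qkv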

gguf-py/gguf/tensor_mapping.py

Lines changed: 4 additions & 3 deletions
@@ -1163,6 +1163,10 @@ class TensorNameMap:
             "model.vision.patch_embedding.position_embedding",  # cogvlm
         ),

+        MODEL_TENSOR.V_ENC_ATTN_QKV: (
+            "model.vision.transformer.layers.{bid}.attention.query_key_value",  # cogvlm
+        ),
+
         MODEL_TENSOR.V_ENC_ATTN_Q: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
             "model.vision_tower.encoder.layer.{bid}.attention.q_proj",  # Intern-S1

@@ -1171,7 +1175,6 @@ class TensorNameMap:
             "vision_model.model.layers.{bid}.self_attn.q_proj",  # llama4
             "vision_tower.transformer.layers.{bid}.attention.q_proj",  # pixtral
             "visual.blocks.{bid}.attn.q",  # qwen2vl, generated
-            "model.vision.transformer.layers.{bid}.attention.query",  # cogvlm
         ),

         MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (

@@ -1187,7 +1190,6 @@ class TensorNameMap:
             "vision_model.model.layers.{bid}.self_attn.k_proj",  # llama4
             "vision_tower.transformer.layers.{bid}.attention.k_proj",  # pixtral
             "visual.blocks.{bid}.attn.k",  # qwen2vl, generated
-            "model.vision.transformer.layers.{bid}.attention.key",  # cogvlm
         ),

         MODEL_TENSOR.V_ENC_ATTN_K_NORM: (

@@ -1203,7 +1205,6 @@ class TensorNameMap:
             "vision_model.model.layers.{bid}.self_attn.v_proj",  # llama4
             "vision_tower.transformer.layers.{bid}.attention.v_proj",  # pixtral
             "visual.blocks.{bid}.attn.v",  # qwen2vl, generated
-            "model.vision.transformer.layers.{bid}.attention.value",  # cogvlm
         ),

         MODEL_TENSOR.V_ENC_INPUT_NORM: (
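The net effect of these mapping hunks is that the Hugging Face checkpoint name ending in query_key_value now maps to the single attn_qkv GGUF tensor, and the old per-tensor query/key/value cogvlm entries are gone. A hedged sketch of that mapping; gguf-py's TensorNameMap does this internally, and the helper name below is hypothetical:

def map_cogvlm_vision_attn(hf_name: str) -> str | None:
    # illustrative re-implementation of the new mapping entry, not gguf-py code
    prefix = "model.vision.transformer.layers."
    if not hf_name.startswith(prefix):
        return None
    bid, _, tail = hf_name[len(prefix):].partition(".")
    if tail.startswith("attention.query_key_value."):
        suffix = tail.rsplit(".", 1)[-1]          # "weight" or "bias"
        return f"v.blk.{bid}.attn_qkv.{suffix}"
    return None

print(map_cogvlm_vision_attn("model.vision.transformer.layers.0.attention.query_key_value.weight"))
# -> v.blk.0.attn_qkv.weight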

tools/mtmd/clip-impl.h

Lines changed: 1 addition & 0 deletions
@@ -59,6 +59,7 @@
 #define TN_PATCH_EMBD      "v.patch_embd.weight"  // not rename tensor with ".0" postfix for backwrad compat
 #define TN_PATCH_EMBD_1    "v.patch_embd.weight.1"
 #define TN_PATCH_BIAS      "v.patch_embd.bias"
+#define TN_ATTN_QKV        "%s.blk.%d.attn_qkv.%s"
 #define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
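TN_ATTN_QKV is a printf-style template that clip.cpp expands via string_format(TN_ATTN_QKV, prefix, il, "weight"/"bias"), as the loader change further down shows. A two-line Python equivalent of that expansion, purely for illustration:

TN_ATTN_QKV = "%s.blk.%d.attn_qkv.%s"            # same template as the new #define
print(TN_ATTN_QKV % ("v", 3, "weight"))          # -> v.blk.3.attn_qkv.weight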

tools/mtmd/clip.cpp

Lines changed: 69 additions & 37 deletions
@@ -211,6 +211,8 @@ struct clip_layer {
     ggml_tensor * q_b = nullptr;
     ggml_tensor * v_w = nullptr;
     ggml_tensor * v_b = nullptr;
+    ggml_tensor * qkv_w = nullptr;
+    ggml_tensor * qkv_b = nullptr;

     ggml_tensor * o_w = nullptr;
     ggml_tensor * o_b = nullptr;
@@ -1576,18 +1578,65 @@ struct clip_graph {
     ggml_tensor * inp = build_inp();
     inp = ggml_concat(ctx0, inp, model.class_embedding, 1);

-    // build ViT transformer
-    ggml_tensor * cur = build_vit(
-            inp, n_pos,
-            NORM_TYPE_NORMAL,
-            hparams.ffn_op,
-            model.position_embeddings,
-            nullptr);
+    inp = ggml_add(ctx0, inp, model.position_embeddings);
+    cb(inp, "inp_pos", -1);
+
+    ggml_tensor * inpL = inp;
+
+    for (int il = 0; il < n_layer; il++) {
+        auto & layer = model.layers[il];
+        ggml_tensor * cur = inpL;
+
+        cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+        cur = ggml_add(ctx0, cur, layer.qkv_b);
+
+        ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_pos,
+                cur->nb[1], 0));
+        ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_pos,
+                cur->nb[1], n_embd * sizeof(float)));
+        ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_pos,
+                cur->nb[1], 2 * n_embd * sizeof(float)));
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+
+        cb(Qcur, "Qcur", il);
+        cb(Kcur, "Kcur", il);
+        cb(Vcur, "Vcur", il);
+
+        cur = build_attn(layer.o_w, layer.o_b,
+                Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+        cb(cur, "attn_out", il);
+
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+        cb(cur, "attn_post_norm", il);
+
+        cur = ggml_add(ctx0, cur, inpL);
+        inpL = cur;
+
+        cur = build_ffn(cur,
+                layer.ff_up_w, layer.ff_up_b,
+                layer.ff_gate_w, layer.ff_gate_b,
+                layer.ff_down_w, layer.ff_down_b,
+                hparams.ffn_op, il);
+        cb(cur, "ffn_out", il);
+
+        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+        cb(cur, "ffn_post_norm", il);
+
+        cur = ggml_add(ctx0, cur, inpL);
+        cb(cur, "layer_out", il);
+        inpL = cur;
+    }

     // remove CLS token (like build_llama4 does)
-    cur = ggml_view_2d(ctx0, cur,
+    ggml_tensor * cur = ggml_view_2d(ctx0, inpL,
         n_embd, n_patches,
-        ggml_row_size(cur->type, n_embd), 0);
+        ggml_row_size(inpL->type, n_embd), 0);

     // Multiply with mm_model_proj
     cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
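The new CogVLM graph above replaces the generic build_vit call: each layer does one matmul with the fused qkv_w, adds qkv_b, and then takes Q, K and V as three contiguous n_embd-wide slices of the result via ggml_view_2d (byte offsets 0, n_embd*sizeof(float) and 2*n_embd*sizeof(float)) before reshaping per head. A hedged PyTorch sketch of that slicing logic follows; the sizes are arbitrary and CogVLM's real hyperparameters differ.

import torch

n_pos, n_embd, n_head = 5, 16, 4
d_head = n_embd // n_head

x     = torch.randn(n_pos, n_embd)             # hidden states for one layer
qkv_w = torch.randn(3 * n_embd, n_embd)        # fused projection weight
qkv_b = torch.randn(3 * n_embd)                # fused projection bias

cur = x @ qkv_w.T + qkv_b                      # (n_pos, 3 * n_embd)

q = cur[:, 0 * n_embd : 1 * n_embd]            # offset 0
k = cur[:, 1 * n_embd : 2 * n_embd]            # offset n_embd
v = cur[:, 2 * n_embd : 3 * n_embd]            # offset 2 * n_embd

# per-head reshape, mirroring ggml_reshape_3d(..., d_head, n_head, n_pos)
q = q.view(n_pos, n_head, d_head)
k = k.view(n_pos, n_head, d_head)
v = v.view(n_pos, n_head, d_head)

print(q.shape, k.shape, v.shape)               # torch.Size([5, 4, 4]) three times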
@@ -1665,14 +1714,9 @@ struct clip_graph {
         auto & layer = model.layers[il];
         ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states

-        // Check if this is COGVLM projector type for post-norm layernorm order
-        const bool is_cogvlm = ctx->proj_type() == PROJECTOR_TYPE_COGVLM;
-
-        // layernorm1 (only for non-COGVLM)
-        if (!is_cogvlm) {
-            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
-            cb(cur, "layer_inp_normed", il);
-        }
+        // layernorm1
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+        cb(cur, "layer_inp_normed", il);

         // self-attention
         {
@@ -1726,24 +1770,16 @@ struct clip_graph {
             cb(cur, "attn_out_scaled", il);
         }

-        // Apply layernorm AFTER attention for COGVLM (post-norm)
-        if (is_cogvlm) {
-            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
-            cb(cur, "attn_post_norm", il);
-        }
-
         // re-add the layer input, e.g., residual
         cur = ggml_add(ctx0, cur, inpL);

         inpL = cur; // inpL = residual, cur = hidden_states

         cb(cur, "ffn_inp", il);

-        // layernorm2 (only for non-COGVLM)
-        if (!is_cogvlm) {
-            cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
-            cb(cur, "ffn_inp_normed", il);
-        }
+        // layernorm2
+        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+        cb(cur, "ffn_inp_normed", il);

         // ffn
         cur = build_ffn(cur,
@@ -1759,12 +1795,6 @@ struct clip_graph {
             cb(cur, "ffn_out_scaled", il);
         }

-        // Apply layernorm AFTER MLP for COGVLM (post-norm)
-        if (is_cogvlm) {
-            cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
-            cb(cur, "ffn_post_norm", il);
-        }
-
         // residual 2
         cur = ggml_add(ctx0, inpL, cur);
         cb(cur, "layer_out", il);
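The three hunks above strip the is_cogvlm special cases back out of build_vit: the post-norm ordering CogVLM's vision layers need now lives entirely in the dedicated graph shown earlier, and build_vit keeps its plain pre-norm layout for every other model. A small Python sketch of the two orderings, with plain functions standing in for ggml ops (illustrative only):

def pre_norm_block(x, attn, ffn, norm1, norm2):
    # build_vit for the other CLIP models: normalize first, then add the residual
    x = x + attn(norm1(x))
    x = x + ffn(norm2(x))
    return x

def post_norm_block(x, attn, ffn, norm1, norm2):
    # CogVLM vision layer (now built in its own graph): normalize the
    # attention/FFN output, then add the residual
    x = x + norm1(attn(x))
    x = x + norm2(ffn(x))
    return x

# tiny smoke test with identity stand-ins
same = lambda t: t
assert pre_norm_block(1.0, same, same, same, same) == post_norm_block(1.0, same, same, same, same) == 4.0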
@@ -2466,10 +2496,11 @@ struct clip_model_loader {
     model.layers.resize(hparams.n_layer);
     for (int il = 0; il < hparams.n_layer; ++il) {
         auto & layer = model.layers[il];
-        layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"));
-        layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"));
-        layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"));
+        layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false);
+        layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false);
+        layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"), false);
         layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
+        layer.qkv_w = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "weight"), false);
         layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false);
         layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
         layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
@@ -2481,6 +2512,7 @@ struct clip_model_loader {
         layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
         layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false);
         layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false);
+        layer.qkv_b = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "bias"), false);
         layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false);
         layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false);

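In the loader, the split q/k/v weights and the new fused qkv weight are all requested with the second get_tensor argument set to false (optional), so a GGUF file may carry either layout and the graph code uses whichever tensors are present. A hedged Python sketch of that "either layout" idea, using a hypothetical weights dict rather than the real loader API:

def load_attn_weights(weights: dict, bid: int):
    # optional lookup, analogous to get_tensor(..., /*required=*/false)
    get = lambda name: weights.get(f"v.blk.{bid}.{name}.weight")

    qkv = get("attn_qkv")
    q, k, v = get("attn_q"), get("attn_k"), get("attn_v")

    if qkv is not None:
        return {"qkv": qkv}                      # merged layout (CogVLM after this commit)
    if None not in (q, k, v):
        return {"q": q, "k": k, "v": v}          # split layout (other CLIP models)
    raise ValueError(f"layer {bid}: no attention projection tensors found")

print(load_attn_weights({"v.blk.0.attn_qkv.weight": "W_qkv"}, 0))   # {'qkv': 'W_qkv'}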