Commit e969238

Switch CogVLM LLM graph to merged QKV tensor
1 parent 11c5dfd commit e969238

File tree

convert_hf_to_gguf.py
gguf-py/gguf/constants.py
gguf-py/gguf/tensor_mapping.py
src/llama-arch.cpp
src/llama-arch.h
src/llama-model.cpp
src/llama-model.h

7 files changed, +25 -71 lines changed


convert_hf_to_gguf.py

Lines changed: 0 additions & 9 deletions

@@ -8342,15 +8342,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if name.startswith("model.vision."):
             return []
 
-        if "query_key_value.weight" in name:
-            # Slice tensor into three along first axis
-            q, k, v = data_torch.split(data_torch.shape[0] // 3, dim=0)
-            return [
-                (self.map_tensor_name(name.replace("query_key_value", "query")), q),
-                (self.map_tensor_name(name.replace("query_key_value", "key")), k),
-                (self.map_tensor_name(name.replace("query_key_value", "value")), v),
-            ]
-
         return [(self.map_tensor_name(name), data_torch)]
 
 ###### CONVERSION LOGIC ######

gguf-py/gguf/constants.py

Lines changed: 4 additions & 12 deletions

@@ -562,9 +562,7 @@ class MODEL_TENSOR(IntEnum):
     SHORTCONV_CONV = auto()
     SHORTCONV_INPROJ = auto()
     SHORTCONV_OUTPROJ = auto()
-    VISEXP_ATTN_Q = auto()
-    VISEXP_ATTN_K = auto()
-    VISEXP_ATTN_V = auto()
+    VISEXP_ATTN_QKV = auto()
     VISEXP_ATTN_OUT = auto()
     VISEXP_GATE = auto()
     VISEXP_DOWN = auto()
@@ -908,9 +906,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.SHORTCONV_CONV: "blk.{bid}.shortconv.conv",
     MODEL_TENSOR.SHORTCONV_INPROJ: "blk.{bid}.shortconv.in_proj",
     MODEL_TENSOR.SHORTCONV_OUTPROJ: "blk.{bid}.shortconv.out_proj",
-    MODEL_TENSOR.VISEXP_ATTN_Q: "blk.{bid}.vis_attn_q",
-    MODEL_TENSOR.VISEXP_ATTN_K: "blk.{bid}.vis_attn_k",
-    MODEL_TENSOR.VISEXP_ATTN_V: "blk.{bid}.vis_attn_v",
+    MODEL_TENSOR.VISEXP_ATTN_QKV: "blk.{bid}.vis_attn_qkv",
     MODEL_TENSOR.VISEXP_ATTN_OUT: "blk.{bid}.vis_attn_output",
     MODEL_TENSOR.VISEXP_GATE: "blk.{bid}.vis_gate",
     MODEL_TENSOR.VISEXP_DOWN: "blk.{bid}.vis_down",
@@ -2649,17 +2645,13 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_QKV,
         MODEL_TENSOR.ATTN_OUT,
         MODEL_TENSOR.FFN_NORM,
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
-        MODEL_TENSOR.VISEXP_ATTN_Q,
-        MODEL_TENSOR.VISEXP_ATTN_K,
-        MODEL_TENSOR.VISEXP_ATTN_V,
+        MODEL_TENSOR.VISEXP_ATTN_QKV,
         MODEL_TENSOR.VISEXP_ATTN_OUT,
         MODEL_TENSOR.VISEXP_GATE,
         MODEL_TENSOR.VISEXP_UP,

gguf-py/gguf/tensor_mapping.py

Lines changed: 3 additions & 13 deletions

@@ -176,6 +176,7 @@ class TensorNameMap:
         "encoder.layers.{bid}.self_attention.query_key_value",  # chatglm
         "transformer.layers.{bid}.attn.qkv_proj",  # openelm
         "transformer_encoder.{bid}.qkv",  # neobert
+        "model.layers.{bid}.self_attn.language_expert_query_key_value",  # cogvlm
     ),
 
     # Attention query
@@ -193,7 +194,6 @@ class TensorNameMap:
         "model.layers.{bid}.self_attn.q_proj",  # llama4
         "model.transformer.blocks.{bid}.q_proj",  # llada
         "layers.{bid}.self_attn.q_proj",  # qwen3-embedding
-        "model.layers.{bid}.self_attn.language_expert_query",  # cogvlm
     ),
 
     # Attention key
@@ -212,7 +212,6 @@ class TensorNameMap:
         "model.layers.{bid}.self_attn.k_proj",  # llama4
         "model.transformer.blocks.{bid}.k_proj",  # llada
         "layers.{bid}.self_attn.k_proj",  # qwen3-embedding
-        "model.layers.{bid}.self_attn.language_expert_key",  # cogvlm
     ),
 
     # Attention value
@@ -230,7 +229,6 @@ class TensorNameMap:
         "model.layers.{bid}.self_attn.v_proj",  # llama4
         "model.transformer.blocks.{bid}.v_proj",  # llada
         "layers.{bid}.self_attn.v_proj",  # qwen3-embedding
-        "model.layers.{bid}.self_attn.language_expert_value",  # cogvlm
     ),
 
     # Attention output
@@ -1020,16 +1018,8 @@ class TensorNameMap:
         "model.layers.{bid}.self_attn.vision_expert_dense",  # cogvlm
     ),
 
-    MODEL_TENSOR.VISEXP_ATTN_Q: (
-        "model.layers.{bid}.self_attn.vision_expert_query",  # cogvlm
-    ),
-
-    MODEL_TENSOR.VISEXP_ATTN_K: (
-        "model.layers.{bid}.self_attn.vision_expert_key",  # cogvlm
-    ),
-
-    MODEL_TENSOR.VISEXP_ATTN_V: (
-        "model.layers.{bid}.self_attn.vision_expert_value",  # cogvlm
+    MODEL_TENSOR.VISEXP_ATTN_QKV: (
+        "model.layers.{bid}.self_attn.vision_expert_query_key_value",  # cogvlm
     ),
 
     ############################################################################

src/llama-arch.cpp

Lines changed: 3 additions & 9 deletions

@@ -2075,17 +2075,13 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
             { LLM_TENSOR_OUTPUT, "output" },
             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
             { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
-            { LLM_TENSOR_VISEXP_ATTN_WQ, "blk.%d.vis_attn_q" },
-            { LLM_TENSOR_VISEXP_ATTN_WK, "blk.%d.vis_attn_k" },
-            { LLM_TENSOR_VISEXP_ATTN_WV, "blk.%d.vis_attn_v" },
+            { LLM_TENSOR_VISEXP_ATTN_QKV, "blk.%d.vis_attn_qkv" },
             { LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.vis_attn_output" },
             { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" },
             { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" },
@@ -2263,9 +2259,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_VISEXP_ATTN_WQ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_VISEXP_ATTN_WK, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_VISEXP_ATTN_WV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_VISEXP_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_VISEXP_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},

src/llama-arch.h

Lines changed: 1 addition & 3 deletions

@@ -414,9 +414,7 @@ enum llm_tensor {
     LLM_TENSOR_SHORTCONV_CONV,
     LLM_TENSOR_SHORTCONV_INPROJ,
     LLM_TENSOR_SHORTCONV_OUTPROJ,
-    LLM_TENSOR_VISEXP_ATTN_WQ,
-    LLM_TENSOR_VISEXP_ATTN_WK,
-    LLM_TENSOR_VISEXP_ATTN_WV,
+    LLM_TENSOR_VISEXP_ATTN_QKV,
     LLM_TENSOR_VISEXP_ATTN_OUT,
     LLM_TENSOR_VISEXP_FFN_GATE,
     LLM_TENSOR_VISEXP_FFN_DOWN,

src/llama-model.cpp

Lines changed: 13 additions & 22 deletions

@@ -5560,14 +5560,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     auto & layer = layers[i];
 
                     layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
 
-                    layer.visexp_attn_wq = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WQ, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-                    layer.visexp_attn_wk = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WK, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-                    layer.visexp_attn_wv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WV, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                    layer.visexp_attn_wqkv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
                     layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
 
                     layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
@@ -18107,21 +18103,17 @@ struct llm_build_cogvlm : public llm_graph_context {
 
         for (int il = 0; il < n_layer; ++il) {
             // get either the text or image weight tensors
-            ggml_tensor * wq, * wk, * wv, * wo;
+            ggml_tensor * wqkv, * wo;
             ggml_tensor * ffn_gate, * ffn_down, * ffn_up;
 
             if (is_text) {
-                wq = model.layers[il].wq;
-                wk = model.layers[il].wk;
-                wv = model.layers[il].wv;
+                wqkv = model.layers[il].wqkv;
                 wo = model.layers[il].wo;
                 ffn_gate = model.layers[il].ffn_gate;
                 ffn_down = model.layers[il].ffn_down;
                 ffn_up = model.layers[il].ffn_up;
             } else {
-                wq = model.layers[il].visexp_attn_wq;
-                wk = model.layers[il].visexp_attn_wk;
-                wv = model.layers[il].visexp_attn_wv;
+                wqkv = model.layers[il].visexp_attn_wqkv;
                 wo = model.layers[il].visexp_attn_wo;
                 ffn_gate = model.layers[il].visexp_ffn_gate;
                 ffn_down = model.layers[il].visexp_ffn_down;
@@ -18133,17 +18125,16 @@ struct llm_build_cogvlm : public llm_graph_context {
 
             // build self attention
             {
-                ggml_tensor * Qcur = build_lora_mm(wq, cur);
-                cb(Qcur, "Qcur", il);
+                ggml_tensor * qkv = build_lora_mm(wqkv, cur);
 
-                ggml_tensor * Kcur = build_lora_mm(wk, cur);
-                cb(Kcur, "Kcur", il);
+                // split qkv into Q, K, V along the first dimension
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float),
+                        qkv->nb[1], 0);
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
+                        qkv->nb[1], n_embd * ggml_element_size(qkv));
+                ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens,
+                        qkv->nb[1], 2 * n_embd * ggml_element_size(qkv)));
 
-                ggml_tensor * Vcur = build_lora_mm(wv, cur);
-                cb(Vcur, "Vcur", il);
-
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 // TODO: Check Rope because this might not be the same as cogvlm
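
For reference, below is a minimal standalone sketch (not part of this commit) of the view arithmetic the new graph code relies on. It assumes the fused matmul output is laid out as [Q | K | V] along dim 0 of each token row and that n_head_kv equals n_head, consistent with the {n_embd, n_embd_head_k * n_head * 3} weight shape above; the sizes, the plain ggml context, and the dummy qkv tensor standing in for build_lora_mm(wqkv, cur) are illustrative. V is taken as a strided 2D view and run through ggml_cont before the per-head reshape, presumably because ggml_reshape_* expects a contiguous tensor, while Q and K are created directly in their final 3D shape.

// Standalone sketch: split a fused QKV activation into Q, K, V ggml views.
// Illustrative sizes and names only; not taken from the model.
#include <cstdio>
#include "ggml.h"

int main() {
    const int64_t n_embd_head = 128;
    const int64_t n_head      = 32;
    const int64_t n_head_kv   = 32;  // equal to n_head, matching the fused weight shape
    const int64_t n_embd      = n_embd_head * n_head;
    const int64_t n_tokens    = 8;

    struct ggml_init_params params = {
        /*.mem_size   =*/ 64u * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx0 = ggml_init(params);

    // stand-in for build_lora_mm(wqkv, cur): one fused row of 3*n_embd values per token
    struct ggml_tensor * qkv = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 3 * n_embd, n_tokens);

    // qkv->nb[1] is the byte stride between consecutive token rows; byte offsets of
    // 0, n_embd and 2*n_embd elements select the Q, K and V thirds of each row
    struct ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head, n_tokens,
            n_embd_head * sizeof(float), qkv->nb[1], 0);
    struct ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens,
            n_embd_head * sizeof(float), qkv->nb[1], n_embd * ggml_element_size(qkv));
    // V: strided 2D view, made contiguous, then reshaped per head
    struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens,
            qkv->nb[1], 2 * n_embd * ggml_element_size(qkv)));
    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

    printf("Q: [%lld, %lld, %lld]\n", (long long) Qcur->ne[0], (long long) Qcur->ne[1], (long long) Qcur->ne[2]);
    printf("K: [%lld, %lld, %lld]\n", (long long) Kcur->ne[0], (long long) Kcur->ne[1], (long long) Kcur->ne[2]);
    printf("V: [%lld, %lld, %lld]\n", (long long) Vcur->ne[0], (long long) Vcur->ne[1], (long long) Vcur->ne[2]);

    ggml_free(ctx0);
    return 0;
}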

src/llama-model.h

Lines changed: 1 addition & 3 deletions

@@ -368,9 +368,7 @@ struct llama_layer {
     struct ggml_tensor * attn_sinks = nullptr;
 
     // cogvlm
-    struct ggml_tensor * visexp_attn_wq = nullptr;
-    struct ggml_tensor * visexp_attn_wk = nullptr;
-    struct ggml_tensor * visexp_attn_wv = nullptr;
+    struct ggml_tensor * visexp_attn_wqkv = nullptr;
     struct ggml_tensor * visexp_attn_wo = nullptr;
     struct ggml_tensor * visexp_ffn_gate = nullptr;
     struct ggml_tensor * visexp_ffn_down = nullptr;
