Commit a571d9a

Changes according to comments
1 parent 60739b8 commit a571d9a

File tree: 3 files changed (+29, -58 lines)

convert_hf_to_gguf.py

Lines changed: 0 additions & 3 deletions
@@ -8330,9 +8330,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 class CogVLMModel(LlamaModel):
     model_arch = gguf.MODEL_ARCH.COGVLM
 
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused

src/llama-model.cpp

Lines changed: 9 additions & 12 deletions
@@ -18124,22 +18124,19 @@ struct llm_build_cogvlm : public llm_graph_context {
             cb(qkv, "qkv", il);
 
             // split qkv into Q, K, V along the first dimension
-            ggml_tensor * Qcur = ggml_view_2d(ctx0, qkv,
-                    n_embd, n_tokens,
-                    ggml_row_size(qkv->type, n_embd), 0);
-            ggml_tensor * Kcur = ggml_view_2d(ctx0, qkv,
-                    n_embd, n_tokens,
-                    ggml_row_size(qkv->type, n_embd), n_embd * ggml_element_size(qkv));
-            ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv,
-                    n_embd, n_tokens,
-                    ggml_row_size(qkv->type, n_embd), 2 * n_embd * ggml_element_size(qkv));
+            ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens,
+                    qkv->nb[1], 0));
+            ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens,
+                    qkv->nb[1], n_embd * ggml_element_size(qkv)));
+            ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens,
+                    qkv->nb[1], 2 * n_embd * ggml_element_size(qkv)));
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-            Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, GGML_ROPE_TYPE_NEOX);
-            Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, GGML_ROPE_TYPE_NEOX);
+            Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, rope_type);
+            Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, rope_type);
 
             cur = build_attn(inp_attn, wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
             cb(cur, "attn_out", il);
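
For reference, a minimal standalone sketch of the view-based split used above (not part of the commit; the toy sizes, the main() wrapper, and the context setup are made up for illustration). Each ggml_view_2d takes n_embd values per token out of the fused [3*n_embd, n_tokens] qkv tensor, using the fused row stride qkv->nb[1] and a byte offset, and ggml_cont copies the strided view into a contiguous tensor before it is reshaped:

    // sketch only: split a fused QKV tensor with strided views + ggml_cont
    #include "ggml.h"

    int main() {
        const int64_t n_embd = 4, n_tokens = 2; // toy sizes, not CogVLM's real dims

        ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ false,
        };
        ggml_context * ctx0 = ggml_init(params);

        // fused projection output: ne[0] = 3*n_embd values per token, ne[1] = n_tokens
        ggml_tensor * qkv = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 3*n_embd, n_tokens);

        // each view is [n_embd, n_tokens]; qkv->nb[1] is the byte stride of one fused row,
        // and the last argument is the byte offset of the Q/K/V block inside that row
        ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens,
                qkv->nb[1], 0));
        ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens,
                qkv->nb[1], n_embd * ggml_element_size(qkv)));
        ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens,
                qkv->nb[1], 2 * n_embd * ggml_element_size(qkv)));

        // ggml_cont materializes each strided view as a contiguous tensor,
        // presumably so the subsequent reshape calls in the real graph are valid
        (void) Qcur; (void) Kcur; (void) Vcur;

        ggml_free(ctx0);
        return 0;
    }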
@@ -18806,7 +18803,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARCEE:
         case LLM_ARCH_ERNIE4_5:
         case LLM_ARCH_ERNIE4_5_MOE:
-        case LLM_ARCH_COGVLM:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
@@ -18852,6 +18848,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_SMALLTHINKER:
         case LLM_ARCH_GLM4_MOE:
+        case LLM_ARCH_COGVLM:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:
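
Together with the llm_build_cogvlm hunk above, these two switch changes keep the rotation mode unchanged: LLM_ARCH_COGVLM now reports LLAMA_ROPE_TYPE_NEOX from llama_model_rope_type, and the graph builder forwards the generic rope_type to ggml_rope instead of hard-coding GGML_ROPE_TYPE_NEOX (this assumes the llama.h convention that LLAMA_ROPE_TYPE_NEOX aliases GGML_ROPE_TYPE_NEOX). A toy sketch of the dispatch pattern, with hypothetical names rather than the real llama.cpp enums:

    // sketch only: report the rope flavour once per architecture, then forward it
    #include <cstdio>

    enum rope_type { ROPE_TYPE_NORM = 0, ROPE_TYPE_NEOX = 2 }; // toy stand-ins
    enum arch      { ARCH_LLAMA, ARCH_COGVLM };

    static rope_type model_rope_type(arch a) {
        switch (a) {
            case ARCH_COGVLM: return ROPE_TYPE_NEOX; // the case moved by this commit
            default:          return ROPE_TYPE_NORM;
        }
    }

    static void build_rope(rope_type rt) {
        // in the real graph this value becomes the `mode` argument of ggml_rope
        std::printf("rope mode = %d\n", static_cast<int>(rt));
    }

    int main() {
        build_rope(model_rope_type(ARCH_COGVLM)); // prints "rope mode = 2"
        return 0;
    }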

tools/mtmd/clip.cpp

Lines changed: 20 additions & 43 deletions
@@ -1553,7 +1553,7 @@ struct clip_graph {
         } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
             // projector
             cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_gelu_erf(ctx0,ld cur);
+            cur = ggml_gelu_erf(ctx0, cur);
             cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
 
         } else {
@@ -1575,25 +1575,17 @@ struct clip_graph {
         const int n_pos = n_patches + 1; // +1 for [CLS]
 
         // build input and concatenate class embedding
-        ggml_tensor * inp = build_inp();
-        inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+        ggml_tensor * cur = build_inp();
+        cur = ggml_concat(ctx0, cur, model.class_embedding, 1);
 
         // Add position embeddings
-        inp = ggml_add(ctx0, inp, model.position_embeddings);
-        cb(inp, "pos_embed", -1);
-
-        ggml_tensor * inpL = inp;
-
-        // pre-layernorm
-        if (model.pre_ln_w) {
-            inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
-            cb(inpL, "pre_ln", -1);
-        }
+        cur = ggml_add(ctx0, cur, model.position_embeddings);
+        cb(cur, "pos_embed", -1);
 
         // loop over layers
         for (int il = 0; il < n_layer; il++) {
             auto & layer = model.layers[il];
-            ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+            ggml_tensor * inpL = cur; // inpL = residual, cur = hidden_states
 
             // Note: cogvlm applies layernorm after attention, not before
             // So we skip the layernorm1 here
@@ -1608,12 +1600,15 @@ struct clip_graph {
 
             // Split qkv into Q, K, V along the first dimension
             // qkv shape: [3 * n_embd, n_pos] -> split into [n_embd, n_pos] each
-            ggml_tensor * Qcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
-                ggml_row_size(qkv->type, n_embd), 0);
-            ggml_tensor * Kcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
-                ggml_row_size(qkv->type, n_embd), n_embd * ggml_element_size(qkv));
-            ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
-                ggml_row_size(qkv->type, n_embd), 2 * n_embd * ggml_element_size(qkv));
+            ggml_tensor * Qcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
+                qkv->nb[1], 0);
+            ggml_tensor * Kcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
+                qkv->nb[1], n_embd * ggml_element_size(qkv));
+            ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
+                qkv->nb[1], 2 * n_embd * ggml_element_size(qkv));
+            Qcur = ggml_cont(ctx0, Qcur);
+            Kcur = ggml_cont(ctx0, Kcur);
+            Vcur = ggml_cont(ctx0, Vcur);
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
             Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
@@ -1628,11 +1623,6 @@ struct clip_graph {
                 cb(cur, "attn_out", il);
             }
 
-            if (layer.ls_1_w) {
-                cur = ggml_mul(ctx0, cur, layer.ls_1_w);
-                cb(cur, "attn_out_scaled", il);
-            }
-
             // Apply layernorm after attention for cogvlm
             cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
             cb(cur, "attn_post_norm", il);
@@ -1656,31 +1646,19 @@ struct clip_graph {
 
             cb(cur, "ffn_out", il);
 
-            if (layer.ls_2_w) {
-                cur = ggml_mul(ctx0, cur, layer.ls_2_w);
-                cb(cur, "ffn_out_scaled", il);
-            }
-
             // Apply layernorm after mlp for cogvlm
             cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
             cb(cur, "ffn_post_norm", il);
 
             // residual 2
             cur = ggml_add(ctx0, inpL, cur);
             cb(cur, "layer_out", il);
-
-            inpL = cur;
-        }
-
-        // post-layernorm
-        if (model.post_ln_w) {
-            inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
         }
 
         // remove CLS token (like build_llama4 does)
-        cur = ggml_view_2d(ctx0, inpL,
+        cur = ggml_view_2d(ctx0, cur,
             n_embd, n_patches,
-            ggml_row_size(inpL->type, n_embd), 0);
+            ggml_row_size(cur->type, n_embd), 0);
 
         // Multiply with mm_model_proj
         cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
@@ -1689,7 +1667,6 @@ struct clip_graph {
         cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
 
         // Apply GELU
-        // TODO: Not 100% sure about gelu and silu configuration
         cur = ggml_gelu_inplace(ctx0, cur);
 
         // Branch 1: multiply with mm_h_to_4h_w
@@ -2548,9 +2525,9 @@ struct clip_model_loader {
         model.layers.resize(hparams.n_layer);
         for (int il = 0; il < hparams.n_layer; ++il) {
             auto & layer = model.layers[il];
-            layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"));
-            layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"));
-            layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"));
+            layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false);
+            layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false);
+            layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"), false);
             layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
             layer.qkv_w = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "weight"), false);
             layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false);
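
The added `false` argument matches the existing qkv_w and k_norm lines: presumably it marks the separate Q/K/V weights as optional, so a checkpoint that only ships a fused attn_qkv tensor (as CogVLM's visual encoder does) still loads and the graph builder can take the fused path. A generic sketch of that pattern, using a hypothetical helper and made-up tensor names rather than the actual clip.cpp implementation:

    // sketch only: an "optional tensor" lookup in the spirit of the loader change above
    #include <map>
    #include <stdexcept>
    #include <string>

    struct tensor { std::string name; }; // hypothetical stand-in for ggml_tensor

    tensor * get_tensor(std::map<std::string, tensor> & tensors,
                        const std::string & name, bool required = true) {
        auto it = tensors.find(name);
        if (it == tensors.end()) {
            if (required) {
                throw std::runtime_error("missing tensor: " + name);
            }
            return nullptr; // optional tensor: absence is fine
        }
        return &it->second;
    }

    int main() {
        std::map<std::string, tensor> tensors = { { "attn_qkv.weight", { "attn_qkv.weight" } } };

        // split Q/K/V weights are optional: nullptr here just means "use the fused path"
        tensor * q_w   = get_tensor(tensors, "attn_q.weight", /*required =*/ false);
        tensor * qkv_w = get_tensor(tensors, "attn_qkv.weight");

        return (q_w == nullptr && qkv_w != nullptr) ? 0 : 1;
    }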
