Commit a571d9a

Changes according to comments
1 parent 60739b8 commit a571d9a

File tree: 3 files changed (+29, -58 lines)

convert_hf_to_gguf.py

Lines changed: 0 additions & 3 deletions
@@ -8330,9 +8330,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 class CogVLMModel(LlamaModel):
     model_arch = gguf.MODEL_ARCH.COGVLM
 
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused

src/llama-model.cpp

Lines changed: 9 additions & 12 deletions
@@ -18124,22 +18124,19 @@ struct llm_build_cogvlm : public llm_graph_context {
             cb(qkv, "qkv", il);
 
             // split qkv into Q, K, V along the first dimension
-            ggml_tensor * Qcur = ggml_view_2d(ctx0, qkv,
-                    n_embd, n_tokens,
-                    ggml_row_size(qkv->type, n_embd), 0);
-            ggml_tensor * Kcur = ggml_view_2d(ctx0, qkv,
-                    n_embd, n_tokens,
-                    ggml_row_size(qkv->type, n_embd), n_embd * ggml_element_size(qkv));
-            ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv,
-                    n_embd, n_tokens,
-                    ggml_row_size(qkv->type, n_embd), 2 * n_embd * ggml_element_size(qkv));
+            ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens,
+                    qkv->nb[1], 0));
+            ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens,
+                    qkv->nb[1], n_embd * ggml_element_size(qkv)));
+            ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens,
+                    qkv->nb[1], 2 * n_embd * ggml_element_size(qkv)));
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-            Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, GGML_ROPE_TYPE_NEOX);
-            Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, GGML_ROPE_TYPE_NEOX);
+            Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, rope_type);
+            Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, rope_type);
 
             cur = build_attn(inp_attn, wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
             cb(cur, "attn_out", il);
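
For reference, a minimal standalone sketch of the view-based split used above (not part of the commit; the toy sizes, the main() wrapper, and the context setup are made up for illustration). Each ggml_view_2d takes n_embd values per token out of the fused [3*n_embd, n_tokens] qkv tensor, using the fused row stride qkv->nb[1] and a byte offset, and ggml_cont copies the strided view into a contiguous tensor before it is reshaped:

    // sketch only: split a fused QKV tensor with strided views + ggml_cont
    #include "ggml.h"

    int main() {
        const int64_t n_embd = 4, n_tokens = 2; // toy sizes, not CogVLM's real dims

        ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ false,
        };
        ggml_context * ctx0 = ggml_init(params);

        // fused projection output: ne[0] = 3*n_embd values per token, ne[1] = n_tokens
        ggml_tensor * qkv = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 3*n_embd, n_tokens);

        // each view is [n_embd, n_tokens]; qkv->nb[1] is the byte stride of one fused row,
        // and the last argument is the byte offset of the Q/K/V block inside that row
        ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens,
                qkv->nb[1], 0));
        ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens,
                qkv->nb[1], n_embd * ggml_element_size(qkv)));
        ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens,
                qkv->nb[1], 2 * n_embd * ggml_element_size(qkv)));

        // ggml_cont materializes each strided view as a contiguous tensor,
        // presumably so the subsequent reshape calls in the real graph are valid
        (void) Qcur; (void) Kcur; (void) Vcur;

        ggml_free(ctx0);
        return 0;
    }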
@@ -18806,7 +18803,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARCEE:
         case LLM_ARCH_ERNIE4_5:
         case LLM_ARCH_ERNIE4_5_MOE:
-        case LLM_ARCH_COGVLM:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
@@ -18852,6 +18848,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_SMALLTHINKER:
         case LLM_ARCH_GLM4_MOE:
+        case LLM_ARCH_COGVLM:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:
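
Together with the llm_build_cogvlm hunk above, these two switch changes keep the rotation mode unchanged: LLM_ARCH_COGVLM now reports LLAMA_ROPE_TYPE_NEOX from llama_model_rope_type, and the graph builder forwards the generic rope_type to ggml_rope instead of hard-coding GGML_ROPE_TYPE_NEOX (this assumes the llama.h convention that LLAMA_ROPE_TYPE_NEOX aliases GGML_ROPE_TYPE_NEOX). A toy sketch of the dispatch pattern, with hypothetical names rather than the real llama.cpp enums:

    // sketch only: report the rope flavour once per architecture, then forward it
    #include <cstdio>

    enum rope_type { ROPE_TYPE_NORM = 0, ROPE_TYPE_NEOX = 2 }; // toy stand-ins
    enum arch      { ARCH_LLAMA, ARCH_COGVLM };

    static rope_type model_rope_type(arch a) {
        switch (a) {
            case ARCH_COGVLM: return ROPE_TYPE_NEOX; // the case moved by this commit
            default:          return ROPE_TYPE_NORM;
        }
    }

    static void build_rope(rope_type rt) {
        // in the real graph this value becomes the `mode` argument of ggml_rope
        std::printf("rope mode = %d\n", static_cast<int>(rt));
    }

    int main() {
        build_rope(model_rope_type(ARCH_COGVLM)); // prints "rope mode = 2"
        return 0;
    }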

tools/mtmd/clip.cpp

Lines changed: 20 additions & 43 deletions
@@ -1553,7 +1553,7 @@ struct clip_graph {
         } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
             // projector
             cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_gelu_erf(ctx0,ld cur);
+            cur = ggml_gelu_erf(ctx0, cur);
             cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
 
         } else {
@@ -1575,25 +1575,17 @@ struct clip_graph {
         const int n_pos = n_patches + 1; // +1 for [CLS]
 
         // build input and concatenate class embedding
-        ggml_tensor * inp = build_inp();
-        inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+        ggml_tensor * cur = build_inp();
+        cur = ggml_concat(ctx0, cur, model.class_embedding, 1);
 
         // Add position embeddings
-        inp = ggml_add(ctx0, inp, model.position_embeddings);
-        cb(inp, "pos_embed", -1);
-
-        ggml_tensor * inpL = inp;
-
-        // pre-layernorm
-        if (model.pre_ln_w) {
-            inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
-            cb(inpL, "pre_ln", -1);
-        }
+        cur = ggml_add(ctx0, cur, model.position_embeddings);
+        cb(cur, "pos_embed", -1);
 
         // loop over layers
         for (int il = 0; il < n_layer; il++) {
             auto & layer = model.layers[il];
-            ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+            ggml_tensor * inpL = cur; // inpL = residual, cur = hidden_states
 
             // Note: cogvlm applies layernorm after attention, not before
             // So we skip the layernorm1 here
@@ -1608,12 +1600,15 @@ struct clip_graph {
 
             // Split qkv into Q, K, V along the first dimension
             // qkv shape: [3 * n_embd, n_pos] -> split into [n_embd, n_pos] each
-            ggml_tensor * Qcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
-                ggml_row_size(qkv->type, n_embd), 0);
-            ggml_tensor * Kcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
-                ggml_row_size(qkv->type, n_embd), n_embd * ggml_element_size(qkv));
-            ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
-                ggml_row_size(qkv->type, n_embd), 2 * n_embd * ggml_element_size(qkv));
+            ggml_tensor * Qcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
+                qkv->nb[1], 0);
+            ggml_tensor * Kcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
+                qkv->nb[1], n_embd * ggml_element_size(qkv));
+            ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd, n_pos,
+                qkv->nb[1], 2 * n_embd * ggml_element_size(qkv));
+            Qcur = ggml_cont(ctx0, Qcur);
+            Kcur = ggml_cont(ctx0, Kcur);
+            Vcur = ggml_cont(ctx0, Vcur);
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
             Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
@@ -1628,11 +1623,6 @@ struct clip_graph {
                 cb(cur, "attn_out", il);
             }
 
-            if (layer.ls_1_w) {
-                cur = ggml_mul(ctx0, cur, layer.ls_1_w);
-                cb(cur, "attn_out_scaled", il);
-            }
-
             // Apply layernorm after attention for cogvlm
             cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
             cb(cur, "attn_post_norm", il);
@@ -1656,31 +1646,19 @@ struct clip_graph {
 
             cb(cur, "ffn_out", il);
 
-            if (layer.ls_2_w) {
-                cur = ggml_mul(ctx0, cur, layer.ls_2_w);
-                cb(cur, "ffn_out_scaled", il);
-            }
-
             // Apply layernorm after mlp for cogvlm
             cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
             cb(cur, "ffn_post_norm", il);
 
             // residual 2
             cur = ggml_add(ctx0, inpL, cur);
             cb(cur, "layer_out", il);
-
-            inpL = cur;
-        }
-
-        // post-layernorm
-        if (model.post_ln_w) {
-            inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
         }
 
         // remove CLS token (like build_llama4 does)
-        cur = ggml_view_2d(ctx0, inpL,
+        cur = ggml_view_2d(ctx0, cur,
             n_embd, n_patches,
-            ggml_row_size(inpL->type, n_embd), 0);
+            ggml_row_size(cur->type, n_embd), 0);
 
         // Multiply with mm_model_proj
         cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
@@ -1689,7 +1667,6 @@ struct clip_graph {
         cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
 
         // Apply GELU
-        // TODO: Not 100% sure about gelu and silu configuration
         cur = ggml_gelu_inplace(ctx0, cur);
 
         // Branch 1: multiply with mm_h_to_4h_w
@@ -2548,9 +2525,9 @@ struct clip_model_loader {
         model.layers.resize(hparams.n_layer);
         for (int il = 0; il < hparams.n_layer; ++il) {
             auto & layer = model.layers[il];
-            layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"));
-            layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"));
-            layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"));
+            layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false);
+            layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false);
+            layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"), false);
             layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
             layer.qkv_w = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "weight"), false);
             layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false);
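
The added `false` argument matches the existing qkv_w and k_norm lines: presumably it marks the separate Q/K/V weights as optional, so a checkpoint that only ships a fused attn_qkv tensor (as CogVLM's visual encoder does) still loads and the graph builder can take the fused path. A generic sketch of that pattern, using a hypothetical helper and made-up tensor names rather than the actual clip.cpp implementation:

    // sketch only: an "optional tensor" lookup in the spirit of the loader change above
    #include <map>
    #include <stdexcept>
    #include <string>

    struct tensor { std::string name; }; // hypothetical stand-in for ggml_tensor

    tensor * get_tensor(std::map<std::string, tensor> & tensors,
                        const std::string & name, bool required = true) {
        auto it = tensors.find(name);
        if (it == tensors.end()) {
            if (required) {
                throw std::runtime_error("missing tensor: " + name);
            }
            return nullptr; // optional tensor: absence is fine
        }
        return &it->second;
    }

    int main() {
        std::map<std::string, tensor> tensors = { { "attn_qkv.weight", { "attn_qkv.weight" } } };

        // split Q/K/V weights are optional: nullptr here just means "use the fused path"
        tensor * q_w   = get_tensor(tensors, "attn_q.weight", /*required =*/ false);
        tensor * qkv_w = get_tensor(tensors, "attn_qkv.weight");

        return (q_w == nullptr && qkv_w != nullptr) ? 0 : 1;
    }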
