From 26d81e0f650ff44bbb6f51e431a308f767b9e1ac Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Tue, 8 Jul 2025 21:51:50 +0000 Subject: [PATCH 01/16] Added GGUF mappings for CogVLM model --- gguf-py/gguf/constants.py | 7 ++++ gguf-py/gguf/tensor_mapping.py | 66 ++++++++++++++++++++++++++-------- 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 911eea504a19e..6fe6d96ef157c 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -560,6 +560,13 @@ class MODEL_TENSOR(IntEnum): SHORTCONV_CONV = auto() SHORTCONV_INPROJ = auto() SHORTCONV_OUTPROJ = auto() + VISEXP_ATTN_Q = auto() + VISEXP_ATTN_K = auto() + VISEXP_ATTN_V = auto() + VISEXP_ATTN_OUT = auto() + VISEXP_GATE = auto() + VISEXP_DOWN = auto() + VISEXP_UP = auto() # vision V_MMPROJ = auto() V_MMPROJ_FC = auto() diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index dc7c03b464c25..5543a886d999a 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -153,6 +153,7 @@ class TensorNameMap: "encoder.layer.{bid}.layer_norm_1", # jina-v2-code "rwkv.blocks.{bid}.ln2", # rwkv6 "model.layers.{bid}.ln2", # rwkv7 + "model.layers.{bid}.post_attention_layernorm", # cogvlm ), # Attention query-key-value @@ -191,6 +192,7 @@ class TensorNameMap: "model.layers.{bid}.self_attn.q_proj", # llama4 "model.transformer.blocks.{bid}.q_proj", # llada "layers.{bid}.self_attn.q_proj", # qwen3-embedding + "model.layers.{bid}.self_attn.language_expert_query", # cogvlm ), # Attention key @@ -209,6 +211,7 @@ class TensorNameMap: "model.layers.{bid}.self_attn.k_proj", # llama4 "model.transformer.blocks.{bid}.k_proj", # llada "layers.{bid}.self_attn.k_proj", # qwen3-embedding + "model.layers.{bid}.self_attn.language_expert_key", # cogvlm ), # Attention value @@ -226,6 +229,7 @@ class TensorNameMap: "model.layers.{bid}.self_attn.v_proj", # llama4 "model.transformer.blocks.{bid}.v_proj", # llada 
"layers.{bid}.self_attn.v_proj", # qwen3-embedding + "model.layers.{bid}.self_attn.language_expert_value", # cogvlm ), # Attention output @@ -260,6 +264,7 @@ class TensorNameMap: "transformer_encoder.{bid}.wo", # neobert "model.transformer.blocks.{bid}.attn_out", # llada "layers.{bid}.self_attn.o_proj", # qwen3-embedding + "model.layers.{bid}.self_attn.language_expert_dense", # cogvlm ), # Attention output norm @@ -387,6 +392,7 @@ class TensorNameMap: "model.layers.{bid}.block_sparse_moe.up", # smallthinker "model.transformer.blocks.{bid}.up_proj", # llada "layers.{bid}.mlp.up_proj", # qwen3-embedding + "model.layers.{bid}.mlp.language_mlp.up_proj", # cogvlm ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -415,21 +421,22 @@ class TensorNameMap: # Feed-forward gate MODEL_TENSOR.FFN_GATE: ( - "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2 - "layers.{bid}.feed_forward.w1", # llama-pth - "transformer.h.{bid}.mlp.w2", # qwen - "transformer.h.{bid}.mlp.c_fc2", # jais - "model.layers.layers.{bid}.mlp.gate_proj", # plamo - "model.layers.{bid}.feed_forward.w1", # internlm2 - "encoder.layers.{bid}.mlp.fc12", # nomic-bert - "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 (split up/gate, no longer used) - "transformer.h.{bid}.mlp.linear_1", # refact - "model.layers.{bid}.residual_mlp.w1", # arctic - "transformer.h.{bid}.mlp.c_fc_0", # exaone - "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid - "model.layers.{bid}.block_sparse_moe.gate", # smallthinker - "model.transformer.blocks.{bid}.ff_proj", # llada - "layers.{bid}.mlp.gate_proj", # qwen3-embedding + "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2 + "layers.{bid}.feed_forward.w1", # llama-pth + "transformer.h.{bid}.mlp.w2", # qwen + "transformer.h.{bid}.mlp.c_fc2", # jais + "model.layers.layers.{bid}.mlp.gate_proj", # plamo + "model.layers.{bid}.feed_forward.w1", # internlm2 + "encoder.layers.{bid}.mlp.fc12", # nomic-bert + "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 
(split up/gate, no longer used) + "transformer.h.{bid}.mlp.linear_1", # refact + "model.layers.{bid}.residual_mlp.w1", # arctic + "transformer.h.{bid}.mlp.c_fc_0", # exaone + "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid + "model.layers.{bid}.block_sparse_moe.gate", # smallthinker + "model.transformer.blocks.{bid}.ff_proj", # llada + "layers.{bid}.mlp.gate_proj", # qwen3-embedding + "model.layers.{bid}.mlp.language_mlp.gate_proj", # cogvlm ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ -481,6 +488,7 @@ class TensorNameMap: "model.layers.{bid}.block_sparse_moe.down", # smallthinker "model.transformer.blocks.{bid}.ff_out", # llada "layers.{bid}.mlp.down_proj", # qwen3-embedding + "model.layers.{bid}.mlp.language_mlp.down_proj", # cogvlm ), MODEL_TENSOR.FFN_DOWN_EXP: ( @@ -995,6 +1003,34 @@ class TensorNameMap: "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5 ), + MODEL_TENSOR.VISEXP_UP: ( + "model.layers.{bid}.mlp.vision_mlp.up_proj", # cogvlm + ), + + MODEL_TENSOR.VISEXP_GATE: ( + "model.layers.{bid}.mlp.vision_mlp.gate_proj", # cogvlm + ), + + MODEL_TENSOR.VISEXP_DOWN: ( + "model.layers.{bid}.mlp.vision_mlp.down_proj", # cogvlm + ), + + MODEL_TENSOR.VISEXP_ATTN_OUT: ( + "model.layers.{bid}.self_attn.vision_expert_dense", # cogvlm + ), + + MODEL_TENSOR.VISEXP_ATTN_Q: ( + "model.layers.{bid}.self_attn.vision_expert_query", # cogvlm + ), + + MODEL_TENSOR.VISEXP_ATTN_K: ( + "model.layers.{bid}.self_attn.vision_expert_key", # cogvlm + ), + + MODEL_TENSOR.VISEXP_ATTN_V: ( + "model.layers.{bid}.self_attn.vision_expert_value", # cogvlm + ), + ############################################################################ # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg MODEL_TENSOR.ENC_OUTPUT_NORM: ( From 613096adb02e5afef9e7f1b69fab6ab55a14a6d6 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Tue, 8 Jul 2025 23:02:36 +0000 Subject: [PATCH 02/16] Add tensor mapping for CogVLM visual encoder --- gguf-py/gguf/constants.py | 3 
+++ gguf-py/gguf/tensor_mapping.py | 24 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 6fe6d96ef157c..6e4dd71ec4543 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -607,6 +607,9 @@ class MODEL_TENSOR(IntEnum): V_RESMPL_QUERY = auto() # minicpmv V_TOK_EMBD_IMG_BREAK = auto() # pixtral V_MM_PATCH_MERGER = auto() # mistral small 3.1 + V_MM_UP = auto() # cogvlm + V_MM_DOWN = auto() # cogvlm + V_MM_GATE = auto() # cogvlm # audio (mtmd) A_ENC_EMBD_POS = auto() A_ENC_CONV1D = auto() diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 5543a886d999a..c8a22f4aa9c34 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -98,6 +98,7 @@ class TensorNameMap: "backbone.final_layer_norm", # wavtokenizer "model.norm", # llama4 "model.transformer.ln_f", # llada + "model.norm", # cogvlm ), # Rope frequencies @@ -1138,6 +1139,7 @@ class TensorNameMap: "model.mm_projector.mlp.mlp.{bid}", "vision_model.vision_adapter.mlp.fc{bid}", # llama 4 "mlp1.{bid}", # InternVL + "model.vision.linear_proj.linear_proj", # cogvlm ), MODEL_TENSOR.V_MMPROJ_PEG: ( @@ -1158,6 +1160,7 @@ class TensorNameMap: "vision_tower.patch_conv", # pixtral "vision_model.patch_embedding.linear", # llama 4 "visual.patch_embed.proj", # qwen2vl + "model.vision.patch_embedding", # cogvlm ), MODEL_TENSOR.V_ENC_EMBD_POS: ( @@ -1176,6 +1179,7 @@ class TensorNameMap: "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral "visual.blocks.{bid}.attn.q", # qwen2vl, generated + "model.vision.transformer.layers.{bid}.attention.query", # cogvlm ), MODEL_TENSOR.V_ENC_ATTN_Q_NORM: ( @@ -1191,6 +1195,7 @@ class TensorNameMap: "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral "visual.blocks.{bid}.attn.k", # qwen2vl, 
generated + "model.vision.transformer.layers.{bid}.attention.key", # cogvlm ), MODEL_TENSOR.V_ENC_ATTN_K_NORM: ( @@ -1206,6 +1211,7 @@ class TensorNameMap: "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral "visual.blocks.{bid}.attn.v", # qwen2vl, generated + "model.vision.transformer.layers.{bid}.attention.value", # cogvlm ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( @@ -1217,6 +1223,7 @@ class TensorNameMap: "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral "vision_model.model.layers.{bid}.input_layernorm", # llama4 "visual.blocks.{bid}.norm1", # qwen2vl + "model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm ), MODEL_TENSOR.V_ENC_ATTN_O: ( @@ -1228,6 +1235,7 @@ class TensorNameMap: "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral "visual.blocks.{bid}.attn.proj", # qwen2vl + "model.vision.transformer.layers.{bid}.attention.dense", # cogvlm ), MODEL_TENSOR.V_ENC_POST_ATTN_NORM: ( @@ -1239,6 +1247,7 @@ class TensorNameMap: "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4 "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral "visual.blocks.{bid}.norm2", # qwen2vl + "model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm ), MODEL_TENSOR.V_ENC_FFN_UP: ( @@ -1250,6 +1259,7 @@ class TensorNameMap: "vision_model.model.layers.{bid}.mlp.fc1", # llama4 "visual.blocks.{bid}.mlp.fc1", # qwen2vl "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl + "model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm ), MODEL_TENSOR.V_ENC_FFN_GATE: ( @@ -1266,6 +1276,7 @@ class TensorNameMap: "vision_model.model.layers.{bid}.mlp.fc2", # llama4 "visual.blocks.{bid}.mlp.fc2", # qwen2vl "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl + "model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm ), MODEL_TENSOR.V_LAYER_SCALE_1: ( @@ -1289,6 +1300,7 @@ class TensorNameMap: 
@ModelBase.register("CogVLMForCausalLM")
class CogVLMModel(LlamaModel):
    """Text-model converter registered for the ``CogVLMForCausalLM`` architecture.

    Builds on ``LlamaModel``; tensors under ``model.vision.`` are dropped here
    and fused ``query_key_value`` tensors are split into separate
    query / key / value tensors before name mapping.
    """

    model_arch = gguf.MODEL_ARCH.COGVLM

    def set_gguf_parameters(self):
        """Write GGUF metadata; currently defers entirely to the parent class."""
        super().set_gguf_parameters()

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one checkpoint tensor to zero, one or three GGUF tensors.

        Returns an empty list for ``model.vision.*`` tensors, three
        ``(name, tensor)`` pairs for a fused ``query_key_value`` tensor, and a
        single pair for everything else.
        """
        del bid  # block index is not needed for this mapping

        # vision tensors are not emitted by this converter
        if name.startswith("model.vision."):
            return []

        # the common case: a plain one-to-one rename
        if "query_key_value" not in name:
            return [(self.map_tensor_name(name), data_torch)]

        # fused QKV: cut into chunks of shape[0] // 3 rows along the first axis;
        # the three-way unpack fails loudly if that does not yield exactly 3 chunks
        rows_per_part = data_torch.shape[0] // 3
        q, k, v = data_torch.split(rows_per_part, dim=0)
        return [
            (self.map_tensor_name(name.replace("query_key_value", part)), piece)
            for part, piece in (("query", q), ("key", k), ("value", v))
        ]
@ModelBase.register("CogVLMForCausalLM")
class CogVLMVisionModel(MmprojModel):
    """Vision-side (mmproj) converter registered for ``CogVLMForCausalLM``.

    Only tensors under ``model.vision.`` are emitted; fused
    ``query_key_value`` tensors are split into query / key / value.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Copy the head count from hparams['num_heads'] into the vision hparams
        # under 'num_attention_heads'.
        # NOTE(review): presumably the key the base class looks up - confirm.
        head_count = self.hparams['num_heads']
        self.hparams_vision['num_attention_heads'] = head_count

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one checkpoint tensor to zero, one or three GGUF tensors.

        Returns an empty list for anything outside ``model.vision.``, three
        ``(name, tensor)`` pairs for a fused ``query_key_value`` tensor, and a
        single pair otherwise.
        """
        del bid  # block index is not needed for this mapping

        # everything that is not part of the vision tower is skipped here
        if not name.startswith("model.vision."):
            return []

        # plain one-to-one rename
        if "query_key_value" not in name:
            return [(self.map_tensor_name(name), data_torch)]

        # fused QKV: cut into chunks of shape[0] // 3 rows along the first axis;
        # the three-way unpack fails loudly if that does not yield exactly 3 chunks
        rows_per_part = data_torch.shape[0] // 3
        q, k, v = data_torch.split(rows_per_part, dim=0)
        return [
            (self.map_tensor_name(name.replace("query_key_value", part)), piece)
            for part, piece in (("query", q), ("key", k), ("value", v))
        ]
"query_key_value.weight" in name: # Slice tensor into three along first axis q, k, v = data_torch.split(data_torch.shape[0] // 3, dim=0) return [ diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index d0834b216e49a..6d884968677f5 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -612,6 +612,8 @@ class MODEL_TENSOR(IntEnum): V_MM_UP = auto() # cogvlm V_MM_DOWN = auto() # cogvlm V_MM_GATE = auto() # cogvlm + V_TOK_BOI = auto() # cogvlm + V_TOK_EOI = auto() # cogvlm # audio (mtmd) A_ENC_EMBD_POS = auto() A_ENC_CONV1D = auto() @@ -952,6 +954,11 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query", MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1 + MODEL_TENSOR.V_MM_UP: "mm.up", + MODEL_TENSOR.V_MM_DOWN: "mm.down", + MODEL_TENSOR.V_MM_GATE: "mm.gate", + MODEL_TENSOR.V_TOK_BOI: "v.boi", + MODEL_TENSOR.V_TOK_EOI: "v.eoi", # audio (mtmd) MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}", @@ -1020,6 +1027,11 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_RESMPL_QUERY, MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK, MODEL_TENSOR.V_MM_PATCH_MERGER, + MODEL_TENSOR.V_MM_UP, + MODEL_TENSOR.V_MM_DOWN, + MODEL_TENSOR.V_MM_GATE, + MODEL_TENSOR.V_TOK_BOI, + MODEL_TENSOR.V_TOK_EOI, # audio MODEL_TENSOR.A_ENC_EMBD_POS, MODEL_TENSOR.A_ENC_CONV1D, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index c8a22f4aa9c34..d42dc7e20ce77 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1150,6 +1150,7 @@ class TensorNameMap: "vision_tower.vision_model.embeddings.class_embedding", "model.vision_tower.embeddings.cls_token", # Intern-S1 "vision_model.class_embedding", # llama 4 + "model.vision.patch_embedding.cls_embedding", # cogvlm ), MODEL_TENSOR.V_ENC_EMBD_PATCH: ( @@ -1160,7 +1161,7 @@ class TensorNameMap: "vision_tower.patch_conv", # pixtral 
"vision_model.patch_embedding.linear", # llama 4 "visual.patch_embed.proj", # qwen2vl - "model.vision.patch_embedding", # cogvlm + "model.vision.patch_embedding.proj", # cogvlm ), MODEL_TENSOR.V_ENC_EMBD_POS: ( @@ -1169,6 +1170,7 @@ class TensorNameMap: "vpm.embeddings.position_embedding", "model.vision_model.embeddings.position_embedding", # SmolVLM "vision_model.positional_embedding_vlm", # llama 4 + "model.vision.patch_embedding.position_embedding", # cogvlm ), MODEL_TENSOR.V_ENC_ATTN_Q: ( @@ -1379,6 +1381,14 @@ class TensorNameMap: "model.vision.linear_proj.gate_proj", # cogvlm ), + MODEL_TENSOR.V_TOK_BOI: ( + "model.vision.boi", # cogvlm + ), + + MODEL_TENSOR.V_TOK_EOI: ( + "model.vision.eoi", # cogvlm + ), + # audio (mtmd) MODEL_TENSOR.A_ENC_EMBD_POS: ( From 73fbc1789990500cffb6c1c7ff998e87c9c6ee5b Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Tue, 15 Jul 2025 22:42:24 +0000 Subject: [PATCH 05/16] Add graph for CogVLM CLIP model --- gguf-py/gguf/constants.py | 3 + gguf-py/gguf/tensor_mapping.py | 2 +- tools/mtmd/clip-impl.h | 10 +++ tools/mtmd/clip.cpp | 115 +++++++++++++++++++++++++++++++-- 4 files changed, 123 insertions(+), 7 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 6d884968677f5..6ea232b9e83e4 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -609,6 +609,7 @@ class MODEL_TENSOR(IntEnum): V_RESMPL_QUERY = auto() # minicpmv V_TOK_EMBD_IMG_BREAK = auto() # pixtral V_MM_PATCH_MERGER = auto() # mistral small 3.1 + V_MM_POST_FC_NORM = auto() # cogvlm V_MM_UP = auto() # cogvlm V_MM_DOWN = auto() # cogvlm V_MM_GATE = auto() # cogvlm @@ -954,6 +955,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query", MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1 + MODEL_TENSOR.V_MM_POST_FC_NORM: "mm.post_fc_norm", # cogvlm MODEL_TENSOR.V_MM_UP: "mm.up", MODEL_TENSOR.V_MM_DOWN: 
"mm.down", MODEL_TENSOR.V_MM_GATE: "mm.gate", @@ -1027,6 +1029,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_RESMPL_QUERY, MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK, MODEL_TENSOR.V_MM_PATCH_MERGER, + MODEL_TENSOR.V_MM_POST_FC_NORM, MODEL_TENSOR.V_MM_UP, MODEL_TENSOR.V_MM_DOWN, MODEL_TENSOR.V_MM_GATE, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index d42dc7e20ce77..8eb1de45468d5 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1302,7 +1302,6 @@ class TensorNameMap: "model.vision_model.post_layernorm", # SmolVLM "vision_model.layernorm_post", # llama4 "visual.merger.ln_q", # qwen2vl - "model.vision.linear_proj.norm1", # cogvlm ), MODEL_TENSOR.V_MM_INP_PROJ: ( @@ -1311,6 +1310,7 @@ class TensorNameMap: MODEL_TENSOR.V_MM_INP_NORM: ( "multi_modal_projector.norm", + "model.vision.linear_proj.norm1", # cogvlm ), MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index c8822dcf5c34c..a84e70df3ce71 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -111,6 +111,14 @@ #define TN_MM_NORM_PRE "mm.a.norm_pre.%s" #define TN_MM_NORM_MID "mm.a.norm_mid.%s" +// cogvlm +#define TN_MM_POST_FC_NORM "mm.post_fc_norm.%s" +#define TN_MM_H_TO_4H "mm.up.%s" +#define TN_MM_GATE "mm.gate.%s" +#define TN_MM_4H_TO_H "mm.down.%s" +#define TN_TOK_BOI "v.boi" +#define TN_TOK_EOI "v.eoi" + // align x to upper multiple of n #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) @@ -133,6 +141,7 @@ enum projector_type { PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx PROJECTOR_TYPE_VOXTRAL, PROJECTOR_TYPE_UNKNOWN, + PROJECTOR_TYPE_COGVLM, }; static std::map PROJECTOR_TYPE_NAMES = { @@ -152,6 +161,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_QWEN2A, "qwen2a"}, { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, + { PROJECTOR_TYPE_COGVLM, "cogvlm"}, }; static projector_type 
clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 20c2173314a4a..9c50e8db3d669 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -355,6 +355,15 @@ struct clip_model { ggml_tensor * mm_norm_pre_w = nullptr; ggml_tensor * mm_norm_mid_w = nullptr; + // cogvlm + ggml_tensor * mm_post_fc_norm_w = nullptr; + ggml_tensor * mm_post_fc_norm_b = nullptr; + ggml_tensor * mm_h_to_4h_w = nullptr; + ggml_tensor * mm_gate_w = nullptr; + ggml_tensor * mm_4h_to_h_w = nullptr; + ggml_tensor * mm_boi = nullptr; + ggml_tensor * mm_eoi = nullptr; + bool audio_has_avgpool() const { return proj_type == PROJECTOR_TYPE_QWEN2A || proj_type == PROJECTOR_TYPE_VOXTRAL; @@ -1556,6 +1565,66 @@ struct clip_graph { return gf; } + // cogvlm vision encoder + ggml_cgraph * build_cogvlm() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; // +1 for [CLS] + + // build input and concatenate class embedding + ggml_tensor * inp = build_inp(); + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + // build ViT transformer + ggml_tensor * cur = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + hparams.ffn_op, + model.position_embeddings, + nullptr); + + // remove CLS token (like build_llama4 does) + cur = ggml_view_2d(ctx0, cur, + n_embd, n_patches, + ggml_row_size(cur->type, n_embd), 0); + + // Multiply with mm_model_proj + cur = ggml_mul_mat(ctx0, model.mm_model_proj_w, cur); + + // Apply layernorm, weight, bias + cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); + + // Apply GELU + // TODO: Not 100% sure about gelu and silu configuration + cur = ggml_gelu_inplace(ctx0, cur); + + // Branch 1: multiply with mm_h_to_4h_w + ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur); + + // Branch 2: multiply with mm_gate_w + ggml_tensor * gate = ggml_mul_mat(ctx0, 
model.mm_gate_w, cur); + + // Apply silu + gate = ggml_silu_inplace(ctx0, gate); + + // Multiply together + cur = ggml_mul(ctx0, gate, h_to_4h); + + // Apply mm_4h_to_h_w + cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, cur); + + // Concatenate with boi and eoi + // TODO: The shape may be incorrect + cur = ggml_concat(ctx0, model.mm_boi, cur, 1); + cur = ggml_concat(ctx0, cur, model.mm_eoi, 1); + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; + } + private: // // utility functions @@ -1601,9 +1670,14 @@ struct clip_graph { auto & layer = model.layers[il]; ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); - cb(cur, "layer_inp_normed", il); + // Check if this is COGVLM projector type for post-norm layernorm order + const bool is_cogvlm = ctx->proj_type() == PROJECTOR_TYPE_COGVLM; + + // layernorm1 (only for non-COGVLM) + if (!is_cogvlm) { + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "layer_inp_normed", il); + } // self-attention { @@ -1657,6 +1731,12 @@ struct clip_graph { cb(cur, "attn_out_scaled", il); } + // Apply layernorm AFTER attention for COGVLM (post-norm) + if (is_cogvlm) { + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "attn_post_norm", il); + } + // re-add the layer input, e.g., residual cur = ggml_add(ctx0, cur, inpL); @@ -1664,9 +1744,11 @@ struct clip_graph { cb(cur, "ffn_inp", il); - // layernorm2 - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); - cb(cur, "ffn_inp_normed", il); + // layernorm2 (only for non-COGVLM) + if (!is_cogvlm) { + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + } // ffn cur = build_ffn(cur, @@ -1682,6 +1764,12 @@ struct clip_graph { cb(cur, "ffn_out_scaled", il); } + // Apply layernorm AFTER MLP for COGVLM (post-norm) + if (is_cogvlm) { + cur = build_norm(cur, 
layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_post_norm", il); + } + // residual 2 cur = ggml_add(ctx0, inpL, cur); cb(cur, "layer_out", il); @@ -2008,6 +2096,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { res = graph.build_whisper_enc(); } break; + case PROJECTOR_TYPE_COGVLM: + { + res = graph.build_cogvlm(); + } break; default: { res = graph.build_llava(); @@ -2589,6 +2681,17 @@ struct clip_model_loader { model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); } break; + case PROJECTOR_TYPE_COGVLM: + { + model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); + model.mm_post_fc_norm_w = get_tensor(string_format(TN_MM_POST_FC_NORM, "weight")); + model.mm_post_fc_norm_b = get_tensor(string_format(TN_MM_POST_FC_NORM, "bias")); + model.mm_h_to_4h_w = get_tensor(string_format(TN_MM_H_TO_4H, "weight")); + model.mm_gate_w = get_tensor(string_format(TN_MM_GATE, "weight")); + model.mm_4h_to_h_w = get_tensor(string_format(TN_MM_4H_TO_H, "weight")); + model.mm_boi = get_tensor(TN_TOK_BOI); + model.mm_eoi = get_tensor(TN_TOK_EOI); + } break; default: GGML_ASSERT(false && "unknown projector type"); } From 6c7327f7f7f0228f0b71233a515ef3679aff3875 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Thu, 17 Jul 2025 01:00:30 +0000 Subject: [PATCH 06/16] Add graph for CogVLM --- src/llama-arch.cpp | 32 ++++++++++ src/llama-arch.h | 8 +++ src/llama-model.cpp | 146 ++++++++++++++++++++++++++++++++++++++++++++ src/llama-model.h | 9 +++ 4 files changed, 195 insertions(+) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 18dcc6ddfe567..a10857c341e32 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -93,6 +93,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_DREAM, "dream" }, { LLM_ARCH_SMALLTHINKER, "smallthinker" }, { LLM_ARCH_LLADA, "llada" }, + { LLM_ARCH_COGVLM, "cogvlm" }, { LLM_ARCH_UNKNOWN, 
"(unknown)" }, }; @@ -2067,6 +2068,30 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_COGVLM, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_VISEXP_ATTN_WQ, "blk.%d.visexp_attn_wq" }, + { LLM_TENSOR_VISEXP_ATTN_WK, "blk.%d.visexp_attn_wk" }, + { LLM_TENSOR_VISEXP_ATTN_WV, "blk.%d.visexp_attn_wv" }, + { LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.visexp_attn_wo" }, + { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.visexp_ffn_gate" }, + { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.visexp_ffn_down" }, + { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.visexp_ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2238,6 +2263,13 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}}, {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_VISEXP_ATTN_WQ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_VISEXP_ATTN_WK, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_VISEXP_ATTN_WV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_VISEXP_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_VISEXP_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, // NextN/MTP tensors are currently 
ignored (reserved for future MTP support) // These tensors only exist in the last layer(s) and are treated as output tensors {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 7af587e7951bc..9a42780bd2d92 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -97,6 +97,7 @@ enum llm_arch { LLM_ARCH_DREAM, LLM_ARCH_SMALLTHINKER, LLM_ARCH_LLADA, + LLM_ARCH_COGVLM, LLM_ARCH_UNKNOWN, }; @@ -413,6 +414,13 @@ enum llm_tensor { LLM_TENSOR_SHORTCONV_CONV, LLM_TENSOR_SHORTCONV_INPROJ, LLM_TENSOR_SHORTCONV_OUTPROJ, + LLM_TENSOR_VISEXP_ATTN_WQ, + LLM_TENSOR_VISEXP_ATTN_WK, + LLM_TENSOR_VISEXP_ATTN_WV, + LLM_TENSOR_VISEXP_ATTN_OUT, + LLM_TENSOR_VISEXP_FFN_GATE, + LLM_TENSOR_VISEXP_FFN_DOWN, + LLM_TENSOR_VISEXP_FFN_UP, LLM_TENSOR_NEXTN_EH_PROJ, LLM_TENSOR_NEXTN_EMBED_TOKENS, LLM_TENSOR_NEXTN_ENORM, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 58ca7df707ef3..0363b8449512d 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -5535,6 +5535,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); } } break; + case LLM_ARCH_COGVLM: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + 
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + layer.visexp_attn_wq = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WQ, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.visexp_attn_wk = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WK, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.visexp_attn_wv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WV, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -18034,6 +18062,120 @@ struct llm_build_smallthinker : public llm_graph_context{ } }; +struct llm_build_cogvlm : public llm_graph_context { + llm_build_cogvlm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + float kq_scale = 1.0f / sqrtf(float(n_embd_head)); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * inpL, * cur; + inpL = build_inp_embd(model.tok_embd); + + // check ubatch to see if we have input tokens (text) + // or an input embedding vector (image) + bool is_text; + if (ubatch.token) { + is_text = true; + } else { + is_text = false; + } + + for (int il = 0; il < n_layer; ++il) { + // get either the text or image weight tensors + ggml_tensor * wq, * wk, * wv, * wo; + ggml_tensor * ffn_gate, * ffn_down, * ffn_up; + + if (is_text) { + wq = model.layers[il].wq; + wk = 
model.layers[il].wk; + wv = model.layers[il].wv; + wo = model.layers[il].wo; + ffn_gate = model.layers[il].ffn_gate; + ffn_down = model.layers[il].ffn_down; + ffn_up = model.layers[il].ffn_up; + } else { + wq = model.layers[il].visexp_attn_wq; + wk = model.layers[il].visexp_attn_wk; + wv = model.layers[il].visexp_attn_wv; + wo = model.layers[il].visexp_attn_wo; + ffn_gate = model.layers[il].visexp_ffn_gate; + ffn_down = model.layers[il].visexp_ffn_down; + ffn_up = model.layers[il].visexp_ffn_up; + } + + ggml_tensor * inpSA = inpL; + cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + + // build self attention + { + ggml_tensor * Qcur = build_lora_mm(wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + ggml_tensor * Kcur = build_lora_mm(wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + ggml_tensor * Vcur = build_lora_mm(wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + // TODO: Check Rope because this might not be the same as cogvlm + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + + cur = build_attn(inp_attn, gf, wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); 
+ + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // Make a standard ffn without the build_ffn function + ggml_tensor * tmp = build_lora_mm(ffn_up, cur); + ggml_tensor * gate = build_lora_mm(ffn_gate, cur); + gate = ggml_silu(ctx0, gate); + cur = ggml_mul(ctx0, gate, tmp); + cur = build_lora_mm(ffn_down, cur); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res->t_embd = cur; + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res->t_logits = cur; + ggml_build_forward_expand(gf, cur); + + } +}; + llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const { llama_memory_i * res; @@ -18499,6 +18641,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { llm = std::make_unique>(*this, params); } } break; + case LLM_ARCH_COGVLM: + { + llm = std::make_unique(*this, params, gf); + } break; default: GGML_ABORT("fatal error"); } diff --git a/src/llama-model.h b/src/llama-model.h index 6fcd74d57fdca..c291c6763910a 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -367,6 +367,15 @@ struct llama_layer { // openai-moe struct ggml_tensor * attn_sinks = nullptr; + // cogvlm + struct ggml_tensor * visexp_attn_wq = nullptr; + struct ggml_tensor * visexp_attn_wk = nullptr; + struct ggml_tensor * visexp_attn_wv = nullptr; + struct ggml_tensor * visexp_attn_wo = nullptr; + struct ggml_tensor * visexp_ffn_gate = nullptr; + struct ggml_tensor * visexp_ffn_down = nullptr; + struct ggml_tensor * visexp_ffn_up = nullptr; + struct llama_layer_posnet posnet; struct llama_layer_convnext convnext; From 22b80e73f4ce7c9bf58b269029830bf56fcb08f1 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Mon, 21 Jul 2025 21:58:14 +0000 Subject: [PATCH 07/16] Fixes for CogVLM. 
Now compiles. --- src/llama-model.cpp | 5 +++++ tools/mtmd/clip.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0363b8449512d..1f349459a43c5 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -18073,6 +18073,10 @@ struct llm_build_cogvlm : public llm_graph_context { ggml_tensor * inpL, * cur; inpL = build_inp_embd(model.tok_embd); + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv_unified(); + // check ubatch to see if we have input tokens (text) // or an input embedding vector (image) bool is_text; @@ -18804,6 +18808,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ARCEE: case LLM_ARCH_ERNIE4_5: case LLM_ARCH_ERNIE4_5_MOE: + case LLM_ARCH_COGVLM: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9c50e8db3d669..86d1a3ead9e28 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1590,7 +1590,7 @@ struct clip_graph { ggml_row_size(cur->type, n_embd), 0); // Multiply with mm_model_proj - cur = ggml_mul_mat(ctx0, model.mm_model_proj_w, cur); + cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); // Apply layernorm, weight, bias cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); From d3885657c3124df9a632bc4840aa895552483ab1 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Tue, 22 Jul 2025 00:15:22 +0000 Subject: [PATCH 08/16] Model now runs --- convert_hf_to_gguf.py | 5 +++++ gguf-py/gguf/constants.py | 1 + gguf-py/gguf/tensor_mapping.py | 6 +++++- src/llama-arch.cpp | 14 +++++++------- src/llama-model.cpp | 23 +++++++++++++++++++++-- tools/mtmd/clip.cpp | 7 +++++++ 6 files changed, 46 insertions(+), 10 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9dcf29125bd2c..808f9de395f4e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ 
-8310,6 +8310,11 @@ class CogVLMVisionModel(MmprojModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.hparams_vision['num_attention_heads'] = self.hparams['num_heads'] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 6ea232b9e83e4..2ed55cabfb880 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -2888,6 +2888,7 @@ class VisionProjectorType: QWEN2A = "qwen2a" # audio QWEN25O = "qwen2.5o" # omni VOXTRAL = "voxtral" + COGVLM = "cogvlm" # Items here are (block size, type size) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 8eb1de45468d5..08fca68477886 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1133,13 +1133,13 @@ class TensorNameMap: MODEL_TENSOR.V_MMPROJ_FC: ( "model.connector.modality_projection.proj", # SmolVLM + "model.vision.linear_proj.linear_proj", # cogvlm ), MODEL_TENSOR.V_MMPROJ_MLP: ( "model.mm_projector.mlp.mlp.{bid}", "vision_model.vision_adapter.mlp.fc{bid}", # llama 4 "mlp1.{bid}", # InternVL - "model.vision.linear_proj.linear_proj", # cogvlm ), MODEL_TENSOR.V_MMPROJ_PEG: ( @@ -1369,6 +1369,10 @@ class TensorNameMap: "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 ), + MODEL_TENSOR.V_MM_POST_FC_NORM: ( + "model.vision.linear_proj.norm1", # cogvlm + ), + MODEL_TENSOR.V_MM_UP: ( "model.vision.linear_proj.dense_h_to_4h", # cogvlm ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index a10857c341e32..0347eb2d127b9 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -2083,13 +2083,13 @@ static const std::map> 
LLM_TENSOR_N { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_VISEXP_ATTN_WQ, "blk.%d.visexp_attn_wq" }, - { LLM_TENSOR_VISEXP_ATTN_WK, "blk.%d.visexp_attn_wk" }, - { LLM_TENSOR_VISEXP_ATTN_WV, "blk.%d.visexp_attn_wv" }, - { LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.visexp_attn_wo" }, - { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.visexp_ffn_gate" }, - { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.visexp_ffn_down" }, - { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.visexp_ffn_up" }, + { LLM_TENSOR_VISEXP_ATTN_WQ, "blk.%d.vis_attn_q" }, + { LLM_TENSOR_VISEXP_ATTN_WK, "blk.%d.vis_attn_k" }, + { LLM_TENSOR_VISEXP_ATTN_WV, "blk.%d.vis_attn_v" }, + { LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.vis_attn_output" }, + { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" }, + { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" }, + { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" }, }, }, { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1f349459a43c5..bb33b6246b05b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1872,6 +1872,14 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_COGVLM: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 32: type = LLM_TYPE_13B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; default: throw std::runtime_error("unsupported model architecture"); } @@ -5543,6 +5551,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + for (int i = 0; i < n_layer; ++i) { auto & layer = 
layers[i]; @@ -5557,10 +5570,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.visexp_attn_wv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WV, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + + layer.visexp_ffn_gate = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.visexp_ffn_down = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.visexp_ffn_up = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; default: @@ -18140,8 +18159,8 @@ struct llm_build_cogvlm : public llm_graph_context { Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // TODO: Check Rope because this might not be the same as cogvlm - Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); - Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, GGML_ROPE_TYPE_NEOX); + Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, GGML_ROPE_TYPE_NEOX); cur = build_attn(inp_attn, gf, wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); diff --git a/tools/mtmd/clip.cpp 
b/tools/mtmd/clip.cpp index 86d1a3ead9e28..fab9cad93e3c2 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3730,6 +3730,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im n_patches_sq /= 2; } } break; + case PROJECTOR_TYPE_COGVLM: + { + n_patches_sq += 2; + } break; default: GGML_ABORT("unsupported projector type"); } @@ -4135,6 +4139,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_COGVLM: { // do nothing } break; @@ -4249,6 +4254,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->model.mm_model_proj->ne[1]; case PROJECTOR_TYPE_QWEN2A: return ctx->model.mm_fc_w->ne[1]; + case PROJECTOR_TYPE_COGVLM: + return ctx->model.mm_4h_to_h_w->ne[1]; default: GGML_ABORT("Unknown projector type"); } From 4905bdefbc92fdcb137063368211162668c3b576 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Tue, 29 Jul 2025 21:30:46 +0000 Subject: [PATCH 09/16] Fixes for cogvlm graph --- src/llama-model.cpp | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index bb33b6246b05b..4c5f26bd4d4f5 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -18135,24 +18135,12 @@ struct llm_build_cogvlm : public llm_graph_context { { ggml_tensor * Qcur = build_lora_mm(wq, cur); cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } ggml_tensor * Kcur = build_lora_mm(wk, cur); cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } ggml_tensor * Vcur = build_lora_mm(wv, cur); cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, 
n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); @@ -18173,11 +18161,17 @@ struct llm_build_cogvlm : public llm_graph_context { cb(cur, "ffn_norm", il); // Make a standard ffn without the build_ffn function - ggml_tensor * tmp = build_lora_mm(ffn_up, cur); - ggml_tensor * gate = build_lora_mm(ffn_gate, cur); - gate = ggml_silu(ctx0, gate); - cur = ggml_mul(ctx0, gate, tmp); - cur = build_lora_mm(ffn_down, cur); + //ggml_tensor * tmp = build_lora_mm(ffn_up, cur); + //ggml_tensor * gate = build_lora_mm(ffn_gate, cur); + //gate = ggml_silu(ctx0, gate); + //cur = ggml_mul(ctx0, gate, tmp); + //cur = build_lora_mm(ffn_down, cur); + cur = build_ffn(cur, + ffn_up, NULL, NULL, + ffn_gate, NULL, NULL, + ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); From f59abc5a485f139a3af20ba153731bb41ca50a6c Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Sun, 10 Aug 2025 18:38:15 +0000 Subject: [PATCH 10/16] Account for graph context change after rebase --- src/llama-model.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 4c5f26bd4d4f5..438d3cd4b47ec 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -18082,7 +18082,7 @@ struct llm_build_smallthinker : public llm_graph_context{ }; struct llm_build_cogvlm : public llm_graph_context { - llm_build_cogvlm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; float kq_scale = 1.0f / sqrtf(float(n_embd_head)); @@ -18150,7 +18150,7 @@ struct llm_build_cogvlm : public llm_graph_context { Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, GGML_ROPE_TYPE_NEOX); Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, GGML_ROPE_TYPE_NEOX); - 
cur = build_attn(inp_attn, gf, wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + cur = build_attn(inp_attn, wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); } @@ -18660,7 +18660,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { } break; case LLM_ARCH_COGVLM: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; default: GGML_ABORT("fatal error"); From 1b7f34df7fc98bfdd7c0f19bc6620083f792ffa8 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Sun, 10 Aug 2025 19:10:57 +0000 Subject: [PATCH 11/16] Changes for whitespace --- convert_hf_to_gguf.py | 9 +++++---- gguf-py/gguf/tensor_mapping.py | 4 ++-- tools/mtmd/clip.cpp | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 808f9de395f4e..3fcd3b6113340 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8315,7 +8315,7 @@ def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM) - + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -8330,13 +8330,14 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter (self.map_tensor_name(name.replace("query_key_value", "key")), k), (self.map_tensor_name(name.replace("query_key_value", "value")), v), ] - + return [(self.map_tensor_name(name), data_torch)] + @ModelBase.register("CogVLMForCausalLM") class CogVLMModel(LlamaModel): model_arch = gguf.MODEL_ARCH.COGVLM - + def set_gguf_parameters(self): super().set_gguf_parameters() @@ -8346,7 +8347,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # block vision tensors if name.startswith("model.vision."): return [] - + if 
"query_key_value.weight" in name: # Slice tensor into three along first axis q, k, v = data_torch.split(data_torch.shape[0] // 3, dim=0) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 08fca68477886..a7154af6ddf6c 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1023,7 +1023,7 @@ class TensorNameMap: MODEL_TENSOR.VISEXP_ATTN_Q: ( "model.layers.{bid}.self_attn.vision_expert_query", # cogvlm ), - + MODEL_TENSOR.VISEXP_ATTN_K: ( "model.layers.{bid}.self_attn.vision_expert_key", # cogvlm ), @@ -1388,7 +1388,7 @@ class TensorNameMap: MODEL_TENSOR.V_TOK_BOI: ( "model.vision.boi", # cogvlm ), - + MODEL_TENSOR.V_TOK_EOI: ( "model.vision.eoi", # cogvlm ), diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index fab9cad93e3c2..30b2eea2f996d 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1607,7 +1607,7 @@ struct clip_graph { // Apply silu gate = ggml_silu_inplace(ctx0, gate); - + // Multiply together cur = ggml_mul(ctx0, gate, h_to_4h); From 11c5dfd2b2ee25606843474fde614a932f1a8418 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Sun, 10 Aug 2025 19:12:52 +0000 Subject: [PATCH 12/16] Changes in convert script according to comments --- convert_hf_to_gguf.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3fcd3b6113340..2ae6be2c52c10 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1261,7 +1261,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"])) self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"])) self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys)) - self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"])) + self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads"])) # preprocessor config 
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"]) @@ -8307,9 +8307,6 @@ def prepare_tensors(self): @ModelBase.register("CogVLMForCausalLM") class CogVLMVisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.hparams_vision['num_attention_heads'] = self.hparams['num_heads'] def set_gguf_parameters(self): super().set_gguf_parameters() @@ -8338,9 +8335,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter class CogVLMModel(LlamaModel): model_arch = gguf.MODEL_ARCH.COGVLM - def set_gguf_parameters(self): - super().set_gguf_parameters() - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused From e96923893b549f9f5edb94b3266359948ecb5fef Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Sun, 10 Aug 2025 19:33:39 +0000 Subject: [PATCH 13/16] Switch CogVLM LLM graph to merged QKV tensor --- convert_hf_to_gguf.py | 9 --------- gguf-py/gguf/constants.py | 16 ++++------------ gguf-py/gguf/tensor_mapping.py | 16 +++------------- src/llama-arch.cpp | 12 +++--------- src/llama-arch.h | 4 +--- src/llama-model.cpp | 35 +++++++++++++--------------------- src/llama-model.h | 4 +--- 7 files changed, 25 insertions(+), 71 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2ae6be2c52c10..2a9367e3b3708 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8342,15 +8342,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.startswith("model.vision."): return [] - if "query_key_value.weight" in name: - # Slice tensor into three along first axis - q, k, v = data_torch.split(data_torch.shape[0] // 3, dim=0) - return [ - (self.map_tensor_name(name.replace("query_key_value", "query")), q), - (self.map_tensor_name(name.replace("query_key_value", "key")), k), - (self.map_tensor_name(name.replace("query_key_value", "value")), v), - ] - 
return [(self.map_tensor_name(name), data_torch)] ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2ed55cabfb880..babf2ca0c01cc 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -562,9 +562,7 @@ class MODEL_TENSOR(IntEnum): SHORTCONV_CONV = auto() SHORTCONV_INPROJ = auto() SHORTCONV_OUTPROJ = auto() - VISEXP_ATTN_Q = auto() - VISEXP_ATTN_K = auto() - VISEXP_ATTN_V = auto() + VISEXP_ATTN_QKV = auto() VISEXP_ATTN_OUT = auto() VISEXP_GATE = auto() VISEXP_DOWN = auto() @@ -908,9 +906,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SHORTCONV_CONV: "blk.{bid}.shortconv.conv", MODEL_TENSOR.SHORTCONV_INPROJ: "blk.{bid}.shortconv.in_proj", MODEL_TENSOR.SHORTCONV_OUTPROJ: "blk.{bid}.shortconv.out_proj", - MODEL_TENSOR.VISEXP_ATTN_Q: "blk.{bid}.vis_attn_q", - MODEL_TENSOR.VISEXP_ATTN_K: "blk.{bid}.vis_attn_k", - MODEL_TENSOR.VISEXP_ATTN_V: "blk.{bid}.vis_attn_v", + MODEL_TENSOR.VISEXP_ATTN_QKV: "blk.{bid}.vis_attn_qkv", MODEL_TENSOR.VISEXP_ATTN_OUT: "blk.{bid}.vis_attn_output", MODEL_TENSOR.VISEXP_GATE: "blk.{bid}.vis_gate", MODEL_TENSOR.VISEXP_DOWN: "blk.{bid}.vis_down", @@ -2649,17 +2645,13 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_QKV, MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.VISEXP_ATTN_Q, - MODEL_TENSOR.VISEXP_ATTN_K, - MODEL_TENSOR.VISEXP_ATTN_V, + MODEL_TENSOR.VISEXP_ATTN_QKV, MODEL_TENSOR.VISEXP_ATTN_OUT, MODEL_TENSOR.VISEXP_GATE, MODEL_TENSOR.VISEXP_UP, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index a7154af6ddf6c..5c669f7cc2e9a 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -176,6 +176,7 @@ class TensorNameMap: "encoder.layers.{bid}.self_attention.query_key_value", # chatglm 
"transformer.layers.{bid}.attn.qkv_proj", # openelm "transformer_encoder.{bid}.qkv", # neobert + "model.layers.{bid}.self_attn.language_expert_query_key_value", # cogvlm ), # Attention query @@ -193,7 +194,6 @@ class TensorNameMap: "model.layers.{bid}.self_attn.q_proj", # llama4 "model.transformer.blocks.{bid}.q_proj", # llada "layers.{bid}.self_attn.q_proj", # qwen3-embedding - "model.layers.{bid}.self_attn.language_expert_query", # cogvlm ), # Attention key @@ -212,7 +212,6 @@ class TensorNameMap: "model.layers.{bid}.self_attn.k_proj", # llama4 "model.transformer.blocks.{bid}.k_proj", # llada "layers.{bid}.self_attn.k_proj", # qwen3-embedding - "model.layers.{bid}.self_attn.language_expert_key", # cogvlm ), # Attention value @@ -230,7 +229,6 @@ class TensorNameMap: "model.layers.{bid}.self_attn.v_proj", # llama4 "model.transformer.blocks.{bid}.v_proj", # llada "layers.{bid}.self_attn.v_proj", # qwen3-embedding - "model.layers.{bid}.self_attn.language_expert_value", # cogvlm ), # Attention output @@ -1020,16 +1018,8 @@ class TensorNameMap: "model.layers.{bid}.self_attn.vision_expert_dense", # cogvlm ), - MODEL_TENSOR.VISEXP_ATTN_Q: ( - "model.layers.{bid}.self_attn.vision_expert_query", # cogvlm - ), - - MODEL_TENSOR.VISEXP_ATTN_K: ( - "model.layers.{bid}.self_attn.vision_expert_key", # cogvlm - ), - - MODEL_TENSOR.VISEXP_ATTN_V: ( - "model.layers.{bid}.self_attn.vision_expert_value", # cogvlm + MODEL_TENSOR.VISEXP_ATTN_QKV: ( + "model.layers.{bid}.self_attn.vision_expert_query_key_value", # cogvlm ), ############################################################################ diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 0347eb2d127b9..7cc9c67651e07 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -2075,17 +2075,13 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, 
"blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_VISEXP_ATTN_WQ, "blk.%d.vis_attn_q" }, - { LLM_TENSOR_VISEXP_ATTN_WK, "blk.%d.vis_attn_k" }, - { LLM_TENSOR_VISEXP_ATTN_WV, "blk.%d.vis_attn_v" }, + { LLM_TENSOR_VISEXP_ATTN_QKV, "blk.%d.vis_attn_qkv" }, { LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.vis_attn_output" }, { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" }, { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" }, @@ -2263,9 +2259,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}}, {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_VISEXP_ATTN_WQ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_VISEXP_ATTN_WK, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_VISEXP_ATTN_WV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_VISEXP_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_VISEXP_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 9a42780bd2d92..55323c12f0b8d 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -414,9 +414,7 @@ enum llm_tensor { LLM_TENSOR_SHORTCONV_CONV, LLM_TENSOR_SHORTCONV_INPROJ, LLM_TENSOR_SHORTCONV_OUTPROJ, - LLM_TENSOR_VISEXP_ATTN_WQ, - LLM_TENSOR_VISEXP_ATTN_WK, - LLM_TENSOR_VISEXP_ATTN_WV, + LLM_TENSOR_VISEXP_ATTN_QKV, LLM_TENSOR_VISEXP_ATTN_OUT, LLM_TENSOR_VISEXP_FFN_GATE, 
LLM_TENSOR_VISEXP_FFN_DOWN, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 438d3cd4b47ec..8453004bf0905 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -5560,14 +5560,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) { auto & layer = layers[i]; layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); - layer.visexp_attn_wq = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WQ, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.visexp_attn_wk = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WK, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.visexp_attn_wv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WV, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.visexp_attn_wqkv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0); layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? 
TENSOR_DUPLICATED : 0)); @@ -18107,21 +18103,17 @@ struct llm_build_cogvlm : public llm_graph_context { for (int il = 0; il < n_layer; ++il) { // get either the text or image weight tensors - ggml_tensor * wq, * wk, * wv, * wo; + ggml_tensor * wqkv, * wo; ggml_tensor * ffn_gate, * ffn_down, * ffn_up; if (is_text) { - wq = model.layers[il].wq; - wk = model.layers[il].wk; - wv = model.layers[il].wv; + wqkv = model.layers[il].wqkv; wo = model.layers[il].wo; ffn_gate = model.layers[il].ffn_gate; ffn_down = model.layers[il].ffn_down; ffn_up = model.layers[il].ffn_up; } else { - wq = model.layers[il].visexp_attn_wq; - wk = model.layers[il].visexp_attn_wk; - wv = model.layers[il].visexp_attn_wv; + wqkv = model.layers[il].visexp_attn_wqkv; wo = model.layers[il].visexp_attn_wo; ffn_gate = model.layers[il].visexp_ffn_gate; ffn_down = model.layers[il].visexp_ffn_down; @@ -18133,17 +18125,16 @@ struct llm_build_cogvlm : public llm_graph_context { // build self attention { - ggml_tensor * Qcur = build_lora_mm(wq, cur); - cb(Qcur, "Qcur", il); + ggml_tensor * qkv = build_lora_mm(wqkv, cur); - ggml_tensor * Kcur = build_lora_mm(wk, cur); - cb(Kcur, "Kcur", il); + // split qkv into Q, K, V along the first dimension + ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), + qkv->nb[1], 0); + ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), + qkv->nb[1], n_embd * ggml_element_size(qkv)); + ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens, + qkv->nb[1], 2 * n_embd * ggml_element_size(qkv))); - ggml_tensor * Vcur = build_lora_mm(wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // TODO: Check Rope because this might not be the same as cogvlm diff --git 
a/src/llama-model.h b/src/llama-model.h index c291c6763910a..132fb6bd366f4 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -368,9 +368,7 @@ struct llama_layer { struct ggml_tensor * attn_sinks = nullptr; // cogvlm - struct ggml_tensor * visexp_attn_wq = nullptr; - struct ggml_tensor * visexp_attn_wk = nullptr; - struct ggml_tensor * visexp_attn_wv = nullptr; + struct ggml_tensor * visexp_attn_wqkv = nullptr; struct ggml_tensor * visexp_attn_wo = nullptr; struct ggml_tensor * visexp_ffn_gate = nullptr; struct ggml_tensor * visexp_ffn_down = nullptr; From 66cb20fa7517663b09ffbb451fb727c54ba61627 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Sun, 10 Aug 2025 19:57:09 +0000 Subject: [PATCH 14/16] Use rope_type variable instead of direct definition --- src/llama-model.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 8453004bf0905..ed7b202b89d03 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -18137,9 +18137,8 @@ struct llm_build_cogvlm : public llm_graph_context { Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - // TODO: Check Rope because this might not be the same as cogvlm - Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, GGML_ROPE_TYPE_NEOX); - Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, GGML_ROPE_TYPE_NEOX); + Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, rope_type); + Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, rope_type); cur = build_attn(inp_attn, wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); @@ -18151,12 +18150,6 @@ struct llm_build_cogvlm : public llm_graph_context { cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - // Make a standard ffn without the build_ffn function - //ggml_tensor * tmp = build_lora_mm(ffn_up, cur); - //ggml_tensor * gate = build_lora_mm(ffn_gate, cur); - //gate = ggml_silu(ctx0, gate); - 
//cur = ggml_mul(ctx0, gate, tmp); - //cur = build_lora_mm(ffn_down, cur); cur = build_ffn(cur, ffn_up, NULL, NULL, ffn_gate, NULL, NULL, @@ -18812,7 +18805,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ARCEE: case LLM_ARCH_ERNIE4_5: case LLM_ARCH_ERNIE4_5_MOE: - case LLM_ARCH_COGVLM: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 @@ -18858,6 +18850,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_LFM2: case LLM_ARCH_SMALLTHINKER: case LLM_ARCH_GLM4_MOE: + case LLM_ARCH_COGVLM: return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: From 76091ee25f28228740f3acb2137ec30ae4afcfad Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Sun, 10 Aug 2025 20:01:57 +0000 Subject: [PATCH 15/16] Change CogVLM CLIP encoder to use SWIGLU --- tools/mtmd/clip.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 30b2eea2f996d..b821975e06c74 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1596,7 +1596,6 @@ struct clip_graph { cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); // Apply GELU - // TODO: Not 100% sure about gelu and silu configuration cur = ggml_gelu_inplace(ctx0, cur); // Branch 1: multiply with mm_h_to_4h_w @@ -1606,16 +1605,12 @@ struct clip_graph { ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur); // Apply silu - gate = ggml_silu_inplace(ctx0, gate); - - // Multiply together - cur = ggml_mul(ctx0, gate, h_to_4h); + gate = ggml_swiglu_split(ctx0, gate, h_to_4h); // Apply mm_4h_to_h_w - cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, cur); + cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate); // Concatenate with boi and eoi - // TODO: The shape may be incorrect cur = ggml_concat(ctx0, model.mm_boi, cur, 1); cur = ggml_concat(ctx0, cur, model.mm_eoi, 1); From ac3992dd4da655f9f787254b30175eed2d17dc34 Mon Sep 17 00:00:00 
2001 From: Tianyue-Zhao Date: Sun, 10 Aug 2025 20:57:39 +0000 Subject: [PATCH 16/16] Switch CogVLM CLIP to use merged QKV --- convert_hf_to_gguf.py | 9 --- gguf-py/gguf/constants.py | 3 + gguf-py/gguf/tensor_mapping.py | 7 ++- tools/mtmd/clip-impl.h | 1 + tools/mtmd/clip.cpp | 106 +++++++++++++++++++++------------ 5 files changed, 77 insertions(+), 49 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2a9367e3b3708..875e52aafb19f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8319,15 +8319,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if not name.startswith("model.vision."): return [] - if "query_key_value" in name: - # Split tensor into three along first axis - q, k, v = data_torch.split(data_torch.shape[0] // 3, dim=0) - return [ - (self.map_tensor_name(name.replace("query_key_value", "query")), q), - (self.map_tensor_name(name.replace("query_key_value", "key")), k), - (self.map_tensor_name(name.replace("query_key_value", "value")), v), - ] - return [(self.map_tensor_name(name), data_torch)] diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index babf2ca0c01cc..a58c363c47296 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -576,6 +576,7 @@ class MODEL_TENSOR(IntEnum): V_ENC_EMBD_PATCH = auto() V_ENC_EMBD_POS = auto() V_ENC_INPUT_NORM = auto() + V_ENC_ATTN_QKV = auto() V_ENC_ATTN_Q = auto() V_ENC_ATTN_Q_NORM = auto() V_ENC_ATTN_K = auto() @@ -919,6 +920,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd", MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd", MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd", + MODEL_TENSOR.V_ENC_ATTN_QKV: "v.blk.{bid}.attn_qkv", MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q", MODEL_TENSOR.V_ENC_ATTN_Q_NORM: "v.blk.{bid}.attn_q_norm", MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k", @@ -994,6 +996,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_ENC_EMBD_PATCH, 
MODEL_TENSOR.V_ENC_EMBD_POS, MODEL_TENSOR.V_ENC_INPUT_NORM, + MODEL_TENSOR.V_ENC_ATTN_QKV, MODEL_TENSOR.V_ENC_ATTN_Q, MODEL_TENSOR.V_ENC_ATTN_Q_NORM, MODEL_TENSOR.V_ENC_ATTN_K, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 5c669f7cc2e9a..c7f452719e6df 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1163,6 +1163,10 @@ class TensorNameMap: "model.vision.patch_embedding.position_embedding", # cogvlm ), + MODEL_TENSOR.V_ENC_ATTN_QKV: ( + "model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm + ), + MODEL_TENSOR.V_ENC_ATTN_Q: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1 @@ -1171,7 +1175,6 @@ class TensorNameMap: "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral "visual.blocks.{bid}.attn.q", # qwen2vl, generated - "model.vision.transformer.layers.{bid}.attention.query", # cogvlm ), MODEL_TENSOR.V_ENC_ATTN_Q_NORM: ( @@ -1187,7 +1190,6 @@ class TensorNameMap: "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral "visual.blocks.{bid}.attn.k", # qwen2vl, generated - "model.vision.transformer.layers.{bid}.attention.key", # cogvlm ), MODEL_TENSOR.V_ENC_ATTN_K_NORM: ( @@ -1203,7 +1205,6 @@ class TensorNameMap: "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral "visual.blocks.{bid}.attn.v", # qwen2vl, generated - "model.vision.transformer.layers.{bid}.attention.value", # cogvlm ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index a84e70df3ce71..cd9bf1e6a3c83 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -59,6 +59,7 @@ #define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" 
postfix for backwrad compat #define TN_PATCH_EMBD_1 "v.patch_embd.weight.1" #define TN_PATCH_BIAS "v.patch_embd.bias" +#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s" #define TN_ATTN_K "%s.blk.%d.attn_k.%s" #define TN_ATTN_Q "%s.blk.%d.attn_q.%s" #define TN_ATTN_V "%s.blk.%d.attn_v.%s" diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index b821975e06c74..18fdf658278bd 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -211,6 +211,8 @@ struct clip_layer { ggml_tensor * q_b = nullptr; ggml_tensor * v_w = nullptr; ggml_tensor * v_b = nullptr; + ggml_tensor * qkv_w = nullptr; + ggml_tensor * qkv_b = nullptr; ggml_tensor * o_w = nullptr; ggml_tensor * o_b = nullptr; @@ -1576,18 +1578,65 @@ struct clip_graph { ggml_tensor * inp = build_inp(); inp = ggml_concat(ctx0, inp, model.class_embedding, 1); - // build ViT transformer - ggml_tensor * cur = build_vit( - inp, n_pos, - NORM_TYPE_NORMAL, - hparams.ffn_op, - model.position_embeddings, - nullptr); + inp = ggml_add(ctx0, inp, model.position_embeddings); + cb(inp, "inp_pos", -1); + + ggml_tensor * inpL = inp; + + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; + + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + + cur = ggml_add(ctx0, cur, layer.qkv_b); + + ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_pos, + cur->nb[1], 0)); + ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_pos, + cur->nb[1], n_embd * sizeof(float))); + ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_pos, + cur->nb[1], 2 * n_embd * sizeof(float))); + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + + cur 
= build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "attn_post_norm", il); + + cur = ggml_add(ctx0, cur, inpL); + inpL = cur; + + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "ffn_post_norm", il); + + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "layer_out", il); + inpL = cur; + + } // remove CLS token (like build_llama4 does) - cur = ggml_view_2d(ctx0, cur, + ggml_tensor * cur = ggml_view_2d(ctx0, inpL, n_embd, n_patches, - ggml_row_size(cur->type, n_embd), 0); + ggml_row_size(inpL->type, n_embd), 0); // Multiply with mm_model_proj cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); @@ -1665,14 +1714,9 @@ struct clip_graph { auto & layer = model.layers[il]; ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - // Check if this is COGVLM projector type for post-norm layernorm order - const bool is_cogvlm = ctx->proj_type() == PROJECTOR_TYPE_COGVLM; - - // layernorm1 (only for non-COGVLM) - if (!is_cogvlm) { - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); - cb(cur, "layer_inp_normed", il); - } + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "layer_inp_normed", il); // self-attention { @@ -1726,12 +1770,6 @@ struct clip_graph { cb(cur, "attn_out_scaled", il); } - // Apply layernorm AFTER attention for COGVLM (post-norm) - if (is_cogvlm) { - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); - cb(cur, "attn_post_norm", il); - } - // re-add the layer input, e.g., residual cur = ggml_add(ctx0, cur, inpL); @@ -1739,11 +1777,9 @@ struct clip_graph { cb(cur, "ffn_inp", il); - // layernorm2 (only for non-COGVLM) - if (!is_cogvlm) { - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); - cb(cur, 
"ffn_inp_normed", il); - } + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); // ffn cur = build_ffn(cur, @@ -1759,12 +1795,6 @@ struct clip_graph { cb(cur, "ffn_out_scaled", il); } - // Apply layernorm AFTER MLP for COGVLM (post-norm) - if (is_cogvlm) { - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); - cb(cur, "ffn_post_norm", il); - } - // residual 2 cur = ggml_add(ctx0, inpL, cur); cb(cur, "layer_out", il); @@ -2466,10 +2496,11 @@ struct clip_model_loader { model.layers.resize(hparams.n_layer); for (int il = 0; il < hparams.n_layer; ++il) { auto & layer = model.layers[il]; - layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight")); - layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight")); - layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight")); + layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false); + layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false); + layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"), false); layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight")); + layer.qkv_w = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "weight"), false); layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false); layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false); layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false); @@ -2481,6 +2512,7 @@ struct clip_model_loader { layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false); layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false); layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false); + layer.qkv_b = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "bias"), false); layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false); 
layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false);