
Commit 335ad88

Merge pull request #219 from menloresearch/update-dev-from-master-2025-08-27-00-11

Sync master with upstream release b6293

2 parents: a459ddc + ed349df

39 files changed: +1719 / -779 lines

convert_hf_to_gguf.py

Lines changed: 108 additions & 72 deletions
@@ -1216,6 +1216,55 @@ def _try_set_pooling_type(self) -> None:
                 raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
         self.gguf_writer.add_pooling_type(pooling_type)
 
+    def _set_vocab_interns1(self):
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab_size = self.hparams.get("vocab_size", len(vocab))
+        assert max(vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab._set_special_token("bos", 151643)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
 
 class MmprojModel(ModelBase):
     model_type = ModelType.MMPROJ
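
The key detail in the new `_set_vocab_interns1` helper is the added-token normalization: llama.cpp expects CONTROL and USER_DEFINED tokens to be pre-normalized, so any non-normalized added token is round-tripped through the tokenizer. A minimal sketch of that round-trip (the checkpoint path is hypothetical, not part of this commit):

```python
# Sketch only: the encode/decode round-trip used above to normalize added tokens.
from transformers import AutoTokenizer

model_dir = "path/to/checkpoint"  # hypothetical local HF checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

for tok_id, added in tokenizer.added_tokens_decoder.items():
    if not added.normalized:
        # decode(encode(...)) yields the normalized surface form that
        # llama.cpp expects for CONTROL / USER_DEFINED tokens
        normalized = tokenizer.decode(tokenizer.encode(added.content, add_special_tokens=False))
        if normalized != added.content:
            print(f"{added.content!r} -> {normalized!r}")
```
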
@@ -2932,7 +2981,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if "language_model." in name:
             name = name.replace("language_model.", "") # for InternVL
         if name.startswith("mlp") or name.startswith("multi_modal_projector") \
-            or name.startswith("vision_model") or name.startswith("audio_tower"):
+            or name.startswith("vision_model") or name.startswith("audio_tower") \
+            or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
             # skip vision and audio tensors
             return []
         yield from super().modify_tensors(data_torch, name, bid)
@@ -3109,7 +3159,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         yield from super().modify_tensors(data_torch, name, bid)
 
 
-@ModelBase.register("Ernie4_5_ForCausalLM")
+@ModelBase.register("Ernie4_5_ForCausalLM", "Ernie4_5ForCausalLM")
 class Ernie4_5Model(TextModel):
     model_arch = gguf.MODEL_ARCH.ERNIE4_5
 
@@ -3604,6 +3654,19 @@ def prepare_tensors(self):
 class Qwen3Model(Qwen2Model):
     model_arch = gguf.MODEL_ARCH.QWEN3
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
+        self.origin_hf_arch = hparams.get('architectures', [None])[0]
+
+    def set_vocab(self):
+        # deal with intern-s1-mini
+        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
+            self._set_vocab_interns1()
+            return
+
+        super().set_vocab()
+
 
 @ModelBase.register("Qwen3MoeForCausalLM")
 class Qwen3MoeModel(Qwen2MoeModel):
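
The new `Qwen3Model.__init__` records the original HF architecture so `set_vocab` can route Intern-S1-mini checkpoints to `_set_vocab_interns1`. A minimal sketch of that check against a checkpoint's `config.json` (the directory name is hypothetical; the converter itself goes through `ModelBase.load_hparams`):

```python
# Sketch only: read the original HF architecture name from config.json.
# The model directory below is hypothetical.
import json
from pathlib import Path

model_dir = Path("path/to/intern-s1-mini")  # hypothetical
config = json.loads((model_dir / "config.json").read_text(encoding="utf-8"))
origin_hf_arch = config.get("architectures", [None])[0]

if origin_hf_arch == "InternS1ForConditionalGeneration":
    print("Intern-S1 checkpoint: take the _set_vocab_interns1() path")
else:
    print("regular Qwen3 checkpoint: use the default vocab handling")
```
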
@@ -3620,73 +3683,7 @@ def set_vocab(self):
             self._set_vocab_interns1()
             return
 
-        try:
-            self._set_vocab_sentencepiece()
-        except FileNotFoundError:
-            self._set_vocab_gpt2()
-
-    def _set_vocab_interns1(self):
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
-        vocab_size = self.hparams.get("vocab_size", len(vocab))
-        assert max(vocab.values()) < vocab_size
-
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
-
-        added_tokens_decoder = tokenizer.added_tokens_decoder
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            else:
-                token: str = reverse_vocab[i]
-                if token in added_vocab:
-                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
-                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not added_tokens_decoder[i].normalized:
-                        previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
-                        if previous_token != token:
-                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
-
-                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
-                        toktypes.append(gguf.TokenType.CONTROL)
-                    else:
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
-                else:
-                    toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-        special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
-        additional_special_tokens = []
-        if special_tokens_map_file.is_file():
-            with open(special_tokens_map_file, encoding = 'utf-8') as f:
-                additional_special_tokens = json.load(f).get('additional_special_tokens', [])
-        tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json'
-        if tokenizer_cfg_file.is_file():
-            with open(tokenizer_cfg_file, encoding = 'utf-8') as f:
-                added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
-                token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']}
-                for token in additional_special_tokens:
-                    if token in token2ids_map:
-                        special_vocab._set_special_token(token, token2ids_map[token])
-        special_vocab._set_special_token('eos', 151645)
-        special_vocab._set_special_token("bos", 151643)
-        special_vocab.add_to_gguf(self.gguf_writer)
+        super().set_vocab()
 
 
 @ModelBase.register("GPT2LMHeadModel")
@@ -6257,9 +6254,11 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@ModelBase.register("DeepseekV2ForCausalLM")
-@ModelBase.register("DeepseekV3ForCausalLM")
-@ModelBase.register("KimiVLForConditionalGeneration")
+@ModelBase.register(
+    "DeepseekV2ForCausalLM",
+    "DeepseekV3ForCausalLM",
+    "KimiVLForConditionalGeneration",
+)
 class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 
@@ -8510,6 +8509,43 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
             return "mm.2.weight"
         return super().map_tensor_name(name, try_suffixes)
 
+
+@ModelBase.register("KimiVLForConditionalGeneration")
+class KimiVLModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        self.hparams_vision["image_size"] = 64 * 14 # for compatibility
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIVL)
+        self.gguf_writer.add_vision_use_gelu(True)
+        self.gguf_writer.add_vision_projector_scale_factor(2)
+        # eps is the same as pytorch's default value
+        assert self.hparams_vision is not None
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-5))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+        is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name
+
+        if is_vision_tensor:
+            if "pos_emb.weight" in name:
+                data_torch = data_torch.view(data_torch.shape[0] * data_torch.shape[1], data_torch.shape[2])
+            elif "wqkv" in name:
+                split_dim = 0 if "weight" in name else -1
+                wq, wk, wv = data_torch.chunk(3, dim=split_dim)
+                return [
+                    (self.map_tensor_name(name.replace("wqkv", "wq")), wq),
+                    (self.map_tensor_name(name.replace("wqkv", "wk")), wk),
+                    (self.map_tensor_name(name.replace("wqkv", "wv")), wv)
+                ]
+
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return [] # skip other tensors
+
 ###### CONVERSION LOGIC ######
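
The `KimiVLModel.modify_tensors` hook above splits the fused `wqkv` projection into separate q/k/v tensors before mapping their names. A minimal, self-contained sketch of that split, with a made-up tensor shape purely for illustration:

```python
# Sketch only: the fused-QKV split performed in modify_tensors above.
import torch

hidden = 1152                                   # hypothetical vision hidden size
wqkv_weight = torch.randn(3 * hidden, hidden)   # fused [q; k; v] projection weight
wqkv_bias = torch.randn(3 * hidden)             # fused bias

# weights are split along dim 0, biases along the last dim
wq, wk, wv = wqkv_weight.chunk(3, dim=0)
bq, bk, bv = wqkv_bias.chunk(3, dim=-1)

print(wq.shape, wk.shape, wv.shape)  # each torch.Size([1152, 1152])
print(bq.shape)                      # torch.Size([1152])
```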

docs/multimodal/minicpmv4.0.md

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ Download [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model
 
 
 ### Build llama.cpp
-Readme modification time: 20250206
+Readme modification time: 20250731
 
 If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
 
docs/multimodal/minicpmv4.5.md

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+## MiniCPM-V 4.5
+
+### Prepare models and code
+
+Download [MiniCPM-V-4_5](https://huggingface.co/openbmb/MiniCPM-V-4_5) PyTorch model from huggingface to "MiniCPM-V-4_5" folder.
+
+
+### Build llama.cpp
+Readme modification time: 20250826
+
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+
+Clone llama.cpp:
+```bash
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+```
+
+Build llama.cpp using `CMake`:
+```bash
+cmake -B build
+cmake --build build --config Release
+```
+
+
+### Usage of MiniCPM-V 4
+
+Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-4_5-gguf) by us)
+
+```bash
+python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-4_5
+python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-4_5 --minicpmv-projector ../MiniCPM-V-4_5/minicpmv.projector --output-dir ../MiniCPM-V-4_5/ --minicpmv_version 6
+python ./convert_hf_to_gguf.py ../MiniCPM-V-4_5/model
+
+# quantize int4 version
+./build/bin/llama-quantize ../MiniCPM-V-4_5/model/ggml-model-f16.gguf ../MiniCPM-V-4_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+
+Inference on Linux or Mac
+```bash
+# run in single-turn mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_5/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run in conversation mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4_5/mmproj-model-f16.gguf
+```

examples/model-conversion/Makefile

Lines changed: 34 additions & 5 deletions
@@ -1,4 +1,5 @@
-# Validation functions
+MAKEFLAGS += --no-print-directory
+
 define validate_model_path
 	@if [ -z "$(MODEL_PATH)" ]; then \
 		echo "Error: MODEL_PATH must be provided either as:"; \
@@ -17,6 +18,13 @@ define validate_embedding_model_path
 	fi
 endef
 
+define quantize_model
+	@CONVERTED_MODEL="$(1)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" \
+	TOKEN_EMBD_TYPE="$(TOKEN_EMBD_TYPE)" OUTPUT_TYPE="$(OUTPUT_TYPE)" \
+	./scripts/utils/quantize.sh "$(1)" "$(QUANTIZED_TYPE)" "$(TOKEN_EMBD_TYPE)" "$(OUTPUT_TYPE)"
+	@echo "Export the quantized model path to $(2) variable in your environment"
+endef
+
 ###
 ### Casual Model targets/recipes
 ###
@@ -67,9 +75,15 @@ causal-quantize-Q8_0: causal-quantize-model
 causal-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
 causal-quantize-Q4_0: causal-quantize-model
 
+# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
+# token embedding and output types to Q8_0 instead of the default Q6_K.
+causal-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
+causal-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
+causal-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
+causal-quantize-qat-Q4_0: causal-quantize-model
+
 causal-quantize-model:
-	@CONVERTED_MODEL="$(CONVERTED_MODEL)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" ./scripts/utils/quantize.sh ${CONVERTED_MODEL} ${QUANTIZED_TYPE}
-	@echo "Export the quantized model path to QUANTIZED_MODEL variable in your environment"
+	$(call quantize_model,$(CONVERTED_MODEL),QUANTIZED_MODEL)
 
 causal-run-quantized-model:
 	@QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/causal/run-converted-model.sh ${QUANTIZED_MODEL}
@@ -117,9 +131,15 @@ embedding-quantize-Q8_0: embedding-quantize-model
 embedding-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
 embedding-quantize-Q4_0: embedding-quantize-model
 
+# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
+# token embedding and output types to Q8_0 instead of the default Q6_K.
+embedding-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
+embedding-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
+embedding-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
+embedding-quantize-qat-Q4_0: embedding-quantize-model
+
 embedding-quantize-model:
-	@./scripts/utils/quantize.sh ${CONVERTED_EMBEDDING_MODEL} ${QUANTIZED_TYPE}
-	@echo "Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment"
+	$(call quantize_model,$(CONVERTED_EMBEDDING_MODEL),QUANTIZED_EMBEDDING_MODEL)
 
 embedding-run-quantized-model:
 	@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
@@ -144,6 +164,15 @@ perplexity-run:
 hf-create-model:
 	@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}"
 
+hf-create-model-dry-run:
+	@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -d
+
+hf-create-model-embedding:
+	@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -e
+
+hf-create-model-embedding-dry-run:
+	@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -e -d
+
 hf-create-model-private:
 	@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -p