
Commit fba8875

stevenkuang-tencent authored and Nexesenex committed
model : add hunyuan dense (ggml-org#14878)
* support hunyuan_v1_dense
* update hunyuan_moe to hunyuan_v1_moe
* fix rope alpha assert and bos token
* add blank line
* Revert "update hunyuan_moe to hunyuan_v1_moe" (reverts commit aa973ca)
* use hunyuan_dense instead of hunyuan_v1_dense
* fix hunyuan_moe chat template
* remove leftover code
* update hunyuan dense chat template
* fix hunyuan dense vocab and chat template

Signed-off-by: stevenkuang <[email protected]>
1 parent 1358b94 commit fba8875

File tree

10 files changed: +351 -9 lines changed

convert_hf_to_gguf.py

Lines changed: 95 additions & 8 deletions
@@ -1014,6 +1014,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
             # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
             res = "hunyuan"
+        if chkhsh == "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6":
+            # ref: https://huggingface.co/tencent/Hunyuan-4B-Instruct
+            res = "hunyuan-dense"
         if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
             # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
             res = "falcon-h1"

@@ -7883,11 +7886,6 @@ def set_gguf_parameters(self):
 class HunYuanMoEModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        # For handling tied embeddings
-        self._tok_embd = None
-
     def set_vocab(self):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)

@@ -7981,9 +7979,6 @@ def set_gguf_parameters(self):
     _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if name == "model.embed_tokens.weight":
-            self._tok_embd = data_torch.clone()
-
         if name == "lm_head.weight":
             if self.hparams.get("tie_word_embeddings", False):
                 logger.info("Skipping tied output layer 'lm_head.weight'")

@@ -8028,6 +8023,98 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("HunYuanDenseV1ForCausalLM")
+class HunYuanModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
+
+    def set_vocab(self):
+        if (self.dir_model / "tokenizer.json").is_file():
+            self._set_vocab_gpt2()
+        else:
+            from transformers import AutoTokenizer
+            tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+
+            # 1. Get the pre-tokenizer identifier hash
+            tokpre = self.get_vocab_base_pre(tokenizer)
+
+            # 2. Reverse-engineer the merges list from mergeable_ranks
+            merges = []
+            vocab = {}
+            mergeable_ranks = tokenizer.mergeable_ranks
+            for token, rank in mergeable_ranks.items():
+                vocab[QwenModel.token_bytes_to_string(token)] = rank
+                if len(token) == 1:
+                    continue
+                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+                if len(merged) == 2:
+                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+            # 3. Generate the tokens and toktypes lists
+            vocab_size = self.hparams["vocab_size"]
+            assert tokenizer.vocab_size == vocab_size
+            special_tokens = tokenizer.special_tokens
+            reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+            tokens: list[str] = []
+            toktypes: list[int] = []
+            for i in range(vocab_size):
+                if i not in reverse_vocab:
+                    tokens.append(f"[PAD{i}]")
+                    toktypes.append(gguf.TokenType.UNUSED)
+                else:
+                    token = reverse_vocab[i]
+                    tokens.append(token)
+                    if i in special_tokens.values():
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.NORMAL)
+
+            # 4. Write all vocab-related fields to the GGUF writer
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+            self.gguf_writer.add_token_merges(merges)
+
+            # 5. Add special tokens and chat templates
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        # FIX for BOS token: Overwrite incorrect id read from config.json
+        if self.hparams['hidden_size'] == 4096:
+            self.gguf_writer.add_bos_token_id(127958)  # only for 7b dense, fix <|bos|> token
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        # Rope
+        rope_scaling = hparams.get("rope_scaling", {})
+        if rope_scaling.get("type") == "dynamic":
+            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
+            alpha = rope_scaling.get("alpha", 50)
+            base = hparams.get("rope_theta", 10000.0)
+            dim = hparams["head_dim"]
+            scaled_base = base * (alpha ** (dim / (dim - 2)))
+            self.gguf_writer.add_rope_freq_base(scaled_base)
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+            self.gguf_writer.add_rope_scaling_factor(1)
+            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024)  # 256k context length
+            self.gguf_writer.add_context_length(256 * 1024)  # 256k context length
+
+            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
+            assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024], \
+                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name == "lm_head.weight":
+            if self.hparams.get("tie_word_embeddings", False):
+                logger.info("Skipping tied output layer 'lm_head.weight'")
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @ModelBase.register("SmolLM3ForCausalLM")
 class SmolLM3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.SMOLLM3
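
Note on the RoPE handling above: the converter folds the NTK-aware alpha scaling into the RoPE frequency base once at conversion time, so the runtime applies plain RoPE (scaling type NONE) against the enlarged base. A minimal standalone sketch of the same arithmetic; alpha=1000 (the value the code comment associates with 256k context) and head_dim=128 are illustrative stand-ins, since the converter reads both from the checkpoint's rope_scaling block and hparams:

# Sketch of the NTK-aware alpha scaling done in set_gguf_parameters above.
# alpha=1000 and head_dim=128 are illustrative, not read from any config.
def ntk_scaled_rope_base(base: float, alpha: float, head_dim: int) -> float:
    # base' = base * alpha^(d / (d - 2)); RoPE is then applied unscaled
    # against the larger base, which is why the scaling type is NONE.
    return base * alpha ** (head_dim / (head_dim - 2))

print(ntk_scaled_rope_base(10000.0, 1000.0, 128))  # ~1.12e7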

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -140,6 +140,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
     {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
     {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
+    {"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},
     # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},

gguf-py/gguf/constants.py

Lines changed: 18 additions & 0 deletions
@@ -378,6 +378,7 @@ class MODEL_ARCH(IntEnum):
     ERNIE4_5      = auto()
     ERNIE4_5_MOE  = auto()
     HUNYUAN_MOE   = auto()
+    HUNYUAN_DENSE = auto()
     SMOLLM3       = auto()
     LFM2          = auto()
     DREAM         = auto()

@@ -700,6 +701,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.ERNIE4_5_MOE:  "ernie4_5-moe",
     MODEL_ARCH.FALCON_H1:     "falcon-h1",
     MODEL_ARCH.HUNYUAN_MOE:   "hunyuan-moe",
+    MODEL_ARCH.HUNYUAN_DENSE: "hunyuan-dense",
     MODEL_ARCH.SMOLLM3:       "smollm3",
     MODEL_ARCH.LFM2:          "lfm2",
     MODEL_ARCH.DREAM:         "dream",

@@ -2496,6 +2498,22 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
+    MODEL_ARCH.HUNYUAN_DENSE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.SMOLLM3: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
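
Once these three hunks land, the new architecture is addressable from gguf-py. A quick sanity-check sketch, assuming the hunks above extend the usual MODEL_ARCH_NAMES and MODEL_TENSORS tables in gguf-py/gguf/constants.py:

# Sanity check that the new arch is wired up end to end in gguf-py.
import gguf

assert gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.HUNYUAN_DENSE] == "hunyuan-dense"
print(len(gguf.MODEL_TENSORS[gguf.MODEL_ARCH.HUNYUAN_DENSE]))  # 14 tensor kinds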

src/llama-arch.cpp

Lines changed: 21 additions & 0 deletions
@@ -85,6 +85,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ERNIE4_5,      "ernie4_5"      },
     { LLM_ARCH_ERNIE4_5_MOE,  "ernie4_5-moe"  },
     { LLM_ARCH_HUNYUAN_MOE,   "hunyuan-moe"   },
+    { LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
     { LLM_ARCH_SMOLLM3,       "smollm3"       },
     { LLM_ARCH_LFM2,          "lfm2"          },
     { LLM_ARCH_DREAM,         "dream"         },

@@ -1897,6 +1898,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP_EXPS,   "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_HUNYUAN_DENSE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+
+        },
+    },
     {
         LLM_ARCH_SMOLLM3,
         {
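
For a given layer index, the map above expands to concrete GGUF tensor names. A small Python sketch of that expansion; the ".weight" suffix follows the usual GGUF convention, and bias tensors are omitted for brevity:

# Expand the per-block "blk.%d" patterns from the LLM_ARCH_HUNYUAN_DENSE map.
per_block = ["attn_norm", "attn_q", "attn_q_norm", "attn_k", "attn_k_norm",
             "attn_v", "attn_output", "ffn_norm", "ffn_gate", "ffn_down", "ffn_up"]

def block_tensor_names(layer: int) -> list[str]:
    return [f"blk.{layer}.{name}.weight" for name in per_block]

print(["token_embd.weight", "output_norm.weight", "output.weight"])
print(block_tensor_names(0))  # blk.0.attn_norm.weight, blk.0.attn_q.weight, ...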

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
@@ -89,6 +89,7 @@ enum llm_arch {
     LLM_ARCH_ERNIE4_5,
     LLM_ARCH_ERNIE4_5_MOE,
     LLM_ARCH_HUNYUAN_MOE,
+    LLM_ARCH_HUNYUAN_DENSE,
     LLM_ARCH_SMOLLM3,
     LLM_ARCH_LFM2,
     LLM_ARCH_DREAM,

src/llama-chat.cpp

Lines changed: 20 additions & 1 deletion
@@ -66,6 +66,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "llama4",        LLM_CHAT_TEMPLATE_LLAMA4        },
     { "smolvlm",       LLM_CHAT_TEMPLATE_SMOLVLM       },
     { "hunyuan-moe",   LLM_CHAT_TEMPLATE_HUNYUAN_MOE   },
+    { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
     { "kimi-k2",       LLM_CHAT_TEMPLATE_KIMI_K2       },
 };
 
@@ -193,6 +194,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_DOTS1;
     } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
+    } else if (tmpl_contains("<|hy_place▁holder▁no▁2|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
+        return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
         return LLM_CHAT_TEMPLATE_KIMI_K2;
     }
@@ -698,11 +701,27 @@ int32_t llm_chat_apply_template(
             if (role == "system") {
                 ss << "<|startoftext|>" << message->content << "<|extra_4|>";
             } else if (role == "assistant") {
-                ss << "<|startoftext|>" << message->content << "<|eos|>";
+                ss << message->content << "<|eos|>";
             } else {
                 ss << "<|startoftext|>" << message->content << "<|extra_0|>";
             }
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_DENSE) {
+        // tencent/Hunyuan-4B-Instruct
+        for (size_t i = 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (i == 0) {
+                if (role == "system") {
+                    ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
+                }
+            }
+
+            if (role == "assistant") {
+                ss << "<|hy_Assistant|>" << chat[i]->content << "<|hy_place▁holder▁no▁2|>";
+            } else if (role == "user") {
+                ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
+            }
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
         // moonshotai/Kimi-K2-Instruct
         for (auto message : chat) {
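
To make the new template concrete, here is a Python mirror of the LLM_CHAT_TEMPLATE_HUNYUAN_DENSE branch above, rendering a hypothetical system + user exchange; the output string follows directly from the C++ logic (the trailing <|hy_Assistant|> acts as the generation prompt):

# Python mirror of the HUNYUAN_DENSE loop in llm_chat_apply_template above.
def hunyuan_dense_prompt(chat: list[dict[str, str]]) -> str:
    out = ""
    for i, msg in enumerate(chat):
        role, content = msg["role"], msg["content"]
        if i == 0 and role == "system":
            out += content + "<|hy_place▁holder▁no▁3|>"
        if role == "assistant":
            out += "<|hy_Assistant|>" + content + "<|hy_place▁holder▁no▁2|>"
        elif role == "user":
            out += "<|hy_User|>" + content + "<|hy_Assistant|>"
    return out

print(hunyuan_dense_prompt([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user",   "content": "Hi"},
]))
# -> You are a helpful assistant.<|hy_place▁holder▁no▁3|><|hy_User|>Hi<|hy_Assistant|>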

src/llama-chat.h

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_DOTS1,
     LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
+    LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
     LLM_CHAT_TEMPLATE_KIMI_K2,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
