
Commit 75c292c

Merge pull request #195 from menloresearch/update-dev-from-master-2025-08-06-00-13
Sync master with upstream release b6096
2 parents 5ae5b31 + fd1234c commit 75c292c


87 files changed: +2996, -247 lines

README.md

Lines changed: 1 addition & 0 deletions

@@ -17,6 +17,7 @@ LLM inference in C/C++
 
 ## Hot topics
 
+- Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
 - Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
 - Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode

common/arg.cpp

Lines changed: 2 additions & 1 deletion

@@ -2947,11 +2947,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
         "- none: leaves thoughts unparsed in `message.content`\n"
         "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
-        "(default: deepseek)",
+        "(default: auto)",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
             else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
             else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+            else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
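
The user-visible effect of this option is where a model's thoughts end up in the server's OpenAI-compatible responses, as the help text above describes. A minimal client-side sketch, assuming a local `llama-server` already running on port 8080 with a reasoning-capable model loaded (the URL, port, and prompt are illustrative and not part of this commit):

```python
# Sketch: inspect where reasoning lands depending on the reasoning format.
# Assumes llama-server is running locally; adjust the URL for your setup.
import requests

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",   # assumed local endpoint
    json={"messages": [{"role": "user", "content": "What is 17 * 24?"}]},
    timeout=120,
)
msg = resp.json()["choices"][0]["message"]

# With an extracting format (e.g. deepseek), thoughts appear in
# `reasoning_content`; with `none`, they stay inline in `content`.
print("content:          ", msg.get("content"))
print("reasoning_content:", msg.get("reasoning_content"))
```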

common/chat.cpp

Lines changed: 48 additions & 2 deletions

@@ -126,6 +126,8 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
 typedef minja::chat_template common_chat_template;
 
 struct common_chat_templates {
+    bool add_bos;
+    bool add_eos;
     bool has_explicit_template; // Model had builtin template or template overridde was specified.
     std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
     std::unique_ptr<common_chat_template> template_tool_use;

@@ -143,6 +145,8 @@ struct templates_params {
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
     json extra_context;
+    bool add_bos;
+    bool add_eos;
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {

@@ -445,6 +449,8 @@ std::string common_chat_format_single(
 
     common_chat_templates_inputs inputs;
     inputs.use_jinja = use_jinja;
+    inputs.add_bos = tmpls->add_bos;
+    inputs.add_eos = tmpls->add_eos;
 
     std::string fmt_past_msg;
     if (!past_msg.empty()) {

@@ -469,6 +475,8 @@ std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
 std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
     common_chat_templates_inputs inputs;
     inputs.use_jinja = use_jinja;
+    inputs.add_bos = tmpls->add_bos;
+    inputs.add_eos = tmpls->add_eos;
     auto add_simple_msg = [&](auto role, auto content) {
         common_chat_msg msg;
         msg.role = role;

@@ -546,6 +554,8 @@ common_chat_templates_ptr common_chat_templates_init(
     }
     std::string token_bos = bos_token_override;
     std::string token_eos = eos_token_override;
+    bool add_bos = false;
+    bool add_eos = false;
     if (model) {
         const auto * vocab = llama_model_get_vocab(model);
         const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {

@@ -560,9 +570,13 @@ common_chat_templates_ptr common_chat_templates_init(
         };
         token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
         token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+        add_bos = llama_vocab_get_add_bos(vocab);
+        add_eos = llama_vocab_get_add_eos(vocab);
     }
     common_chat_templates_ptr tmpls(new common_chat_templates());
     tmpls->has_explicit_template = has_explicit_template;
+    tmpls->add_bos = add_bos;
+    tmpls->add_eos = add_eos;
     try {
         tmpls->template_default = std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos);
     } catch (const std::exception & e) {

@@ -592,6 +606,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+        case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         default:
             throw std::runtime_error("Unknown chat format");
     }

@@ -600,6 +615,7 @@ const char * common_chat_format_name(common_chat_format format) {
 const char * common_reasoning_format_name(common_reasoning_format format) {
     switch (format) {
         case COMMON_REASONING_FORMAT_NONE: return "none";
+        case COMMON_REASONING_FORMAT_AUTO: return "auto";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
         case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
         default:

@@ -748,10 +764,10 @@ static std::string apply(
     // instead of using `chat_template_options.use_bos_token = false`, since these tokens
     // may be needed inside the template / between messages too.
     auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
-    if (string_starts_with(result, tmpl.bos_token())) {
+    if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
         result = result.substr(tmpl.bos_token().size());
     }
-    if (string_ends_with(result, tmpl.eos_token())) {
+    if (inputs.add_eos && string_ends_with(result, tmpl.eos_token())) {
         result = result.substr(0, result.size() - tmpl.eos_token().size());
     }
     return result;

@@ -1289,6 +1305,26 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
         tool_calls_end);
 }
 
+static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+    auto prompt = apply(tmpl, inputs);
+
+    data.prompt = prompt;
+    data.format = COMMON_CHAT_FORMAT_GPT_OSS;
+
+    // TODO: support tool calls in GPT-OSS?
+
+    return data;
+}
+static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
+    // TODO @ngxson : this won't work with --special enabled, we should fix that
+    builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+}
+
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;

@@ -1731,6 +1767,8 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;
+    params.add_bos = inputs.add_bos;
+    params.add_eos = inputs.add_eos;
 
     params.extra_context = json::object();
     for (auto el : inputs.chat_template_kwargs) {

@@ -1772,6 +1810,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_hermes_2_pro(tmpl, params);
     }
 
+    // GPT-OSS
+    if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
+        return common_chat_params_init_gpt_oss(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {

@@ -1923,6 +1966,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_COMMAND_R7B:
             common_chat_parse_command_r7b(builder);
             break;
+        case COMMON_CHAT_FORMAT_GPT_OSS:
+            common_chat_parse_gpt_oss(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
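
For context, the GPT-OSS chat template wraps the model's chain of thought in an `analysis` channel and the user-facing answer in a `final` channel; `common_chat_parse_gpt_oss` relies on `try_parse_reasoning` with the two markers above to separate them. A standalone Python sketch of the same split, for illustration only (the real parser is the C++ code in this diff and also handles streamed/partial output via the builder):

```python
# Illustrative re-implementation of the split performed by
# common_chat_parse_gpt_oss: text between the analysis marker and the final
# marker is treated as reasoning, the remainder as content.
REASONING_START = "<|channel|>analysis<|message|>"
REASONING_END = "<|start|>assistant<|channel|>final<|message|>"

def split_gpt_oss(raw: str) -> tuple[str, str]:
    start = raw.find(REASONING_START)
    if start == -1:
        return "", raw  # no analysis channel: everything is content
    end = raw.find(REASONING_END, start)
    if end == -1:
        # reasoning not terminated yet (e.g. mid-stream): no final content
        return raw[start + len(REASONING_START):], ""
    reasoning = raw[start + len(REASONING_START):end]
    content = raw[end + len(REASONING_END):]
    return reasoning, content

raw = (
    "<|channel|>analysis<|message|>The user asks 2+2, which is 4."
    "<|start|>assistant<|channel|>final<|message|>4"
)
reasoning, content = split_gpt_oss(raw)
print(reasoning)  # -> The user asks 2+2, which is 4.
print(content)    # -> 4
```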

common/chat.h

Lines changed: 3 additions & 0 deletions

@@ -109,6 +109,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_GPT_OSS,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };

@@ -127,6 +128,8 @@ struct common_chat_templates_inputs {
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
     std::map<std::string, std::string> chat_template_kwargs;
+    bool add_bos = false;
+    bool add_eos = false;
 };
 
 struct common_chat_params {

common/common.h

Lines changed: 2 additions & 1 deletion

@@ -236,6 +236,7 @@ struct common_params_diffusion {
 
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
+    COMMON_REASONING_FORMAT_AUTO,
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };

@@ -394,7 +395,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
convert_hf_to_gguf.py

Lines changed: 114 additions & 0 deletions

@@ -7950,6 +7950,119 @@ def set_vocab(self):
         self.gguf_writer.add_chat_template(chat_template)
 
 
+@ModelBase.register("GptOssForCausalLM")
+class GptOssModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GPT_OSS
+
+    def transform_nibble_layout(self, tensor):
+        assert tensor.dtype == torch.uint8
+        assert tensor.shape[-1] == 16
+        # swap nibbles
+        t_lo = tensor & 0x0F
+        t_hi = tensor & 0xF0
+        t_swapped = (t_lo << 4) | (t_hi >> 4)
+        tensor = t_swapped
+        # transform aaaa...bbbb... to abababab...
+        blk_a, blk_b = tensor.chunk(2, dim=-1)
+        # get a_
+        blk_a0 = (blk_a & 0xF0).view(-1, 1)
+        blk_a1 = (blk_a << 4).view(-1, 1)
+        blk_a = torch.stack((blk_a0, blk_a1), dim=2).view(tensor.shape)
+        # get _b
+        blk_b0 = (blk_b >> 4).view(-1, 1)
+        blk_b1 = (blk_b & 0x0F).view(-1, 1)
+        blk_b = torch.stack((blk_b0, blk_b1), dim=2).view(tensor.shape)
+        # swap once more
+        out = blk_a | blk_b
+        out_h = out & 0xF0
+        out_l = out & 0x0F
+        out = (out_h >> 4) | (out_l << 4)
+        return out
+
+    def repack_mxfp4(self, new_name: str, blocks: Tensor, scales: Tensor):
+        assert blocks.dtype == torch.uint8
+        assert scales.dtype == torch.uint8
+        scales = scales.unsqueeze(-1)
+        assert len(blocks.shape) == 4
+        assert len(scales.shape) == 4
+        blocks = self.transform_nibble_layout(blocks)
+        new_data = torch.concat((scales, blocks), dim=-1)
+        new_shape = [new_data.shape[0], new_data.shape[1], new_data.shape[2] * 32]
+        logger.info(f"Repacked {new_name} with shape {new_shape} and quantization MXFP4")
+        # flatten last dim
+        new_data = new_data.view(new_data.shape[0], new_data.shape[1], new_data.shape[2] * new_data.shape[3])
+        new_data = new_data.numpy()
+        self.gguf_writer.add_tensor(new_name, new_data, raw_dtype=gguf.GGMLQuantizationType.MXFP4)
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        blocks0: Tensor = torch.zeros(1)
+        blocks1: Tensor = torch.zeros(1)
+        found_mxfp4_tensors = False
+        # we assume that tensors are loaded in the correct order
+        for name, data_torch in self.get_tensors():
+            if "mlp.experts.down_proj_blocks" in name:
+                blocks0 = data_torch
+            elif "mlp.experts.down_proj_scales" in name:
+                new_name = self.map_tensor_name(name.replace("_scales", ".weight"))
+                self.repack_mxfp4(new_name, blocks0, data_torch)
+                found_mxfp4_tensors = True
+            elif "mlp.experts.gate_up_proj_blocks" in name:
+                blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :]
+            elif "mlp.experts.gate_up_proj_scales" in name:
+                scales0, scales1 = data_torch[:, ::2, :], data_torch[:, 1::2, :]
+                new_name_gate = self.map_tensor_name(name.replace("gate_up_proj_scales", "gate_proj.weight"))
+                new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight"))
+                self.repack_mxfp4(new_name_gate, blocks0, scales0)
+                self.repack_mxfp4(new_name_up, blocks1, scales1)
+                found_mxfp4_tensors = True
+        if not found_mxfp4_tensors:
+            raise ValueError("No MXFP4 tensors found in the model. Please make sure you are using MXFP4 model.")
+        return []
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        if "sinks" in name:
+            name += ".weight"
+
+        # correct naming for down_proj
+        if "down_proj" in name:
+            if name.endswith("_bias"):
+                name = name.replace("down_proj_bias", "down_proj.bias")
+            else:
+                return []
+
+        # split the gate_up into gate and up
+        if "gate_up_proj" in name:
+            if name.endswith("_bias"):
+                name_up = name.replace("gate_up_proj_bias", "up_proj.bias")
+                name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias")
+                gate_proj_bias, up_proj_bias = data_torch[..., ::2], data_torch[..., 1::2]
+                return [
+                    (self.map_tensor_name(name_gate), gate_proj_bias),
+                    (self.map_tensor_name(name_up), up_proj_bias)
+                ]
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"])
+
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
+        assert rope_type == "yarn", f"GPT-OSS only supports yarn rope scaling, got {rope_type}"
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+        self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+        self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling.get("original_max_position_embeddings", 4096))
+
+
 @ModelBase.register("Lfm2ForCausalLM")
 @ModelBase.register("LFM2ForCausalLM")
 class LFM2Model(TextModel):

@@ -8089,6 +8202,7 @@ class LazyTorchTensor(gguf.LazyBase):
     _dtype_map: dict[torch.dtype, type] = {
         torch.float16: np.float16,
         torch.float32: np.float32,
+        torch.uint8: np.uint8,
     }
 
     # used for safetensors slices
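
The repacking above targets ggml's MXFP4 block layout: each block of 32 quantized values is stored as one uint8 scale byte followed by 16 bytes holding the 32 packed 4-bit values, i.e. 17 bytes per block, which is why `repack_mxfp4` concatenates the scale in front of the 16 data bytes and declares a logical last dimension of `n_blocks * 32`. A small sketch of that size arithmetic; the tensor shapes and names below are made up for illustration, and the nibble reordering itself is the `transform_nibble_layout` logic shown in the diff:

```python
import torch

N_EXPERTS, N_ROWS, N_BLOCKS = 2, 3, 4   # hypothetical tensor dimensions

# Per 32-element block the converter receives 16 packed uint8 bytes plus one
# uint8 scale, mirroring the shapes repack_mxfp4 expects.
blocks = torch.randint(0, 256, (N_EXPERTS, N_ROWS, N_BLOCKS, 16), dtype=torch.uint8)
scales = torch.randint(0, 256, (N_EXPERTS, N_ROWS, N_BLOCKS), dtype=torch.uint8)

# Concatenating scale + data gives 17 bytes per block ...
packed = torch.cat((scales.unsqueeze(-1), blocks), dim=-1)
assert packed.shape[-1] == 17

# ... which the converter flattens, while the GGUF tensor is declared with a
# logical last dimension of N_BLOCKS * 32 elements (32 4-bit values per block).
flat = packed.view(N_EXPERTS, N_ROWS, N_BLOCKS * 17)
logical_shape = (N_EXPERTS, N_ROWS, N_BLOCKS * 32)
print(flat.shape, "bytes on disk for logical shape", logical_shape)
```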
