Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions crates/aprender-core/src/format/converter/export_include_01.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@

/// Append tokenizer metadata to GGUF metadata, preferring tokenizer.json over APR fallback.
///
/// P0-G: `vocab_size` is the model's `<arch>.vocab_size` (e.g. 151936 for Qwen2.5 with
/// TP-alignment padding). The tokenizer's true vocabulary may be smaller (151643 for
/// Qwen2.5-Coder). llama.cpp uses `len(tokenizer.ggml.tokens)` as the expected first dim
/// of `token_embd.weight`, so the tokens array MUST be padded to `vocab_size` with
/// placeholder entries to match the actual tensor shape.
fn append_tokenizer_to_metadata(
metadata: &mut Vec<(String, crate::format::gguf::GgufValue)>,
tokenizer: Option<&crate::format::gguf::GgufTokenizer>,
apr_metadata: Option<&crate::format::v2::AprV2Metadata>,
arch: &str,
model_name: &str,
vocab_size: usize,
input: &Path,
) {
if let Some(tok) = tokenizer {
metadata.extend(build_tokenizer_gguf_metadata(tok, arch, model_name));
metadata.extend(build_tokenizer_gguf_metadata(tok, arch, model_name, vocab_size));
return;
}

Expand All @@ -22,7 +29,7 @@ fn append_tokenizer_to_metadata(
let Some(apr_meta) = apr_metadata else {
return;
};
let apr_tok_entries = extract_apr_tokenizer_for_gguf(apr_meta);
let apr_tok_entries = extract_apr_tokenizer_for_gguf(apr_meta, vocab_size);
if !apr_tok_entries.is_empty() {
eprintln!(
"[GH-211] Extracted {} tokenizer entries from APR metadata",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ fn test_build_tokenizer_gguf_metadata_pre_type_gpt2() {
..Default::default()
};

let metadata = build_tokenizer_gguf_metadata(&tok, "gpt2", "gpt2-model");
let metadata = build_tokenizer_gguf_metadata(&tok, "gpt2", "gpt2-model", 0);

let pre_val = metadata
.iter()
Expand All @@ -209,7 +209,7 @@ fn test_build_tokenizer_gguf_metadata_pre_type_llama_default() {
..Default::default()
};

let metadata = build_tokenizer_gguf_metadata(&tok, "llama", "model");
let metadata = build_tokenizer_gguf_metadata(&tok, "llama", "model", 0);

let pre_val = metadata
.iter()
Expand All @@ -231,7 +231,7 @@ fn test_build_tokenizer_gguf_metadata_preserves_roundtrip_pre_type() {
..Default::default()
};

let metadata = build_tokenizer_gguf_metadata(&tok, "llama", "model");
let metadata = build_tokenizer_gguf_metadata(&tok, "llama", "model", 0);

let pre_val = metadata
.iter()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ fn test_extract_apr_tokenizer_maps_bpe_to_gpt2() {
serde_json::json!(["a", "b"]),
);

let entries = extract_apr_tokenizer_for_gguf(&apr);
let entries = extract_apr_tokenizer_for_gguf(&apr, 0);
let model_entry = entries.iter().find(|(k, _)| k == "tokenizer.ggml.model");
assert!(model_entry.is_some());
// "bpe" should be mapped to "gpt2"
Expand All @@ -217,7 +217,7 @@ fn test_extract_apr_tokenizer_includes_chat_template() {
let mut apr = AprV2Metadata::new("test");
apr.chat_template = Some("{% for msg in messages %}...{% endfor %}".to_string());

let entries = extract_apr_tokenizer_for_gguf(&apr);
let entries = extract_apr_tokenizer_for_gguf(&apr, 0);
let tmpl = entries.iter().find(|(k, _)| k == "tokenizer.chat_template");
assert!(tmpl.is_some(), "should include chat template from metadata");
}
Expand All @@ -233,7 +233,7 @@ fn test_extract_apr_tokenizer_chat_template_from_custom() {
serde_json::json!("template_str"),
);

let entries = extract_apr_tokenizer_for_gguf(&apr);
let entries = extract_apr_tokenizer_for_gguf(&apr, 0);
let tmpl = entries.iter().find(|(k, _)| k == "tokenizer.chat_template");
assert!(tmpl.is_some(), "should find chat template in custom fields");
}
Expand All @@ -248,7 +248,7 @@ fn test_extract_apr_tokenizer_add_bos_token() {
serde_json::json!(true),
);

let entries = extract_apr_tokenizer_for_gguf(&apr);
let entries = extract_apr_tokenizer_for_gguf(&apr, 0);
let bos = entries
.iter()
.find(|(k, _)| k == "tokenizer.ggml.add_bos_token");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ fn test_bug_211_extract_apr_tokenizer_for_gguf_with_vocab() {
meta.custom
.insert("tokenizer.eos_token_id".to_string(), serde_json::json!(2));

let entries = extract_apr_tokenizer_for_gguf(&meta);
let entries = extract_apr_tokenizer_for_gguf(&meta, 0);
// Should have at least: model, pre, tokens, merges, bos, eos
assert!(
entries.len() >= 6,
Expand All @@ -82,11 +82,64 @@ fn test_bug_211_extract_apr_tokenizer_for_gguf_with_vocab() {
fn test_bug_211_extract_apr_tokenizer_for_gguf_empty() {
use crate::format::v2::AprV2Metadata;
let meta = AprV2Metadata::default();
let entries = extract_apr_tokenizer_for_gguf(&meta);
let entries = extract_apr_tokenizer_for_gguf(&meta, 0);
// Should still have model and pre even without vocab
assert!(entries.len() >= 2);
}

/// P0-G (APR fallback path): a three-entry `tokenizer.vocabulary` exported with
/// `vocab_size = 8` must produce an eight-entry `tokenizer.ggml.tokens` array,
/// keeping the real tokens first and filling later slots with `<|pad_N|>` names.
#[test]
fn test_p0g_apr_fallback_pad_tokens_to_vocab_size() {
    use crate::format::gguf::GgufValue;
    use crate::format::v2::AprV2Metadata;

    let mut apr_meta = AprV2Metadata::default();
    apr_meta.architecture = Some("qwen2".to_string());
    apr_meta.custom.insert(
        "tokenizer.vocabulary".to_string(),
        serde_json::json!(["a", "b", "c"]),
    );

    let gguf_entries = extract_apr_tokenizer_for_gguf(&apr_meta, 8);

    // Only the first entry carrying the tokens key is considered; a value that
    // is not a string array is treated the same as a missing entry.
    let tokens_entry = gguf_entries
        .iter()
        .find(|(key, _)| key == "tokenizer.ggml.tokens");
    let padded = match tokens_entry {
        Some((_, GgufValue::ArrayString(list))) => Some(list),
        _ => None,
    }
    .expect("tokens entry");

    assert_eq!(padded.len(), 8, "padded to vocab_size=8");
    // Spot-check the first real token, the first pad slot and the last pad slot.
    for &(index, expected) in [(0usize, "a"), (3, "<|pad_3|>"), (7, "<|pad_7|>")].iter() {
        assert_eq!(padded[index], expected);
    }
}

/// P0-G (APR fallback path): passing `vocab_size = 0` must leave the exported
/// `tokenizer.ggml.tokens` array at its original two entries.
#[test]
fn test_p0g_apr_fallback_no_pad_when_vocab_size_zero() {
    use crate::format::gguf::GgufValue;
    use crate::format::v2::AprV2Metadata;

    let mut apr_meta = AprV2Metadata::default();
    apr_meta.architecture = Some("qwen2".to_string());
    apr_meta.custom.insert(
        "tokenizer.vocabulary".to_string(),
        serde_json::json!(["a", "b"]),
    );

    let gguf_entries = extract_apr_tokenizer_for_gguf(&apr_meta, 0);

    // First entry with the tokens key wins; a non-string-array value counts as absent.
    let tokens_entry = gguf_entries
        .iter()
        .find(|(key, _)| key == "tokenizer.ggml.tokens");
    let unpadded = match tokens_entry {
        Some((_, GgufValue::ArrayString(list))) => Some(list),
        _ => None,
    }
    .expect("tokens entry");

    assert_eq!(unpadded.len(), 2, "no padding when vocab_size=0");
}

// ========================================================================
// Bug 213: APR metadata → GGUF config round-trip
// ========================================================================
Expand Down
166 changes: 161 additions & 5 deletions crates/aprender-core/src/format/converter/export_tests_tied_gguf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ fn test_build_tokenizer_gguf_metadata_with_full_tokenizer() {
scores: Vec::new(),
};

let metadata = build_tokenizer_gguf_metadata(&tok, "qwen2", "model");
let metadata = build_tokenizer_gguf_metadata(&tok, "qwen2", "model", 0);

let keys: Vec<&str> = metadata.iter().map(|(k, _)| k.as_str()).collect();
assert!(keys.contains(&"tokenizer.ggml.model"));
Expand Down Expand Up @@ -224,7 +224,7 @@ fn test_build_tokenizer_gguf_metadata_without_optional_fields() {
scores: Vec::new(),
};

let metadata = build_tokenizer_gguf_metadata(&tok, "llama", "model");
let metadata = build_tokenizer_gguf_metadata(&tok, "llama", "model", 0);

// Should have model and pre, but no bos/eos/tokens/merges
let keys: Vec<&str> = metadata.iter().map(|(k, _)| k.as_str()).collect();
Expand All @@ -236,6 +236,162 @@ fn test_build_tokenizer_gguf_metadata_without_optional_fields() {
assert!(!keys.contains(&"tokenizer.ggml.merges"));
}

// ========================================================================
// P0-G: tokenizer.ggml.tokens padding to vocab_size
// ========================================================================

/// P0-G: a tokenizer with three vocabulary entries exported with
/// `vocab_size = 8` must emit eight `tokenizer.ggml.tokens` entries: the three
/// real tokens in order, then `<|pad_N|>` placeholders named by index.
#[test]
fn test_p0g_pad_tokens_to_vocab_size() {
    use crate::format::gguf::{GgufTokenizer, GgufValue};

    let tokenizer = GgufTokenizer {
        model_type: Some("gpt2".into()),
        architecture: None,
        model_name: None,
        pre_type: None,
        chat_template: None,
        vocabulary: vec!["a".into(), "b".into(), "c".into()],
        merges: Vec::new(),
        scores: vec![],
        token_type: Vec::new(),
        bos_token_id: Some(0),
        eos_token_id: Some(1),
        padding_token_id: None,
        add_bos_token: None,
    };

    let gguf_metadata = build_tokenizer_gguf_metadata(&tokenizer, "qwen2", "model", 8);

    let (_, token_value) = gguf_metadata
        .iter()
        .find(|(key, _)| key == "tokenizer.ggml.tokens")
        .expect("tokens metadata key");

    let GgufValue::ArrayString(padded) = token_value else {
        panic!("tokens should be ArrayString");
    };

    assert_eq!(padded.len(), 8, "should pad to vocab_size=8");
    // Real vocabulary is preserved at the front.
    assert_eq!(padded[0], "a");
    assert_eq!(padded[1], "b");
    assert_eq!(padded[2], "c");
    // Placeholder names carry their own index.
    assert_eq!(padded[3], "<|pad_3|>");
    assert_eq!(padded[7], "<|pad_7|>");
}

/// P0-G: `vocab_size = 0` must leave the two-entry vocabulary untouched in the
/// exported `tokenizer.ggml.tokens` array.
#[test]
fn test_p0g_no_pad_when_vocab_size_zero() {
    use crate::format::gguf::{GgufTokenizer, GgufValue};

    let tokenizer = GgufTokenizer {
        model_type: Some("gpt2".into()),
        architecture: None,
        model_name: None,
        pre_type: None,
        chat_template: None,
        vocabulary: vec!["a".into(), "b".into()],
        merges: Vec::new(),
        scores: vec![],
        token_type: Vec::new(),
        bos_token_id: None,
        eos_token_id: None,
        padding_token_id: None,
        add_bos_token: None,
    };

    let gguf_metadata = build_tokenizer_gguf_metadata(&tokenizer, "qwen2", "model", 0);

    let (_, token_value) = gguf_metadata
        .iter()
        .find(|(key, _)| key == "tokenizer.ggml.tokens")
        .expect("tokens metadata key");

    let GgufValue::ArrayString(unpadded) = token_value else {
        panic!("tokens should be ArrayString");
    };

    assert_eq!(unpadded.len(), 2, "no padding when vocab_size=0");
    assert_eq!(unpadded[0], "a");
    assert_eq!(unpadded[1], "b");
}

/// P0-G: when `vocab_size` equals the vocabulary length (3), the exported
/// `tokenizer.ggml.tokens` array keeps three entries and contains no
/// `<|pad_…` placeholder.
#[test]
fn test_p0g_no_pad_when_vocab_size_equals_tokens() {
    use crate::format::gguf::{GgufTokenizer, GgufValue};

    let tokenizer = GgufTokenizer {
        model_type: Some("gpt2".into()),
        architecture: None,
        model_name: None,
        pre_type: None,
        chat_template: None,
        vocabulary: vec!["a".into(), "b".into(), "c".into()],
        merges: Vec::new(),
        scores: vec![],
        token_type: Vec::new(),
        bos_token_id: None,
        eos_token_id: None,
        padding_token_id: None,
        add_bos_token: None,
    };

    let gguf_metadata = build_tokenizer_gguf_metadata(&tokenizer, "qwen2", "model", 3);

    let (_, token_value) = gguf_metadata
        .iter()
        .find(|(key, _)| key == "tokenizer.ggml.tokens")
        .expect("tokens metadata key");

    let GgufValue::ArrayString(exported) = token_value else {
        panic!("tokens should be ArrayString");
    };

    assert_eq!(exported.len(), 3);
    // No entry may look like a generated placeholder.
    assert!(!exported.iter().any(|token| token.starts_with("<|pad_")));
}

/// P0-G: a `vocab_size` (2) smaller than the vocabulary length (4) must not
/// shorten the exported `tokenizer.ggml.tokens` array.
#[test]
fn test_p0g_no_pad_when_vocab_size_smaller() {
    use crate::format::gguf::{GgufTokenizer, GgufValue};

    let tokenizer = GgufTokenizer {
        model_type: Some("gpt2".into()),
        architecture: None,
        model_name: None,
        pre_type: None,
        chat_template: None,
        vocabulary: vec!["a".into(), "b".into(), "c".into(), "d".into()],
        merges: Vec::new(),
        scores: vec![],
        token_type: Vec::new(),
        bos_token_id: None,
        eos_token_id: None,
        padding_token_id: None,
        add_bos_token: None,
    };

    let gguf_metadata = build_tokenizer_gguf_metadata(&tokenizer, "qwen2", "model", 2);

    let (_, token_value) = gguf_metadata
        .iter()
        .find(|(key, _)| key == "tokenizer.ggml.tokens")
        .expect("tokens metadata key");

    let GgufValue::ArrayString(exported) = token_value else {
        panic!("tokens should be ArrayString");
    };

    assert_eq!(exported.len(), 4, "do not truncate when vocab_size < tokens (would lose data)");
}

// ========================================================================
// GH-258: build_tied_output_weight
// ========================================================================
Expand Down Expand Up @@ -350,7 +506,7 @@ fn test_append_tokenizer_prefers_json_over_apr_fallback() {
serde_json::json!(["x", "y"]),
);

append_tokenizer_to_metadata(&mut metadata, Some(&tok), Some(&apr), "qwen2", "model", &input);
append_tokenizer_to_metadata(&mut metadata, Some(&tok), Some(&apr), "qwen2", "model", 0, &input);

// Should have tokenizer metadata from the GgufTokenizer, not APR
let keys: Vec<&str> = metadata.iter().map(|(k, _)| k.as_str()).collect();
Expand All @@ -372,7 +528,7 @@ fn test_append_tokenizer_uses_apr_fallback_when_no_json() {
apr.custom
.insert("tokenizer.model".to_string(), serde_json::json!("gpt2"));

append_tokenizer_to_metadata(&mut metadata, None, Some(&apr), "qwen2", "model", &input);
append_tokenizer_to_metadata(&mut metadata, None, Some(&apr), "qwen2", "model", 0, &input);

let keys: Vec<&str> = metadata.iter().map(|(k, _)| k.as_str()).collect();
assert!(
Expand All @@ -391,7 +547,7 @@ fn test_append_tokenizer_no_metadata_when_neither_source() {
let dir = tempfile::tempdir().expect("temp dir");
let input = dir.path().join("dummy.safetensors");

append_tokenizer_to_metadata(&mut metadata, None, None, "qwen2", "model", &input);
append_tokenizer_to_metadata(&mut metadata, None, None, "qwen2", "model", 0, &input);

// Should have no tokenizer metadata entries
let tok_keys: Vec<&str> = metadata
Expand Down
Loading
Loading