diff --git a/external/transformers/scripts/encode_compare_tokenizer.py b/external/transformers/scripts/encode_compare_tokenizer.py
new file mode 100644
index 00000000..90dd7e53
--- /dev/null
+++ b/external/transformers/scripts/encode_compare_tokenizer.py
@@ -0,0 +1,302 @@
+"""
+This script compares the basic `.encode` tokenization
+between Hugging Face and Mistral Common tokenizers
+across multiple datasets.
+
+It identifies and reports mismatches in tokenization results,
+helping ensure consistency between the two tokenizers.
+The comparison is limited to a user-specified number of characters
+to avoid excessively long sequences.
+
+The mismatch rate is calculated as the percentage of samples
+where tokenization results differ at any point.
+Results include the exact token where the mismatch starts,
+as well as the three tokens immediately before and after,
+totaling 7 tokens.
+
+If no mismatch is detected, it only indicates that
+the first `max_chars` characters are consistent,
+not necessarily the entire sample.
+
+Some datasets may show a high mismatch rate due to
+frequent occurrences of the same problematic string.
+We recommend reviewing the raw results for details.
+
+Usage:
+    python3 encode_compare_tokenizer.py --hf_model <model> [--mc_model <model>]
+        [--num_samples <int>]
+        [--max_chars <int>]
+        [--hf_token <token>]
+        [--save_results]
+
+Arguments:
+- `--hf_model`: Model name or path for the Hugging Face tokenizer (required).
+- `--mc_model`: Model name or path for the Mistral-Common tokenizer.
+  If not provided, defaults to `hf_model`.
+- `--num_samples`: Maximum number of samples to test (default: 3000).
+- `--max_chars`: Maximum number of characters to tokenize per sample (default: 10,000).
+- `--hf_token`: Hugging Face token for private models (optional).
+- `--save_results`: If provided, saves raw mismatch results to a JSON file
+  (default: `tokenizer_mismatches_data.json`).
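+
+Special tokens are excluded on both sides (`add_special_tokens=False` for
+Hugging Face, `bos=False, eos=False` for Mistral-Common), so only the raw
+text tokenization is compared.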
+
+Examples:
+    python3 encode_compare_tokenizer.py \
+        --hf_model mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
+        --mc_model mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
+        --save_results
+    python3 encode_compare_tokenizer.py \
+        --hf_model mistralai/Mistral-Nemo-Instruct-2407 \
+        --mc_model mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
+        --num_samples 1000 \
+        --max_chars 5000
+"""
+
+import argparse
+import json
+from collections import Counter
+from typing import Any
+
+from datasets import load_dataset
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
+from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+
+
+def compare_tokenizers(
+    hf_tokenizer: AutoTokenizer,
+    mc_tokenizer: MistralTokenizer,
+    content: str,
+    max_chars: int = 10_000,
+) -> dict[str, Any] | None:
+    """Compare tokenization between Hugging Face and Mistral tokenizers."""
+    content = content[:max_chars]  # Limit to max_chars characters to avoid long sequences
+    hf_tokens = hf_tokenizer.encode(content, add_special_tokens=False)
+    mc_tokens = mc_tokenizer.instruct_tokenizer.tokenizer.encode(content, bos=False, eos=False)
+    if hf_tokens != mc_tokens:
+        mismatch_id = next(
+            (idx for idx, (a, b) in enumerate(zip(hf_tokens, mc_tokens)) if a != b),
+            min(len(hf_tokens), len(mc_tokens)),
+        )
+        start = max(0, mismatch_id - 3)
+        end = min(len(hf_tokens), mismatch_id + 4)
+        hf_str = [hf_tokenizer.decode([t]) for t in hf_tokens[start:end]]
+        mc_str = [mc_tokenizer.decode([t], special_token_policy=SpecialTokenPolicy.KEEP) for t in mc_tokens[start:end]]
+        return {
+            "string_content": "".join(mc_str),
+            "mistral_common": mc_str,
+            "hugging_face": hf_str,
+        }
+    return None
+
+
+def process_dataset(
+    dataset_name: str,
+    split: str,
+    hf_tokenizer: AutoTokenizer,
+    mc_tokenizer: MistralTokenizer,
+    num_samples: int,
+    max_chars: int,
+    **kwargs: Any,
+) -> list[dict[str, Any]]:
+    """Process a dataset and collect tokenizer mismatches."""
+    mismatches = []
+    dataset = load_dataset(dataset_name, split=split, streaming=True)
+    for i, example in tqdm(
+        enumerate(dataset),
+        desc=f"Processing {dataset_name} ({split})",
+        total=num_samples,
+    ):
+        if i >= num_samples:
+            break
+        content = example["text"]
+        mismatch = compare_tokenizers(hf_tokenizer, mc_tokenizer, content, max_chars)
+        if mismatch:
+            mismatch.update(kwargs)
+            mismatches.append(mismatch)
+    return mismatches
+
+
+def test_tokenizer(
+    hf_model: str,
+    mc_model: str,
+    num_samples: int = 3000,
+    max_chars: int = 10_000,
+    hf_token: str | None = None,
+) -> dict[str, list[dict[str, Any]]]:
+    """
+    Test tokenizer consistency across multiple datasets.
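+
+    Samples are split evenly across three datasets (`num_samples // 3` each):
+    English web text, PDF-extracted text, and multilingual Project Gutenberg.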
+
+    Output Example:
+        === SUMMARY ===
+        Total Mismatch Rate: 1.87% ( 56 / 3000 )
+        - HuggingFaceFW/fineweb: 0.60% ( 6 / 1000 )
+        - HuggingFaceFW/finepdfs: 1.70% ( 17 / 1000 )
+        - manu/project_gutenberg: 3.30% ( 33 / 1000 )
+        === MISMATCH REPORT: TOP 10 FREQUENT TOKENS ===
+        Total mismatched unique tokens analyzed: 199
+        Token -> Frequency (across all mismatches)
+        ----------------------------------------
+        "'" -> 73 times
+        ' O' -> 44 times
+        ' Jimmy' -> 32 times
+        ' (' -> 22 times
+        '/' -> 21 times
+        'Re' -> 18 times
+        'Produ' -> 18 times
+        ' and' -> 16 times
+        'gan' -> 16 times
+        'Reg' -> 16 times
+    """
+    print("Loading HF Tokenizer.")
+    hf_tokenizer = AutoTokenizer.from_pretrained(hf_model, token=hf_token)
+    print("Loading Mistral-Common Tokenizer.")
+    mc_tokenizer = MistralTokenizer.from_hf_hub(mc_model, token=hf_token)
+    n_per_set = num_samples // 3
+    # Web Data - Mostly English
+    print(f"Testing Tokenizers on HuggingFaceFW/fineweb with {n_per_set} samples.")
+    web_mismatches = process_dataset(
+        "HuggingFaceFW/fineweb",
+        "train",
+        hf_tokenizer,
+        mc_tokenizer,
+        n_per_set,
+        max_chars,
+    )
+    web_mismatch_rate = (len(web_mismatches) / n_per_set) * 100
+    print(f"HuggingFaceFW/fineweb: {web_mismatch_rate:.2f}% Sample Mismatch ({len(web_mismatches)}/{n_per_set})")
+    # Web PDF Data - Mostly English
+    print(f"Testing Tokenizers on HuggingFaceFW/finepdfs with {n_per_set} samples.")
+    pdf_mismatches = process_dataset(
+        "HuggingFaceFW/finepdfs",
+        "train",
+        hf_tokenizer,
+        mc_tokenizer,
+        n_per_set,
+        max_chars,
+    )
+    pdf_mismatch_rate = (len(pdf_mismatches) / n_per_set) * 100
+    print(f"HuggingFaceFW/finepdfs: {pdf_mismatch_rate:.2f}% Sample Mismatch ({len(pdf_mismatches)}/{n_per_set})")
+    # Multilingual Creative Writing Data
+    print(f"Testing Tokenizers on manu/project_gutenberg with {n_per_set} samples.")
+    multi_mismatches = []
+    dataset_info = load_dataset("manu/project_gutenberg", streaming=True)
+    splits = [s for s in list(dataset_info.keys()) if s != "en"]
+    n_per_split = n_per_set // len(splits)
+    collected = 0
+    for split in splits:
+        if collected >= n_per_set:
+            break
+        split_mismatches = process_dataset(
+            "manu/project_gutenberg",
+            split,
+            hf_tokenizer,
+            mc_tokenizer,
+            n_per_split,
+            max_chars,
+        )
+        multi_mismatches.extend(split_mismatches)
+        collected += n_per_split
+    if collected < n_per_set:
+        remaining = n_per_set - collected
+        en_mismatches = process_dataset(
+            "manu/project_gutenberg",
+            "en",
+            hf_tokenizer,
+            mc_tokenizer,
+            remaining,
+            max_chars,
+        )
+        multi_mismatches.extend(en_mismatches)
+    multi_mismatch_rate = (len(multi_mismatches) / n_per_set) * 100
+    print(f"manu/project_gutenberg: {multi_mismatch_rate:.2f}% Sample Mismatch ({len(multi_mismatches)}/{n_per_set})")
+    # Total Mismatch
+    print("\n=== SUMMARY ===")
+    rate = (len(web_mismatches) + len(pdf_mismatches) + len(multi_mismatches)) / num_samples * 100
+    print(
+        f"Total Mismatch Rate: {rate:.2f}% "
+        f"( {len(web_mismatches) + len(pdf_mismatches) + len(multi_mismatches)} / {num_samples} )"
+    )
+    print(
+        f" - HuggingFaceFW/fineweb: {len(web_mismatches) / n_per_set * 100:.2f}% "
+        f"( {len(web_mismatches)} / {n_per_set} )"
+    )
+    print(
+        f" - HuggingFaceFW/finepdfs: {len(pdf_mismatches) / n_per_set * 100:.2f}% "
+        f"( {len(pdf_mismatches)} / {n_per_set} )"
+    )
+    print(
+        f" - manu/project_gutenberg: {len(multi_mismatches) / n_per_set * 100:.2f}% "
+        f"( {len(multi_mismatches)} / {n_per_set} )"
+    )
+    return {
+        "web_model_mismatches": web_mismatches,
+        "pdf_model_mismatches": pdf_mismatches,
"multi_model_mismatches": multi_mismatches, + } + + +def generate_mismatch_report(mismatch_results: dict[str, list[dict[str, Any]]]) -> None: + """Generate a report of the most frequent tokens in mismatched samples.""" + all_tokens = [] + # Flatten all mismatches from all datasets + for dataset_key in mismatch_results: + for mismatch in mismatch_results[dataset_key]: + hf_tokens = mismatch["hugging_face"] + mc_tokens = mismatch["mistral_common"] + all_tokens.extend(hf_tokens) + all_tokens.extend(mc_tokens) + # Count and rank tokens + token_counter = Counter(all_tokens) + most_common = token_counter.most_common(10) # Top 10 + print("\n=== MISMATCH REPORT: TOP 10 FREQUENT TOKENS ===") + print(f"Total mismatched unique tokens analyzed: {len(token_counter)}") + print("Token -> Frequency (across all mismatches)") + print("-" * 40) + for token, count in most_common: + print(f"{token!r} -> {count} times") + + +def save_results( + results: dict[str, list[dict[str, Any]]], + filename: str = "tokenizer_mismatches_data.json", +) -> None: + """Save the results to a JSON file.""" + with open(filename, "w") as f: + json.dump(results, f, indent=2) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test tokenizer consistency between Hugging Face and Mistral Common.") + parser.add_argument("--hf_model", type=str, help="Model name or path for the HF tokenizer.") + parser.add_argument( + "--mc_model", + type=str, + default=None, + help="Model name or path for the Mistral-Common tokenizer.", + ) + parser.add_argument("--n", type=int, default=3000, help="Maximum number of samples to test") + parser.add_argument( + "--max_chars", + type=int, + default=10_000, + help="Maximum number of characters to tokenize per sample", + ) + parser.add_argument( + "--hf_token", + type=str, + default=None, + help="Hugging Face token for private models", + ) + parser.add_argument("--save_results", action="store_true", help="Save results to a JSON file") + args = parser.parse_args() + results = test_tokenizer( + args.hf_model, + args.mc_model if args.mc_model else args.hf_model, + args.n, + args.max_chars, + args.hf_token, + ) + generate_mismatch_report(results) + if args.save_results: + save_results(results) diff --git a/external/transformers/scripts/full_compare_tokenizer.py b/external/transformers/scripts/full_compare_tokenizer.py new file mode 100644 index 00000000..eeab2aed --- /dev/null +++ b/external/transformers/scripts/full_compare_tokenizer.py @@ -0,0 +1,444 @@ +""" +This script compares tokenization between Hugging Face and Mistral Common tokenizers +across multiple datasets and modes (e.g., text_encode, text_instruct). +It identifies and reports mismatches in tokenization results, +helping ensure consistency between the two tokenizers. +The comparison is limited to a user-specified number of tokens +to avoid excessively long sequences. +The mismatch rate is calculated as the percentage of samples +where tokenization results differ at any point. +Results include the exact token where the mismatch starts, +as well as the three tokens immediately before and after, +totaling 7 tokens. +If no mismatch is detected, it only indicates that +the first `max_tokens` tokens are consistent— +not necessarily the entire sample. +Some datasets may show a high mismatch rate due to +frequent occurrences of the same problematic string. +We recommend reviewing the raw results for details. + +Usage: + python3 compare_tokenizer.py --hf_model [--mc_model ] + [--config ] + [--type ...] 
+        [--hf_token <token>]
+        [--save_results]
+
+Arguments:
+- `--hf_model`: Model name or path for the Hugging Face tokenizer (required).
+- `--mc_model`: Model name or path for the Mistral-Common tokenizer.
+  If not provided, defaults to `hf_model`.
+- `--random_only`: If provided, only use random UTF-8 strings and no datasets for quick testing.
+- `--config`: Path to JSON config file with mode-specific settings
+  (default: "external/transformers/scripts/full_compare_tokenizer_config.json").
+- `--type`: Type(s) of tokenization to perform (space-separated).
+  Supported modes: text_encode, text_instruct, vision_encode, vision_instruct,
+  tool_call_instruct, text_reasoning, vision_reasoning, tool_call_reasoning.
+  Default: ["text_encode"].
+- `--hf_token`: Hugging Face token for private models (optional).
+- `--save_results`: If provided, saves raw mismatch results and report to JSON files
+  (default: "tokenizer_mismatches_data.json" and "tokenizer_mismatches_report.json").
+
+Examples:
+    python3 full_compare_tokenizer.py \
+        --hf_model mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
+        --type text_encode text_instruct \
+        --save_results
+
+    python3 full_compare_tokenizer.py \
+        --hf_model mistralai/Mistral-Nemo-Instruct-2407 \
+        --mc_model mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
+        --config custom_config.json \
+        --type text_encode text_instruct \
+        --save_results
+
+    python3 full_compare_tokenizer.py \
+        --hf_model mistralai/Mistral-Small-24B-Instruct-2501 \
+        --type text_encode text_instruct \
+        --random_only
+"""
+
+import argparse
+import json
+import random
+from collections import Counter
+from typing import Any
+
+from datasets import load_dataset
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+from mistral_common.protocol.instruct.request import ChatCompletionRequest
+from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
+from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+
+
+def compare_tokenizers(
+    hf_tokenizer: AutoTokenizer,
+    mc_tokenizer: MistralTokenizer,
+    content: str | dict | list,
+    max_tokens: int = 2_048,
+    mode: str = "text_encode",
+) -> dict[str, Any] | None:
+    """Compare tokenization between Hugging Face and Mistral tokenizers."""
+    if mode == "text_encode":
+        assert isinstance(content, str), "Content must be a string for text_encode mode."
+        content_str = content
+        hf_tokens = hf_tokenizer.encode(content_str, add_special_tokens=False)[:max_tokens]
+        mc_tokens = mc_tokenizer.instruct_tokenizer.tokenizer.encode(content_str, bos=False, eos=False)[:max_tokens]
+    elif mode == "text_instruct":
+        assert isinstance(content, list), "Content must be a list of messages for text_instruct mode."
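+        # Normalize the conversation so both tokenizers see the same messages:
+        # drop a trailing assistant turn and prepend an empty system message
+        # when the conversation does not start with one.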
+        if content and content[-1]["role"] == "assistant":
+            content = content[:-1]
+        if content and content[0]["role"] != "system":
+            content = [{"role": "system", "content": ""}] + content
+        hf_tokens = hf_tokenizer.apply_chat_template(content, tokenize=True)[:max_tokens]
+        request = ChatCompletionRequest(messages=content)
+        mc_tokens = mc_tokenizer.encode_chat_completion(request).tokens[:max_tokens]
+    else:
+        raise NotImplementedError(f"Mode '{mode}' is not yet implemented.")
+    if hf_tokens != mc_tokens:
+        mismatch_id = next(
+            (idx for idx, (a, b) in enumerate(zip(hf_tokens, mc_tokens)) if a != b),
+            min(len(hf_tokens), len(mc_tokens)),
+        )
+        start = max(0, mismatch_id - 3)
+        end = min(len(hf_tokens), mismatch_id + 4)
+        hf_str = [hf_tokenizer.decode([t]) for t in hf_tokens[start:end]]
+        mc_str = [mc_tokenizer.decode([t], special_token_policy=SpecialTokenPolicy.KEEP) for t in mc_tokens[start:end]]
+        return {
+            "string_content": "".join(mc_str),
+            "mistral_common": mc_str,
+            "hugging_face": hf_str,
+            "mode": mode,
+        }
+    return None
+
+
+def process_dataset(
+    dataset_name: str,
+    split: str,
+    column: str,
+    hf_tokenizer: AutoTokenizer,
+    mc_tokenizer: MistralTokenizer,
+    num_samples: int,
+    max_tokens: int,
+    mode: str,
+) -> list[dict[str, Any]]:
+    """Process a dataset and collect tokenizer mismatches."""
+    mismatches = []
+    if dataset_name not in ["random_utf8", "random_instruct_utf8"]:
+        dataset = load_dataset(dataset_name, split=split, streaming=True)
+        try:
+            for i, example in tqdm(
+                enumerate(dataset),
+                desc=f"Processing {dataset_name} ({split}, mode: {mode})",
+                total=num_samples,
+            ):
+                if i >= num_samples:
+                    break
+                content = example[column]
+                mismatch = compare_tokenizers(hf_tokenizer, mc_tokenizer, content, max_tokens, mode)
+                if mismatch:
+                    mismatch.update(
+                        {
+                            "dataset_name": dataset_name,
+                            "dataset_column": column,
+                            "sample_index": i,
+                        }
+                    )
+                    mismatches.append(mismatch)
+        finally:
+            if hasattr(dataset, "close"):
+                print(f"Closing dataset {dataset_name}.")
+                dataset.close()
+    else:
+        if dataset_name == "random_utf8":
+            for i in tqdm(range(num_samples), desc=f"Processing random_utf8 (mode: {mode})"):
+                content = "".join([chr(random.randint(0, 127)) for _ in range(max_tokens)])
+                mismatch = compare_tokenizers(hf_tokenizer, mc_tokenizer, content, max_tokens, mode)
+                if mismatch:
+                    mismatch.update(
+                        {
+                            "dataset_name": dataset_name,
+                            "dataset_column": column,
+                            "sample_index": i,
+                        }
+                    )
+                    mismatches.append(mismatch)
+        elif dataset_name == "random_instruct_utf8":
+            for i in tqdm(
+                range(num_samples),
+                desc=f"Processing random_instruct_utf8 (mode: {mode})",
+            ):
+                content = (
+                    [
+                        {
+                            "role": "system",
+                            "content": "".join([chr(random.randint(0, 127)) for _ in range(512)]),
+                        }
+                    ]
+                    if random.random() < 0.5
+                    else []
+                )
+                # Use `_` for the turn loop so the sample index `i` recorded
+                # below is not overwritten.
+                for _ in range(random.randint(1, 10)):
+                    if len(content) == 0:
+                        content.append(
+                            {
+                                "role": "user",
+                                "content": "".join([chr(random.randint(0, 127)) for _ in range(512)]),
+                            }
+                        )
+                    else:
+                        content.append(
+                            {
+                                "role": ("assistant" if content[-1]["role"] == "user" else "user"),
+                                "content": "".join([chr(random.randint(0, 127)) for _ in range(512)]),
+                            }
+                        )
+                mismatch = compare_tokenizers(hf_tokenizer, mc_tokenizer, content, max_tokens, mode)
+                if mismatch:
+                    mismatch.update(
+                        {
+                            "dataset_name": dataset_name,
+                            "dataset_column": column,
+                            "sample_index": i,
+                        }
+                    )
+                    mismatches.append(mismatch)
+    return mismatches
+
+
+def test_tokenizer(
+    hf_model: str,
+    mc_model: str,
+    random_only: bool,
+    modes: list[str],
+    hf_token: str | None = None,
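+    # config maps a mode name to {"datasets": [...]}; see
+    # full_compare_tokenizer_config.json for the expected shape.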
+    config: dict[str, Any] | None = None,
+) -> tuple[dict[str, list[dict[str, Any]]], dict[str, Any]]:
+    """Test tokenizer consistency for each mode and dataset in the config.
+
+    Returns: (mismatch_results, stats)
+    """
+    config = config or {}
+    print("Loading HF Tokenizer.")
+    hf_tokenizer = AutoTokenizer.from_pretrained(hf_model, token=hf_token)
+    print("Loading Mistral-Common Tokenizer.")
+    mc_tokenizer = MistralTokenizer.from_hf_hub(mc_model, token=hf_token)
+    all_mismatches = {}
+    stats = {}
+
+    for mode in modes:
+        mode_config = config.get(mode, {})
+        if not mode_config or "datasets" not in mode_config:
+            print(f"Warning: No config found for mode '{mode}'. Skipping.")
+            continue
+
+        all_mismatches[mode] = []
+        mode_total_samples = 0
+        mode_total_mismatches = 0
+        mode_dataset_stats = {}
+
+        for dataset_config in mode_config["datasets"]:
+            dataset_name = dataset_config["name"]
+            if random_only and dataset_name not in [
+                "random_utf8",
+                "random_instruct_utf8",
+            ]:
+                continue
+            split = dataset_config.get("split", "train")
+            column = dataset_config.get("column", "text")
+            num_samples = dataset_config.get("num_samples", 1000)
+            max_tokens = dataset_config.get("max_tokens", 2048)
+
+            print(
+                f"\nTesting Tokenizers on {dataset_name} (column: {column}, mode: {mode}) with {num_samples} samples."
+            )
+            mismatches = process_dataset(
+                dataset_name,
+                split,
+                column,
+                hf_tokenizer,
+                mc_tokenizer,
+                num_samples,
+                max_tokens,
+                mode,
+            )
+
+            dataset_mismatches = len(mismatches)
+            mismatch_rate = (dataset_mismatches / num_samples) * 100
+            print(f"{dataset_name} ({mode}): {mismatch_rate:.2f}% Sample Mismatch ({dataset_mismatches}/{num_samples})")
+
+            all_mismatches[mode].extend(mismatches)
+            mode_total_samples += num_samples
+            mode_total_mismatches += dataset_mismatches
+            mode_dataset_stats[dataset_name] = {
+                "total_samples": num_samples,
+                "mismatches": dataset_mismatches,
+                "mismatch_rate": mismatch_rate,
+            }
+
+        # Save mode-level stats
+        mode_mismatch_rate = (mode_total_mismatches / mode_total_samples) * 100 if mode_total_samples > 0 else 0
+        stats[mode] = {
+            "total_samples": mode_total_samples,
+            "total_mismatches": mode_total_mismatches,
+            "mismatch_rate": mode_mismatch_rate,
+            "datasets": mode_dataset_stats,
+        }
+
+    return all_mismatches, stats
+
+
+def generate_mismatch_report(
+    mismatch_results: dict[str, list[dict[str, Any]]], stats: dict[str, Any]
+) -> dict[str, Any]:
+    """Generate a detailed report using precomputed stats and return it as a dict."""
+    report = {"summary_by_mode": {}, "top_tokens_by_mode": {}, "top_tokens_overall": {}}
+
+    # Summary by mode
+    for mode, mode_stat in stats.items():
+        report["summary_by_mode"][mode] = {
+            "total_samples": mode_stat["total_samples"],
+            "total_mismatches": mode_stat["total_mismatches"],
+            "mismatch_rate": mode_stat["mismatch_rate"],
+            "datasets": mode_stat["datasets"],
+        }
+
+    # Top tokens by mode
+    for mode in mismatch_results:
+        mode_tokens = []
+        for mismatch in mismatch_results[mode]:
+            mode_tokens.extend(mismatch["hugging_face"])
+            mode_tokens.extend(mismatch["mistral_common"])
+        token_counter = Counter(mode_tokens)
+        report["top_tokens_by_mode"][mode] = {
+            "total_unique_tokens": len(token_counter),
+            "top_5_tokens": token_counter.most_common(5),
+        }
+
+    # Top tokens overall (kept in a separate counter so the per-mode loop
+    # below does not shadow it)
+    all_tokens = []
+    for mode in mismatch_results:
+        for mismatch in mismatch_results[mode]:
+            all_tokens.extend(mismatch["hugging_face"])
+            all_tokens.extend(mismatch["mistral_common"])
+    overall_counter = Counter(all_tokens)
+    report["top_tokens_overall"] = {
+        "total_unique_tokens": len(overall_counter),
+        "top_10_tokens": overall_counter.most_common(10),
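+        # Counts include decoded strings from both tokenizers, so each mismatch
+        # window contributes up to 14 entries (7 from HF and 7 from Mistral-Common).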
+    }
+
+    # Print the report
+    print("\n=== MISMATCH SUMMARY BY MODE ===")
+    for mode, mode_stat in stats.items():
+        print(f"\n--- Mode: {mode} ---")
+        print(f"Total Samples: {mode_stat['total_samples']}")
+        print(f"Total Mismatches: {mode_stat['total_mismatches']}")
+        print(f"Mismatch Rate: {mode_stat['mismatch_rate']:.2f}%")
+        print("Per-Dataset Breakdown:")
+        for dataset, ds_stat in mode_stat["datasets"].items():
+            print(
+                f" - {dataset}: {ds_stat['mismatch_rate']:.2f}% ({ds_stat['mismatches']}/{ds_stat['total_samples']})"
+            )
+
+    print("\n=== TOP 5 MISMATCHED TOKENS BY MODE ===")
+    for mode in mismatch_results:
+        mode_tokens = []
+        for mismatch in mismatch_results[mode]:
+            mode_tokens.extend(mismatch["hugging_face"])
+            mode_tokens.extend(mismatch["mistral_common"])
+        token_counter = Counter(mode_tokens)
+        most_common = token_counter.most_common(5)
+        print(f"\n--- Mode: {mode} ---")
+        print(f"Total unique tokens: {len(token_counter)}")
+        print("Token -> Frequency")
+        print("-" * 30)
+        for token, count in most_common:
+            print(f"{token!r} -> {count} times")
+
+    print("\n=== OVERALL TOP 10 MISMATCHED TOKENS (ALL MODES) ===")
+    print(f"Total unique tokens analyzed: {len(overall_counter)}")
+    print("Token -> Frequency (across all mismatches)")
+    print("-" * 40)
+    for token, count in overall_counter.most_common(10):
+        print(f"{token!r} -> {count} times")
+
+    return report
+
+
+def save_results(
+    results: dict[str, list[dict[str, Any]]],
+    report: dict[str, Any],
+    mismatches_filename: str = "tokenizer_mismatches_data.json",
+    report_filename: str = "tokenizer_mismatches_report.json",
+) -> None:
+    """Save the results and report to JSON files."""
+    with open(mismatches_filename, "w") as f:
+        json.dump(results, f, indent=2)
+    with open(report_filename, "w") as f:
+        json.dump(report, f, indent=2)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Test tokenizer consistency between Hugging Face and Mistral Common.")
+    parser.add_argument(
+        "--hf_model",
+        type=str,
+        required=True,
+        help="Model name or path for the HF tokenizer.",
+    )
+    parser.add_argument(
+        "--mc_model",
+        type=str,
+        default=None,
+        help="Model name or path for the Mistral-Common tokenizer.",
+    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        default="external/transformers/scripts/full_compare_tokenizer_config.json",
+        help="Path to JSON config file with mode-specific settings.",
+    )
+    parser.add_argument(
+        "--random_only",
+        action="store_true",
+        help="Only test random UTF-8 data (for quick testing).",
+    )
+    parser.add_argument(
+        "--hf_token",
+        type=str,
+        help="Hugging Face token for private models.",
+    )
+    parser.add_argument(
+        "--type",
+        type=str,
+        nargs="+",
+        default=["text_encode"],
+        choices=[
+            "text_encode",
+            "vision_encode",
+            "text_instruct",
+            "vision_instruct",
+            "tool_call_instruct",
+            "text_reasoning",
+            "vision_reasoning",
+            "tool_call_reasoning",
+        ],
+        help="Type(s) of tokenization to perform (space-separated).",
+    )
+    parser.add_argument("--save_results", action="store_true", help="Save results to JSON files.")
+    args = parser.parse_args()
+
+    with open(args.config, "r") as f:
+        config = json.load(f)
+
+    results, stats = test_tokenizer(
+        args.hf_model,
+        args.mc_model if args.mc_model else args.hf_model,
+        args.random_only,
+        args.type,
+        args.hf_token,
+        config,
+    )
+
+    report = generate_mismatch_report(results, stats)
+
+    if args.save_results:
+        save_results(results, report)
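For a quick sanity check outside the CLI, `compare_tokenizers` can also be called
directly. A minimal sketch, assuming it is run from `external/transformers/scripts`
so that `full_compare_tokenizer` is importable; the model name is only an example,
and gated repositories require a valid HF token:

    from transformers import AutoTokenizer

    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

    from full_compare_tokenizer import compare_tokenizers

    model = "mistralai/Mistral-Nemo-Instruct-2407"
    hf_tok = AutoTokenizer.from_pretrained(model)
    mc_tok = MistralTokenizer.from_hf_hub(model)

    # Returns None when the first max_tokens tokens agree; otherwise a dict with
    # the 7-token window around the first divergence.
    print(compare_tokenizers(hf_tok, mc_tok, "Hello, café €5", mode="text_encode"))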
diff --git a/external/transformers/scripts/full_compare_tokenizer_config.json b/external/transformers/scripts/full_compare_tokenizer_config.json
new file mode 100644
index 00000000..3ba66372
--- /dev/null
+++ b/external/transformers/scripts/full_compare_tokenizer_config.json
@@ -0,0 +1,41 @@
+{
+    "text_encode": {
+        "datasets": [
+            {
+                "name": "random_utf8",
+                "num_samples": 1000,
+                "max_tokens": 2048
+            },
+            {
+                "name": "HuggingFaceFW/fineweb",
+                "split": "train",
+                "column": "text",
+                "num_samples": 500,
+                "max_tokens": 2048
+            },
+            {
+                "name": "HuggingFaceFW/finepdfs",
+                "split": "train",
+                "column": "text",
+                "num_samples": 500,
+                "max_tokens": 2048
+            }
+        ]
+    },
+    "text_instruct": {
+        "datasets": [
+            {
+                "name": "random_instruct_utf8",
+                "num_samples": 1000,
+                "max_tokens": 2048
+            },
+            {
+                "name": "HuggingFaceH4/ultrachat_200k",
+                "split": "train_sft",
+                "column": "messages",
+                "num_samples": 500,
+                "max_tokens": 2048
+            }
+        ]
+    }
+}
\ No newline at end of file
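The config is keyed by mode name; each dataset entry supports `name`, an optional
`split` (default "train"), an optional `column` (default "text"), `num_samples`,
and `max_tokens`, matching the `dataset_config.get(...)` defaults in
test_tokenizer. A hypothetical entry for a custom text_encode run might look like
this (the dataset name and column are illustrative, not part of this change):

    {
        "text_encode": {
            "datasets": [
                {
                    "name": "my_org/my_text_dataset",
                    "split": "validation",
                    "column": "content",
                    "num_samples": 200,
                    "max_tokens": 1024
                }
            ]
        }
    }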