Merged
11 changes: 10 additions & 1 deletion .github/workflows/release.yml
@@ -100,6 +100,9 @@ jobs:

- name: Publish to crates.io
run: cargo publish --allow-dirty --token ${{ secrets.CARGO_REGISTRY_TOKEN }}
env:
# Enable PCRE2 JIT compilation
PCRE2_SYS_JIT: '1'

# Build Python wheels for multiple platforms
build-wheels:
@@ -145,13 +148,19 @@ jobs:
run: |
vcpkg install pcre2:x64-windows
echo "PCRE2_SYS_STATIC=1" >> $env:GITHUB_ENV
echo "PCRE2_SYS_JIT=1" >> $env:GITHUB_ENV

- name: Build wheels
uses: PyO3/maturin-action@v1
with:
args: --release --out dist
# Build with all features: pcre2 (default), regexr jit+simd
# Note: regexr's SIMD uses runtime detection, JIT is compiled at build time
args: --release --out dist --features pcre2
sccache: 'true'
manylinux: auto
env:
# Enable PCRE2 JIT compilation
PCRE2_SYS_JIT: '1'

- name: Upload wheels
uses: actions/upload-artifact@v4
2 changes: 1 addition & 1 deletion .version
@@ -1 +1 @@
0.6.0
0.8.0
10 changes: 5 additions & 5 deletions Cargo.toml
@@ -1,11 +1,11 @@
[package]
name = "splintr"
version = "0.6.0"
version = "0.8.0"
edition = "2021"
description = "Fast Rust BPE tokenizer with Python bindings"
license = "MIT"
repository = "https://github.com/farhan/splintr"
homepage = "https://github.com/farhan/splintr"
repository = "https://github.com/ml-rust/splintr"
homepage = "https://github.com/ml-rust/splintr"
readme = "README.md"
keywords = ["tokenizer", "bpe", "tiktoken", "gpt", "llm"]
categories = ["text-processing", "encoding"]
@@ -15,7 +15,7 @@ name = "splintr"
crate-type = ["cdylib", "rlib"]

[features]
default = []
default = ["pcre2"]
python = ["dep:pyo3"]
pcre2 = ["dep:pcre2"]

@@ -37,7 +37,7 @@ aho-corasick = "1.1"
# LRU cache for frequent token sequences
lru = "0.16"
# regexr regex engine (default backend)
regexr = { version = "0.1.0-beta.4", features = ["jit", "simd"] }
regexr = { version = "0.1.0-beta.5", features = ["jit", "simd"] }

[dev-dependencies]
# PCRE2 for benchmarking comparisons
35 changes: 24 additions & 11 deletions README.md
@@ -39,6 +39,9 @@ from splintr import Tokenizer
tokenizer = Tokenizer.from_pretrained("cl100k_base") # OpenAI GPT-4/3.5
# tokenizer = Tokenizer.from_pretrained("llama3") # Meta Llama 3 family
# tokenizer = Tokenizer.from_pretrained("deepseek_v3") # DeepSeek V3/R1
# tokenizer = Tokenizer.from_pretrained("mistral_v1") # Mistral 7B v0.1/v0.2
# tokenizer = Tokenizer.from_pretrained("mistral_v2") # Mistral 7B v0.3, Codestral
# tokenizer = Tokenizer.from_pretrained("mistral_v3") # Mistral NeMo, Large 2

# Encode and decode
tokens = tokenizer.encode("Hello, world!")
@@ -55,7 +58,7 @@ See the [API Guide](docs/api_guide.md) for complete documentation and examples.

```toml
[dependencies]
splintr = "0.6.0"
splintr = "*" # or pin to a specific version
```

```rust
@@ -79,7 +82,7 @@ See the [API Guide](docs/api_guide.md) and [docs.rs](https://docs.rs/splintr) fo

**Built for production:**

- **Compatible vocabularies** - Supports cl100k_base, o200k_base (OpenAI), Llama 3 family (Meta), and DeepSeek V3 (DeepSeek)
- **Compatible vocabularies** - Supports cl100k_base, o200k_base (OpenAI), Llama 3 family (Meta), DeepSeek V3 (DeepSeek), and Mistral V1/V2/V3 (Mistral AI)
- **Streaming decoders** - Real-time LLM output display with proper UTF-8 handling ([guide](docs/api_guide.md#streaming-decoder))
- **54 agent tokens** - Built-in support for chat, CoT reasoning, ReAct agents, tool calling, RAG citations ([docs](docs/special_tokens.md))
- **Battle-tested algorithms** - Regexr with JIT (pure Rust), Aho-Corasick for special tokens, linked-list BPE
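The streaming-decoder bullet above is the subtle one: token boundaries rarely align with UTF-8 character boundaries, so a streaming decoder must buffer incomplete byte sequences instead of emitting mojibake. A minimal pure-Python sketch of that idea (the `StreamBuffer` helper is hypothetical, not splintr's actual API):

```python
class StreamBuffer:
    """Buffer raw token bytes; emit only complete UTF-8 sequences."""

    def __init__(self):
        self._buf = b""

    def push(self, chunk: bytes) -> str:
        self._buf += chunk
        # A UTF-8 sequence is at most 4 bytes, so try progressively
        # shorter prefixes until one decodes cleanly; the undecodable
        # tail stays buffered for the next push.
        for cut in range(len(self._buf), max(len(self._buf) - 4, -1), -1):
            try:
                text = self._buf[:cut].decode("utf-8")
            except UnicodeDecodeError:
                continue
            self._buf = self._buf[cut:]
            return text
        return ""

# "é" is two bytes (0xC3 0xA9); feed them across separate pushes.
sb = StreamBuffer()
parts = [sb.push(b"caf"), sb.push(b"\xc3"), sb.push(b"\xa9")]
```

The middle push returns an empty string rather than a replacement character, which is the behavior a real-time LLM output display needs.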
@@ -139,7 +142,7 @@ This architecture ensures splintr is optimized for the most common tokenization

```bash
# Clone and install
git clone https://github.com/farhan-syah/splintr.git
git clone https://github.com/ml-rust/splintr.git
cd splintr
pip install -e .
pip install tiktoken
@@ -159,6 +162,7 @@ The benchmark suite tests single text encoding, batch encoding, streaming decode
Splintr uses a pure-Rust regex engine ([`regexr`](https://crates.io/crates/regexr)) by default, with optional PCRE2 support for compatibility.

**Default Backend (regexr):**

- Pure Rust implementation (no C dependencies)
- JIT compilation and SIMD acceleration
- Native UTF-8 and Unicode property support
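To illustrate what the regex backend is doing during pretokenization, here is a heavily simplified, ASCII-only stand-in using Python's stdlib `re`. The real cl100k_base pattern relies on Unicode property classes such as `\p{L}` and `\p{N}`, which stdlib `re` does not support — treat this as a sketch of the splitting idea, not the actual pattern:

```python
import re

# Simplified stand-in for a BPE pretokenization pattern: contractions,
# letter runs (keeping their leading space), digit runs, symbol runs,
# and remaining whitespace. Alternation order matters: contractions
# must be tried before the generic letter branch.
SIMPLE_PATTERN = re.compile(
    r"'(?:s|t|re|ve|m|ll|d)"   # common English contractions
    r"| ?[A-Za-z]+"            # words, with optional leading space
    r"| ?[0-9]+"               # digit runs
    r"| ?[^A-Za-z0-9\s]+"      # punctuation/symbols
    r"|\s+"                    # remaining whitespace
)

pieces = SIMPLE_PATTERN.findall("Hello, world! It's 2025.")
```

Each piece is then fed to the BPE merge stage independently, which is what keeps merges from crossing word boundaries.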
@@ -215,12 +219,15 @@ See the [API Guide](docs/api_guide.md#streaming-decoder) for detailed usage, exa

## Supported Vocabularies

| Vocabulary | Used By | Vocabulary Size | Special Tokens | Import Constant |
| -------------- | ----------------------------- | --------------- | -------------- | --------------------- |
| `cl100k_base` | GPT-4, GPT-3.5-turbo | ~100,000 | 5 + 54 agent | `CL100K_BASE_PATTERN` |
| `o200k_base` | GPT-4o | ~200,000 | 2 + 54 agent | `O200K_BASE_PATTERN` |
| `llama3` | Llama 3, 3.1, 3.2, 3.3 (Meta) | ~128,000 | 11 + 54 agent | `LLAMA3_PATTERN` |
| `deepseek_v3` | DeepSeek V3, DeepSeek R1 | ~128,000 | 17 + 54 agent | `LLAMA3_PATTERN` |
| Vocabulary | Used By | Vocabulary Size | Special Tokens | Import Constant |
| ------------- | ----------------------------------- | --------------- | --------------- | -------------------------- |
| `cl100k_base` | GPT-4, GPT-3.5-turbo | ~100,000 | 5 + 54 agent | `CL100K_BASE_PATTERN` |
| `o200k_base` | GPT-4o | ~200,000 | 2 + 54 agent | `O200K_BASE_PATTERN` |
| `llama3` | Llama 3, 3.1, 3.2, 3.3 (Meta) | ~128,000 | 11 + 54 agent | `LLAMA3_PATTERN` |
| `deepseek_v3` | DeepSeek V3, DeepSeek R1 | ~128,000 | 17 + 54 agent | `LLAMA3_PATTERN` |
| `mistral_v1` | Mistral 7B v0.1/v0.2, Mixtral 8x7B | ~32,000 | 3 + 54 agent | `SENTENCEPIECE_PATTERN` |
| `mistral_v2` | Mistral 7B v0.3, Codestral, 8x22B | ~32,768 | 10 + 54 agent | `SENTENCEPIECE_PATTERN` |
| `mistral_v3` | Mistral NeMo, Large 2, Pixtral | ~131,000 | 10 + 54 agent | `MISTRAL_V3_PATTERN` |

**OpenAI standard tokens:**

@@ -235,6 +242,12 @@ See the [API Guide](docs/api_guide.md#streaming-decoder) for detailed usage, exa

- **deepseek_v3**: `<|begin▁of▁sentence|>`, `<|end▁of▁sentence|>`, `<think>`, `</think>`, `<|User|>`, `<|Assistant|>`, `<|EOT|>`, FIM tokens (`<|fim▁hole|>`, `<|fim▁begin|>`, `<|fim▁end|>`), tool calling tokens (`<|tool▁calls▁begin|>`, `<|tool▁call▁begin|>`, etc.)

**Mistral standard tokens:**

- **mistral_v1**: `<unk>`, `<s>`, `</s>` (SentencePiece native)
- **mistral_v2**: Same as V1 + control tokens: `[INST]`, `[/INST]`, `[TOOL_CALLS]`, `[AVAILABLE_TOOLS]`, `[/AVAILABLE_TOOLS]`, `[TOOL_RESULTS]`, `[/TOOL_RESULTS]`
- **mistral_v3**: `<unk>`, `<s>`, `</s>` + control tokens (Tekken/Tiktoken-based, NOT SentencePiece)
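Control tokens like `[INST]` must be split out verbatim before BPE runs. Splintr uses Aho-Corasick for this; the sketch below substitutes a longest-first regex alternation to show the same segmentation idea (the token set here is a small illustrative subset):

```python
import re

# Illustrative subset of special tokens; real vocabularies carry many more.
SPECIAL = ["<s>", "</s>", "[INST]", "[/INST]"]

# Longest-first alternation so longer tokens win over shorter overlaps.
_pattern = re.compile(
    "|".join(re.escape(t) for t in sorted(SPECIAL, key=len, reverse=True))
)

def split_special(text: str):
    """Yield (is_special, piece) segments; ordinary pieces go to BPE."""
    out, pos = [], 0
    for m in _pattern.finditer(text):
        if m.start() > pos:
            out.append((False, text[pos:m.start()]))
        out.append((True, m.group()))
        pos = m.end()
    if pos < len(text):
        out.append((False, text[pos:]))
    return out

segments = split_special("<s>[INST] Hi [/INST]</s>")
```

Special segments map directly to their reserved token IDs; only the `is_special == False` segments enter the pretokenizer and BPE merge loop.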

### Agent Tokens (54 per model)

Splintr extends all vocabularies with 54 specialized tokens for building agent systems:
@@ -317,7 +330,7 @@ Contributions are welcome! Here's how you can help:

```bash
# Clone the repository
git clone https://github.com/farhan-syah/splintr.git
git clone https://github.com/ml-rust/splintr.git
cd splintr

# Install pre-commit hook (recommended)
@@ -357,6 +370,6 @@ If you use Splintr in your research, please cite:
author = {Farhan Syah},
title = {Splintr: High-Performance BPE Tokenizer},
year = {2025},
url = {https://github.com/farhan-syah/splintr}
url = {https://github.com/ml-rust/splintr}
}
```
7 changes: 5 additions & 2 deletions docs/api_guide.md
@@ -38,6 +38,9 @@ tokenizer = Tokenizer.from_pretrained("cl100k_base") # OpenAI GPT-4/3.5
tokenizer = Tokenizer.from_pretrained("o200k_base") # OpenAI GPT-4o
tokenizer = Tokenizer.from_pretrained("llama3") # Meta Llama 3 family
tokenizer = Tokenizer.from_pretrained("deepseek_v3") # DeepSeek V3/R1
tokenizer = Tokenizer.from_pretrained("mistral_v1") # Mistral 7B v0.1/v0.2, Mixtral 8x7B
tokenizer = Tokenizer.from_pretrained("mistral_v2") # Mistral 7B v0.3, Codestral, Mixtral 8x22B
tokenizer = Tokenizer.from_pretrained("mistral_v3") # Mistral NeMo, Large 2, Pixtral
```

**Load from custom vocabulary file:**
@@ -280,7 +283,7 @@ Add Splintr to your `Cargo.toml`:

```toml
[dependencies]
splintr = "0.6.0"
splintr = "*" # or pin to a specific version
```

### Basic Usage
@@ -609,4 +612,4 @@ print("\nDone!")
- [Special Tokens Documentation](special_tokens.md) - Complete agent tokens reference
- [ByteLevel BPE Documentation](bytelevel_bpe.md) - ByteLevel encoding details
- [API Documentation (Rust)](https://docs.rs/splintr) - Complete Rust API reference
- [GitHub Repository](https://github.com/farhan-syah/splintr) - Source code and examples
- [GitHub Repository](https://github.com/ml-rust/splintr) - Source code and examples