ml-rust · farhan-syah · Dec 2, 2025 · Dec 2, 2025 · Dec 2, 2025 · Dec 2, 2025
diff --git a/.gitignore b/.gitignore
@@ -60,4 +60,7 @@ coverage.xml
 *.cover
 
 /research
-.python-version
+.python-version
+
+# Local cargo config (dev overrides)
+.cargo/
diff --git a/Cargo.toml b/Cargo.toml
@@ -14,9 +14,13 @@ categories = ["text-processing", "encoding"]
 name = "splintr"
 crate-type = ["cdylib", "rlib"]
 
+[features]
+default = []
+pcre2 = ["dep:pcre2"]
+
 [dependencies]
-# PCRE2 regex with JIT support (2-4x faster than fancy-regex)
-pcre2 = "0.2"
+# PCRE2 regex with JIT support (optional backend, enabled via the `pcre2` feature)
+pcre2 = { version = "0.2", optional = true }
 # Rayon for internal parallelism
 rayon = "1.10"
 # Fast hashing (FxHashMap)
@@ -31,6 +35,12 @@ base64 = "0.22"
 aho-corasick = "1.1"
 # LRU cache for frequent token sequences
 lru = "0.12"
+# regexr regex engine (default backend)
+regexr = { version = "0.1.0-beta.4", features = ["jit", "simd"] }
+
+[dev-dependencies]
+# PCRE2 for benchmarking comparisons
+pcre2 = "0.2"
 
 [profile.release]
 opt-level = 3

diff --git a/README.md b/README.md
@@ -82,7 +82,7 @@ See the [API Guide](docs/api_guide.md) and [docs.rs](https://docs.rs/splintr) fo
 - **Compatible vocabularies** - Supports cl100k_base, o200k_base (OpenAI), Llama 3 family (Meta), and DeepSeek V3 (DeepSeek)
 - **Streaming decoders** - Real-time LLM output display with proper UTF-8 handling ([guide](docs/api_guide.md#streaming-decoder))
 - **54 agent tokens** - Built-in support for chat, CoT reasoning, ReAct agents, tool calling, RAG citations ([docs](docs/special_tokens.md))
-- **Battle-tested algorithms** - PCRE2 with JIT, Aho-Corasick for special tokens, linked-list BPE
+- **Battle-tested algorithms** - Regexr with JIT (pure Rust), Aho-Corasick for special tokens, linked-list BPE
 
 **Cross-platform:**
 
@@ -154,6 +154,43 @@ cat results/my_benchmark.md
 
 The benchmark suite tests single text encoding, batch encoding, streaming decoder performance, and special token handling across various content types.
 
+### Regex Backends
+
+Splintr uses a pure-Rust regex engine ([`regexr`](https://crates.io/crates/regexr)) by default, with optional PCRE2 support for compatibility.
+
+**Default Backend (regexr):**
+- Pure Rust implementation (no C dependencies)
+- JIT compilation and SIMD acceleration
+- Native UTF-8 and Unicode property support
+
+**Optional PCRE2 Backend:**
+
+```python
+from splintr import Tokenizer
+
+# Default: regexr backend (pure Rust)
+tokenizer = Tokenizer.from_pretrained("cl100k_base")
+
+# Optional: switch to PCRE2 (only works if splintr was built with --features pcre2)
+tokenizer = Tokenizer.from_pretrained("cl100k_base").pcre2(True)
+```
+
+To enable PCRE2, build with the feature flag:
+
+```bash
+maturin develop --release --features pcre2
+```
+
+**Benchmarking:**
+
+```bash
+# Compare backends (requires PCRE2 feature)
+python benchmarks/benchmark_regexr_comparison.py --model cl100k_base
+
+# Visual comparison with charts
+python benchmarks/benchmark_regexr_viz.py --model cl100k_base
+```
+
 ## Streaming Decoders
 
 For real-time LLM applications where tokens arrive one at a time, Splintr provides streaming decoders that handle UTF-8 boundary alignment:
@@ -226,7 +263,7 @@ See [docs/special_tokens.md](docs/special_tokens.md) for the complete list and [
 
 Splintr implements several optimizations that make tokenization faster:
 
-- **PCRE2 with JIT compilation**: 2-4x speedup on regex pattern matching
+- **Regexr with JIT compilation**: Pure Rust regex engine with SIMD acceleration
 - **Rayon parallelism**: Leverages multiple CPU cores for batch encoding
 - **Linked-list BPE algorithm**: Avoids O(N²) complexity on pathological inputs
 - **FxHashMap**: Faster lookups than default SipHash for non-adversarial contexts

diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
@@ -32,8 +32,16 @@
 try:
     from splintr import Tokenizer as SplintrTokenizer
     HAS_SPLINTR = True
+    # Probe whether this build includes the optional PCRE2 backend (a ValueError from .pcre2(True) is treated as "not available")
+    try:
+        test_tok = SplintrTokenizer.from_pretrained("cl100k_base").pcre2(True)
+        HAS_PCRE2 = True
+        del test_tok
+    except ValueError:
+        HAS_PCRE2 = False
 except ImportError:
     HAS_SPLINTR = False
+    HAS_PCRE2 = False
     print("Warning: splintr not installed. Run: pip install -e . or maturin develop")
 
 try:
@@ -671,6 +679,13 @@ def main():
         action="store_true",
         help="Skip cache benchmarks"
     )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="regexr",
+        choices=["regexr", "pcre2"],
+        help="Regex backend to use: regexr (default, pure Rust) or pcre2 (requires feature flag)"
+    )
     args = parser.parse_args()
 
     if not HAS_SPLINTR:
@@ -689,9 +704,19 @@ def main():
     print("=" * 70)
 
     # Load tokenizers
-    print(f"\nLoading tokenizers (model: {args.model})...")
-    splintr_enc = SplintrTokenizer.from_pretrained(args.model)
-    print(f"  Splintr: {splintr_enc}")
+    backend_str = "PCRE2" if args.backend == "pcre2" else "Regexr"
+    print(f"\nLoading tokenizers (model: {args.model}, backend: {backend_str})...")
+
+    if args.backend == "pcre2":
+        if not HAS_PCRE2:
+            print("Error: PCRE2 backend requested but not available.")
+            print("       Build with: maturin develop --release --features pcre2")
+            return 1
+        splintr_enc = SplintrTokenizer.from_pretrained(args.model).pcre2(True)
+    else:  # regexr (default)
+        splintr_enc = SplintrTokenizer.from_pretrained(args.model)
+
+    print(f"  Splintr ({backend_str}): {splintr_enc}")
 
     tiktoken_enc = None
     if args.compare or args.correctness_only:

diff --git a/benchmarks/benchmark_batch.py b/benchmarks/benchmark_batch.py
@@ -23,10 +23,11 @@
 Tokenizers convert text into numerical representations that models can understand."""
 
 TOKENIZER_COLORS = {
-    "splintr": "#2ecc71",      # Green
-    "tiktoken": "#3498db",     # Blue
-    "huggingface": "#e74c3c",  # Red
-    "tokendagger": "#9b59b6",  # Purple
+    "splintr": "#2ecc71",          # Green (default, pure Rust)
+    "splintr-pcre2": "#27ae60",    # Dark Green (optional)
+    "tiktoken": "#3498db",         # Blue
+    "huggingface": "#e74c3c",      # Red
+    "tokendagger": "#9b59b6",      # Purple
 }
 
 
@@ -86,14 +87,15 @@ def load_tokenizers():
     """Load all available tokenizers with batch functions.
 
     All tokenizers use their native batch encoding methods:
-    - splintr: encode_batch (Rayon parallel)
+    - splintr: encode_batch (Rayon parallel, pure Rust regex with JIT)
+    - splintr-pcre2: encode_batch (Rayon parallel, PCRE2 with JIT)
     - tiktoken: encode_ordinary_batch (native batch)
     - huggingface: encode_batch (native batch)
     - tokendagger: encode_batch (native batch)
     """
     tokenizers = {}
 
-    # splintr - native batch via Rayon
+    # splintr - default regexr backend (pure Rust with JIT), native batch via Rayon
     try:
         import splintr
         enc = splintr.Tokenizer.from_pretrained("cl100k_base")
@@ -102,6 +104,15 @@ def load_tokenizers():
     except ImportError:
         print("splintr not available")
 
+    # splintr-pcre2 - optional backend (requires --features pcre2)
+    try:
+        import splintr
+        enc_pcre2 = splintr.Tokenizer.from_pretrained("cl100k_base").pcre2(True)
+        tokenizers["splintr-pcre2"] = enc_pcre2.encode_batch
+        print("Loaded: splintr-pcre2 (native encode_batch)")
+    except (ImportError, ValueError) as e:
+        print(f"splintr-pcre2 not available: {e}")
+
     # tiktoken - native batch
     try:
         import tiktoken