Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,7 @@ coverage.xml
*.cover

/research
.python-version
.python-version

# Local cargo config (dev overrides)
.cargo/
14 changes: 12 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,13 @@ categories = ["text-processing", "encoding"]
name = "splintr"
crate-type = ["cdylib", "rlib"]

[features]
default = []
pcre2 = ["dep:pcre2"]

[dependencies]
# PCRE2 regex with JIT support (2-4x faster than fancy-regex)
pcre2 = "0.2"
# PCRE2 regex with JIT support (optional backend; enabled by the `pcre2` feature)
pcre2 = { version = "0.2", optional = true }
# Rayon for internal parallelism
rayon = "1.10"
# Fast hashing (FxHashMap)
Expand All @@ -31,6 +35,12 @@ base64 = "0.22"
aho-corasick = "1.1"
# LRU cache for frequent token sequences
lru = "0.12"
# regexr regex engine (default backend)
regexr = { version = "0.1.0-beta.4", features = ["jit", "simd"] }

[dev-dependencies]
# PCRE2 for benchmarking comparisons
pcre2 = "0.2"

[profile.release]
opt-level = 3
Expand Down
41 changes: 39 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ See the [API Guide](docs/api_guide.md) and [docs.rs](https://docs.rs/splintr) fo
- **Compatible vocabularies** - Supports cl100k_base, o200k_base (OpenAI), Llama 3 family (Meta), and DeepSeek V3 (DeepSeek)
- **Streaming decoders** - Real-time LLM output display with proper UTF-8 handling ([guide](docs/api_guide.md#streaming-decoder))
- **54 agent tokens** - Built-in support for chat, CoT reasoning, ReAct agents, tool calling, RAG citations ([docs](docs/special_tokens.md))
- **Battle-tested algorithms** - PCRE2 with JIT, Aho-Corasick for special tokens, linked-list BPE
- **Battle-tested algorithms** - Regexr with JIT (pure Rust), Aho-Corasick for special tokens, linked-list BPE

**Cross-platform:**

Expand Down Expand Up @@ -154,6 +154,43 @@ cat results/my_benchmark.md

The benchmark suite tests single text encoding, batch encoding, streaming decoder performance, and special token handling across various content types.

### Regex Backends

Splintr uses a pure-Rust regex engine ([`regexr`](https://crates.io/crates/regexr)) by default, with optional PCRE2 support for compatibility.

**Default Backend (regexr):**
- Pure Rust implementation (no C dependencies)
- JIT compilation and SIMD acceleration
- Native UTF-8 and Unicode property support

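**Optional PCRE2 Backend:**

Call `.pcre2(True)` on a tokenizer to switch it to PCRE2. This only works when the extension was built with the `pcre2` feature; otherwise the call raises `ValueError`, which the benchmark scripts use to detect whether PCRE2 is available.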

```python
from splintr import Tokenizer

# Default: regexr backend (pure Rust)
tokenizer = Tokenizer.from_pretrained("cl100k_base")

# Optional: switch to PCRE2 (requires --features pcre2)
tokenizer = Tokenizer.from_pretrained("cl100k_base").pcre2(True)
```

To enable PCRE2, build with the feature flag:

```bash
maturin develop --release --features pcre2
```

**Benchmarking:**

```bash
# Compare backends (requires a build with --features pcre2)
python benchmarks/benchmark_regexr_comparison.py --model cl100k_base

# Visual comparison with charts
python benchmarks/benchmark_regexr_viz.py --model cl100k_base
```

## Streaming Decoders

For real-time LLM applications where tokens arrive one at a time, Splintr provides streaming decoders that handle UTF-8 boundary alignment:
Expand Down Expand Up @@ -226,7 +263,7 @@ See [docs/special_tokens.md](docs/special_tokens.md) for the complete list and [

Splintr implements several optimizations that make tokenization faster:

- **PCRE2 with JIT compilation**: 2-4x speedup on regex pattern matching
- **Regexr with JIT compilation**: Pure Rust regex engine with SIMD acceleration
- **Rayon parallelism**: Leverages multiple CPU cores for batch encoding
- **Linked-list BPE algorithm**: Avoids O(N²) complexity on pathological inputs
- **FxHashMap**: Faster lookups than default SipHash for non-adversarial contexts
Expand Down
31 changes: 28 additions & 3 deletions benchmarks/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,16 @@
try:
from splintr import Tokenizer as SplintrTokenizer
HAS_SPLINTR = True
# Test if PCRE2 is available
try:
test_tok = SplintrTokenizer.from_pretrained("cl100k_base").pcre2(True)
HAS_PCRE2 = True
del test_tok
except ValueError:
HAS_PCRE2 = False
except ImportError:
HAS_SPLINTR = False
HAS_PCRE2 = False
print("Warning: splintr not installed. Run: pip install -e . or maturin develop")

try:
Expand Down Expand Up @@ -671,6 +679,13 @@ def main():
action="store_true",
help="Skip cache benchmarks"
)
parser.add_argument(
"--backend",
type=str,
default="regexr",
choices=["regexr", "pcre2"],
help="Regex backend to use: regexr (default, pure Rust) or pcre2 (requires feature flag)"
)
args = parser.parse_args()

if not HAS_SPLINTR:
Expand All @@ -689,9 +704,19 @@ def main():
print("=" * 70)

# Load tokenizers
print(f"\nLoading tokenizers (model: {args.model})...")
splintr_enc = SplintrTokenizer.from_pretrained(args.model)
print(f" Splintr: {splintr_enc}")
backend_str = "PCRE2" if args.backend == "pcre2" else "Regexr"
print(f"\nLoading tokenizers (model: {args.model}, backend: {backend_str})...")

if args.backend == "pcre2":
if not HAS_PCRE2:
print("Error: PCRE2 backend requested but not available.")
print(" Build with: maturin develop --release --features pcre2")
return 1
splintr_enc = SplintrTokenizer.from_pretrained(args.model).pcre2(True)
else: # regexr (default)
splintr_enc = SplintrTokenizer.from_pretrained(args.model)

print(f" Splintr ({backend_str}): {splintr_enc}")

tiktoken_enc = None
if args.compare or args.correctness_only:
Expand Down
23 changes: 17 additions & 6 deletions benchmarks/benchmark_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,11 @@
Tokenizers convert text into numerical representations that models can understand."""

TOKENIZER_COLORS = {
"splintr": "#2ecc71", # Green
"tiktoken": "#3498db", # Blue
"huggingface": "#e74c3c", # Red
"tokendagger": "#9b59b6", # Purple
"splintr": "#2ecc71", # Green (default, pure Rust)
"splintr-pcre2": "#27ae60", # Dark Green (optional)
"tiktoken": "#3498db", # Blue
"huggingface": "#e74c3c", # Red
"tokendagger": "#9b59b6", # Purple
}


Expand Down Expand Up @@ -86,14 +87,15 @@ def load_tokenizers():
"""Load all available tokenizers with batch functions.

All tokenizers use their native batch encoding methods:
- splintr: encode_batch (Rayon parallel)
- splintr: encode_batch (Rayon parallel, pure Rust regex with JIT)
- splintr-pcre2: encode_batch (Rayon parallel, PCRE2 with JIT)
- tiktoken: encode_ordinary_batch (native batch)
- huggingface: encode_batch (native batch)
- tokendagger: encode_batch (native batch)
"""
tokenizers = {}

# splintr - native batch via Rayon
# splintr - default backend (pure Rust with JIT)
try:
import splintr
enc = splintr.Tokenizer.from_pretrained("cl100k_base")
Expand All @@ -102,6 +104,15 @@ def load_tokenizers():
except ImportError:
print("splintr not available")

# splintr-pcre2 - optional backend (requires --features pcre2)
try:
import splintr
enc_pcre2 = splintr.Tokenizer.from_pretrained("cl100k_base").pcre2(True)
tokenizers["splintr-pcre2"] = enc_pcre2.encode_batch
print("Loaded: splintr-pcre2 (native encode_batch)")
except (ImportError, ValueError) as e:
print(f"splintr-pcre2 not available: {e}")

# tiktoken - native batch
try:
import tiktoken
Expand Down
Loading
Loading