Add: rust-bio comparison

ashvardanian · ashvardanian · commit 6b98ebd1d0b3 · 2025-09-27T14:53:06.000Z
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -9,6 +9,7 @@
     "corasick",
     "CUDF",
     "Dataframe",
+    "Gotoh",
     "gxhash",
     "lexsort",
     "Melem",
diff --git a/Cargo.toml b/Cargo.toml
@@ -76,6 +76,7 @@ optional = true
 [dependencies.bio]
 version = "3.0.0"
 default-features = false
+features = ["runtime-dispatch-simd"]
 optional = true
 
 [dependencies.bstr]
diff --git a/README.md b/README.md
@@ -221,19 +221,28 @@ Performing in-place lookups in a precomputed table of 256 bytes:
 
 Edit Distance calculation is a common component of Search Engines, Data Cleaning, and Natural Language Processing, as well as in Bioinformatics.
 It's a computationally expensive operation, generally implemented using dynamic programming, with a quadratic time complexity upper bound.
+For biological sequences, the Needleman-Wunsch and Smith-Waterman algorithms are more appropriate, as they allow overriding the default substitution costs.
+Each of those has two flavors: one with linear gap penalties and one with affine gap penalties, the latter also known as the "Gotoh" variation.
+
+- byte-level and Unicode [Levenshtein](#levenshtein) distance;
+- [Needleman-Wunsch](#needleman-wunsch), [Needleman-Wunsch-Gotoh](#needleman-wunsch-gotoh);
+- [Smith-Waterman](#smith-waterman), [Smith-Waterman-Gotoh](#smith-waterman-gotoh).
+
+### Levenshtein
 
 | Library                                              | ≅ 100 bytes lines | ≅ 1'000 bytes lines |
 | ---------------------------------------------------- | ----------------: | ------------------: |
 | Rust 🦀                                               |                   |
-| `rapidfuzz::levenshtein<Bytes>`                      |       4'633 MCUPS |        14'316 MCUPS |
+| `bio::levenshtein` on 1x SPR                         |         428 MCUPS |           823 MCUPS |
+| `rapidfuzz::levenshtein<Bytes>` on 1x SPR            |       4'633 MCUPS |        14'316 MCUPS |
+| `rapidfuzz::levenshtein<Chars>` on 1x SPR            |       3'877 MCUPS |        13'179 MCUPS |
 | `stringzillas::LevenshteinDistances` on 1x SPR       |       3'315 MCUPS |        13'084 MCUPS |
+| `stringzillas::LevenshteinDistancesUtf8` on 1x SPR   |       3'283 MCUPS |        11'690 MCUPS |
 | `stringzillas::LevenshteinDistances` on 16x SPR      |      29'430 MCUPS |       105'400 MCUPS |
+| `stringzillas::LevenshteinDistancesUtf8` on 16x SPR  |      38'954 MCUPS |       103'500 MCUPS |
 | `stringzillas::LevenshteinDistances` on RTX6000      |  __32'030 MCUPS__ |   __901'990 MCUPS__ |
 | `stringzillas::LevenshteinDistances` on H100         |  __31'913 MCUPS__ |   __925'890 MCUPS__ |
 | `stringzillas::LevenshteinDistances` on 384x GNR     | __114'190 MCUPS__ | __3'084'270 MCUPS__ |
-| `rapidfuzz::levenshtein<Chars>`                      |       3'877 MCUPS |        13'179 MCUPS |
-| `stringzillas::LevenshteinDistancesUtf8` on 1x SPR   |       3'283 MCUPS |        11'690 MCUPS |
-| `stringzillas::LevenshteinDistancesUtf8` on 16x SPR  |      38'954 MCUPS |       103'500 MCUPS |
 | `stringzillas::LevenshteinDistancesUtf8` on 384x GNR | __103'590 MCUPS__ | __2'938'320 MCUPS__ |
 |                                                      |                   |                     |
 | Python 🐍                                             |                   |                     |
@@ -250,42 +259,61 @@ It's a computationally expensive operation, generally implemented using dynamic
 | `stringzillas.LevenshteinDistances` batch on 16x SPR |       3'762 MCUPS |       119'261 MCUPS |
 | `stringzillas.LevenshteinDistances` batch on H100    |  __18'081 MCUPS__ |   __320'109 MCUPS__ |
 
-
-For biological sequences, the Needleman-Wunsch and Smith-Waterman algorithms are more appropriate, as they allow overriding the default substitution costs.
-Another common adaptation is to used Gotoh's affine gap penalties, which better model the evolutionary events in DNA and Protein sequences.
+### Needleman-Wunsch
 
 | Library                                               | ≅ 100 bytes lines | ≅ 1'000 bytes lines |
 | ----------------------------------------------------- | ----------------: | ------------------: |
-| Rust 🦀 with linear gaps                               |                   |
+| Rust 🦀                                                |                   |                     |
+| `bio::pairwise::global` on 1x SPR                     |          51 MCUPS |            57 MCUPS |
 | `stringzillas::NeedlemanWunschScores` on 1x SPR       |         278 MCUPS |           612 MCUPS |
 | `stringzillas::NeedlemanWunschScores` on 16x SPR      |       4'057 MCUPS |         8'492 MCUPS |
 | `stringzillas::NeedlemanWunschScores` on 384x GNR     |  __64'290 MCUPS__ |   __331'340 MCUPS__ |
 | `stringzillas::NeedlemanWunschScores` on H100         |         131 MCUPS |    __12'113 MCUPS__ |
-| `stringzillas::SmithWatermanScores` on 1x SPR         |         263 MCUPS |           552 MCUPS |
-| `stringzillas::SmithWatermanScores` on 16x SPR        |       3'883 MCUPS |         8'011 MCUPS |
-| `stringzillas::SmithWatermanScores` on 384x GNR       |  __58'880 MCUPS__ |   __285'480 MCUPS__ |
-| `stringzillas::SmithWatermanScores` on H100           |         143 MCUPS |    __12'921 MCUPS__ |
 |                                                       |                   |                     |
-| Python 🐍 with linear gaps                             |                   |                     |
+| Python 🐍                                              |                   |                     |
 | `biopython.PairwiseAligner.score` on 1x SPR           |          95 MCUPS |           557 MCUPS |
 | `stringzillas.NeedlemanWunschScores` on 1x SPR        |          30 MCUPS |           481 MCUPS |
 | `stringzillas.NeedlemanWunschScores` batch on 1x SPR  |         246 MCUPS |           570 MCUPS |
 | `stringzillas.NeedlemanWunschScores` batch on 16x SPR |       3'103 MCUPS |         9'208 MCUPS |
 | `stringzillas.NeedlemanWunschScores` batch on H100    |         127 MCUPS |        12'246 MCUPS |
-| `stringzillas.SmithWatermanScores` on 1x SPR          |          28 MCUPS |           440 MCUPS |
-| `stringzillas.SmithWatermanScores` batch on 1x SPR    |         255 MCUPS |           582 MCUPS |
-| `stringzillas.SmithWatermanScores` batch on 16x SPR   |   __3'535 MCUPS__ |         8'235 MCUPS |
-| `stringzillas.SmithWatermanScores` batch on H100      |         130 MCUPS |    __12'702 MCUPS__ |
-|                                                       |                   |                     |
-| Rust 🦀 with affine gaps                               |                   |                     |
-| `stringzillas::NeedlemanWunschScores` on 1x SPR       |          83 MCUPS |           354 MCUPS |
-| `stringzillas::NeedlemanWunschScores` on 16x SPR      |       1'267 MCUPS |         4'694 MCUPS |
-| `stringzillas::NeedlemanWunschScores` on 384x GNR     |  __42'050 MCUPS__ |   __155'920 MCUPS__ |
-| `stringzillas::NeedlemanWunschScores` on H100         |         128 MCUPS |    __13'799 MCUPS__ |
-| `stringzillas::SmithWatermanScores` on 1x SPR         |          79 MCUPS |           284 MCUPS |
-| `stringzillas::SmithWatermanScores` on 16x SPR        |       1'026 MCUPS |         3'776 MCUPS |
-| `stringzillas::SmithWatermanScores` on 384x GNR       |  __38'430 MCUPS__ |   __129'140 MCUPS__ |
-| `stringzillas::SmithWatermanScores` on H100           |         127 MCUPS |    __13'205 MCUPS__ |
+
+### Smith-Waterman
+
+| Library                                             | ≅ 100 bytes lines | ≅ 1'000 bytes lines |
+| --------------------------------------------------- | ----------------: | ------------------: |
+| Rust 🦀                                              |                   |                     |
+| `bio::pairwise::local` on 1x SPR                    |          49 MCUPS |            50 MCUPS |
+| `stringzillas::SmithWatermanScores` on 1x SPR       |         263 MCUPS |           552 MCUPS |
+| `stringzillas::SmithWatermanScores` on 16x SPR      |       3'883 MCUPS |         8'011 MCUPS |
+| `stringzillas::SmithWatermanScores` on 384x GNR     |  __58'880 MCUPS__ |   __285'480 MCUPS__ |
+| `stringzillas::SmithWatermanScores` on H100         |         143 MCUPS |    __12'921 MCUPS__ |
+|                                                     |                   |                     |
+| Python 🐍                                            |                   |                     |
+| `biopython.PairwiseAligner.score` on 1x SPR         |          95 MCUPS |           557 MCUPS |
+| `stringzillas.SmithWatermanScores` on 1x SPR        |          28 MCUPS |           440 MCUPS |
+| `stringzillas.SmithWatermanScores` batch on 1x SPR  |         255 MCUPS |           582 MCUPS |
+| `stringzillas.SmithWatermanScores` batch on 16x SPR |   __3'535 MCUPS__ |         8'235 MCUPS |
+| `stringzillas.SmithWatermanScores` batch on H100    |         130 MCUPS |    __12'702 MCUPS__ |
+
+### Needleman-Wunsch-Gotoh
+
+| Library                                           | ≅ 100 bytes lines | ≅ 1'000 bytes lines |
+| ------------------------------------------------- | ----------------: | ------------------: |
+| Rust 🦀                                            |                   |                     |
+| `stringzillas::NeedlemanWunschScores` on 1x SPR   |          83 MCUPS |           354 MCUPS |
+| `stringzillas::NeedlemanWunschScores` on 16x SPR  |       1'267 MCUPS |         4'694 MCUPS |
+| `stringzillas::NeedlemanWunschScores` on 384x GNR |  __42'050 MCUPS__ |   __155'920 MCUPS__ |
+| `stringzillas::NeedlemanWunschScores` on H100     |         128 MCUPS |    __13'799 MCUPS__ |
+
+### Smith-Waterman-Gotoh
+
+| Library                                         | ≅ 100 bytes lines | ≅ 1'000 bytes lines |
+| ----------------------------------------------- | ----------------: | ------------------: |
+| Rust 🦀                                          |                   |                     |
+| `stringzillas::SmithWatermanScores` on 1x SPR   |          79 MCUPS |           284 MCUPS |
+| `stringzillas::SmithWatermanScores` on 16x SPR  |       1'026 MCUPS |         3'776 MCUPS |
+| `stringzillas::SmithWatermanScores` on 384x GNR |  __38'430 MCUPS__ |   __129'140 MCUPS__ |
+| `stringzillas::SmithWatermanScores` on H100     |         127 MCUPS |    __13'205 MCUPS__ |
 
 ## Byte-level Fingerprinting & Sketching Benchmarks
 
diff --git a/bench_fingerprints.rs b/bench_fingerprints.rs
@@ -387,7 +387,10 @@ fn bench_fingerprints(c: &mut Criterion<HashesWallTime>) {
     }
 
     // StringZilla: Nx CPU
-    if should_run(&format!("fingerprinting/stringzillas::Fingerprints({}xCPU)", num_cores)) {
+    if should_run(&format!(
+        "fingerprinting/stringzillas::Fingerprints({}xCPU)",
+        num_cores
+    )) {
         g.throughput(Throughput::Elements(per_batch_hash_ops));
         g.bench_function(
             &format!("stringzillas::Fingerprints({}xCPU)", num_cores),
diff --git a/bench_similarities.rs b/bench_similarities.rs
@@ -50,6 +50,7 @@ use criterion::{Criterion, Throughput};
 use fork_union::count_logical_cores;
 use stringtape::{BytesTape, BytesTapeView, CharsTapeView};
 
+use bio::alignment::{distance as bio_distance, pairwise::Aligner};
 use rapidfuzz::distance::levenshtein;
 use stringzilla::szs::{
     error_costs_256x256_unary, AnyBytesTape, AnyCharsTape, DeviceScope, LevenshteinDistances,
@@ -434,8 +435,22 @@ fn perform_uniform_benchmarks(
                 levenshtein::distance(a_str.chars(), b_str.chars())
             })
         });
+    }
 
-        // StringZilla Binary Levenshtein Distance (uniform costs: 0,1,1,1)
+    if should_run("uniform/bio::levenshtein(1xCPU)") {
+        g.throughput(Throughput::Elements(per_pair_bytes));
+        g.bench_function("bio::levenshtein(1xCPU)", |b| {
+            let mut pair_index = 0;
+            b.iter(|| {
+                let a_bytes = &tape_a_view[pair_index % pairs_count];
+                let b_bytes = &tape_b_view[pair_index % pairs_count];
+                pair_index = (pair_index + 1) % pairs_count;
+                std::hint::black_box(bio_distance::levenshtein(a_bytes, b_bytes))
+            })
+        });
+    }
+
+    if should_run("uniform/stringzillas::LevenshteinDistances(1xCPU)") {
         g.throughput(Throughput::Elements(per_batch_bytes));
         g.bench_function("stringzillas::LevenshteinDistances(1xCPU)", |b| {
             let mut results = UnifiedVec::<usize>::with_capacity_in(batch_size, UnifiedAlloc);
@@ -644,7 +659,65 @@ fn perform_linear_benchmarks(
         .ok()
         .and_then(|gpu| SmithWatermanScores::new(gpu, &matrix, -2, -2).ok());
 
+    let mut max_len = 0usize;
+    for idx in 0..pairs_count {
+        let a_len = tape_a_view[idx].len();
+        let b_len = tape_b_view[idx].len();
+        if a_len > max_len {
+            max_len = a_len;
+        }
+        if b_len > max_len {
+            max_len = b_len;
+        }
+    }
+    let max_len = std::cmp::max(1, max_len);
+
     let per_batch = (batch_size as u64) * avg_cells_bytes;
+    let per_pair = avg_cells_bytes;
+
+    if should_run("linear/bio::pairwise::global(1xCPU)") {
+        g.throughput(Throughput::Elements(per_pair));
+        g.bench_function("bio::pairwise::global(1xCPU)", |b| {
+            let mut aligner =
+                Aligner::with_capacity(
+                    max_len,
+                    max_len,
+                    -2,
+                    -2,
+                    |a: u8, b: u8| if a == b { 2 } else { -1 },
+                );
+            let mut pair_index = 0;
+            b.iter(|| {
+                let a_bytes = &tape_a_view[pair_index % pairs_count];
+                let b_bytes = &tape_b_view[pair_index % pairs_count];
+                pair_index = (pair_index + 1) % pairs_count;
+                let score = aligner.global(a_bytes, b_bytes).score;
+                std::hint::black_box(score);
+            })
+        });
+    }
+
+    if should_run("linear/bio::pairwise::local(1xCPU)") {
+        g.throughput(Throughput::Elements(per_pair));
+        g.bench_function("bio::pairwise::local(1xCPU)", |b| {
+            let mut aligner =
+                Aligner::with_capacity(
+                    max_len,
+                    max_len,
+                    -2,
+                    -2,
+                    |a: u8, b: u8| if a == b { 2 } else { -1 },
+                );
+            let mut pair_index = 0;
+            b.iter(|| {
+                let a_bytes = &tape_a_view[pair_index % pairs_count];
+                let b_bytes = &tape_b_view[pair_index % pairs_count];
+                pair_index = (pair_index + 1) % pairs_count;
+                let score = aligner.local(a_bytes, b_bytes).score;
+                std::hint::black_box(score);
+            })
+        });
+    }
 
     // Needleman-Wunsch (Global alignment)
     if should_run("stringzillas::NeedlemanWunschScores(1xCPU)") {
@@ -890,7 +963,65 @@ fn perform_affine_benchmarks(
         .ok()
         .and_then(|gpu| SmithWatermanScores::new(gpu, &matrix, -5, -1).ok());
 
+    let mut max_len = 0usize;
+    for idx in 0..pairs_count {
+        let a_len = tape_a_view[idx].len();
+        let b_len = tape_b_view[idx].len();
+        if a_len > max_len {
+            max_len = a_len;
+        }
+        if b_len > max_len {
+            max_len = b_len;
+        }
+    }
+    let max_len = std::cmp::max(1, max_len);
+
     let per_batch = (batch_size as u64) * avg_cells_bytes;
+    let per_pair = avg_cells_bytes;
+
+    if should_run("affine/bio::pairwise::global(1xCPU)") {
+        g.throughput(Throughput::Elements(per_pair));
+        g.bench_function("bio::pairwise::global(1xCPU)", |b| {
+            let mut aligner =
+                Aligner::with_capacity(
+                    max_len,
+                    max_len,
+                    -5,
+                    -1,
+                    |a: u8, b: u8| if a == b { 2 } else { -1 },
+                );
+            let mut pair_index = 0;
+            b.iter(|| {
+                let a_bytes = &tape_a_view[pair_index % pairs_count];
+                let b_bytes = &tape_b_view[pair_index % pairs_count];
+                pair_index = (pair_index + 1) % pairs_count;
+                let score = aligner.global(a_bytes, b_bytes).score;
+                std::hint::black_box(score);
+            })
+        });
+    }
+
+    if should_run("affine/bio::pairwise::local(1xCPU)") {
+        g.throughput(Throughput::Elements(per_pair));
+        g.bench_function("bio::pairwise::local(1xCPU)", |b| {
+            let mut aligner =
+                Aligner::with_capacity(
+                    max_len,
+                    max_len,
+                    -5,
+                    -1,
+                    |a: u8, b: u8| if a == b { 2 } else { -1 },
+                );
+            let mut pair_index = 0;
+            b.iter(|| {
+                let a_bytes = &tape_a_view[pair_index % pairs_count];
+                let b_bytes = &tape_b_view[pair_index % pairs_count];
+                pair_index = (pair_index + 1) % pairs_count;
+                let score = aligner.local(a_bytes, b_bytes).score;
+                std::hint::black_box(score);
+            })
+        });
+    }
 
     // Needleman-Wunsch (Global alignment)
     if should_run("stringzillas::NeedlemanWunschScores(1xCPU)") {