Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
af253ab
perf(quantize): AVX2 SIMD Q4_0/Q8_0 dequant — ~8-9× speedup (closes #386)
noahgift May 15, 2026
7241ac3
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
3e5c533
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
0c4016f
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
431093e
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
59a9f38
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
23e1065
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
f010be4
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
64a7362
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
358154e
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
fe7fbcd
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
cfef281
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
d4f5f67
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
867690d
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
e491d97
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
8994d06
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
b17d8c6
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
b683932
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 15, 2026
33c2008
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 16, 2026
010f330
Merge branch 'main' into perf/386-q4-q8-avx2-dequant
noahgift May 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions crates/aprender-core/src/format/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ use std::path::Path;
#[cfg(feature = "format-quantize")]
pub mod quantize;

// GH-386: AVX2 SIMD fast paths for Q4_0/Q8_0 dequant.
#[cfg(feature = "format-quantize")]
mod quantize_simd;

// Homomorphic encryption module (spec: homomorphic-encryption-spec.md)
#[cfg(feature = "format-homomorphic")]
pub mod homomorphic;
Expand Down
51 changes: 38 additions & 13 deletions crates/aprender-core/src/format/quantize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -263,13 +263,24 @@ impl Quantizer for Q8_0Quantizer {
});
}

// GH-386: pre-allocate output and write directly to slice ranges so
// the inner loop is a tight `i8 → f32 * scale` over a fixed-size local
// [i8; 32]. LLVM auto-vectorizes this pattern into AVX2/NEON SIMD,
// replacing the previous `Vec::push` loop that bottlenecked on
// per-element capacity-growth checks.
let mut result = vec![0.0f32; total_elements];
// GH-386: AVX2 fast path. When the host CPU has AVX2, hand off to
// the SIMD implementation in `crate::format::quantize_simd`, which
// processes 32 elements per block as four 8-wide f32 vectors. The
// SIMD output is bit-exact relative to this scalar loop (verified by
// `quantize_simd::tests::scalar_simd_parity_q8_0`).
let mut result = vec![0.0f32; num_blocks * BLOCK_SIZE];
if crate::format::quantize_simd::dequantize_q8_0_avx2_dispatch(
&block.blocks,
num_blocks,
&mut result,
) {
result.truncate(total_elements);
return Ok(result);
}

// Scalar fallback. Pre-allocate output and write directly to slice
// ranges so the inner loop is a tight `i8 → f32 * scale` that LLVM
// can auto-vectorize on non-x86 targets.
for block_idx in 0..num_blocks {
let block_start = block_idx * Q8_0_BLOCK_BYTES;

Expand Down Expand Up @@ -297,6 +308,7 @@ impl Quantizer for Q8_0Quantizer {
}
}

result.truncate(total_elements);
Ok(result)
}

Expand Down Expand Up @@ -390,14 +402,26 @@ impl Quantizer for Q4_0Quantizer {
});
}

// GH-386: pre-allocate output + write to slice ranges. Layout matches
// the interleaved pack used by `quantize` above (byte_i carries data
// positions 2i and 2i+1) — NOT the GGML half-half layout used in
// format::gguf::dequant.rs. Kept identical to the previous code's
// observable behavior; only the dispatch is tightened so LLVM can
// auto-vectorize the per-byte unpack + multiply.
let mut result = vec![0.0f32; total_elements];
// GH-386: AVX2 fast path. When the host CPU has AVX2, hand off to
// the SIMD implementation in `crate::format::quantize_simd`. The
// SIMD path produces bit-exact output relative to this scalar loop
// (verified by `quantize_simd::tests::scalar_simd_parity_q4_0`).
let mut result = vec![0.0f32; num_blocks * BLOCK_SIZE];
if crate::format::quantize_simd::dequantize_q4_0_avx2_dispatch(
&block.blocks,
num_blocks,
&mut result,
) {
result.truncate(total_elements);
return Ok(result);
}

// Scalar fallback. Layout matches the interleaved pack used by
// `quantize` above (byte_i carries data positions 2i and 2i+1) —
// NOT the GGML half-half layout used in format::gguf::dequant.rs.
// Kept identical to the previous code's observable behavior; only
// the dispatch is tightened so LLVM can auto-vectorize the per-byte
// unpack + multiply on non-x86 targets.
for block_idx in 0..num_blocks {
let block_start = block_idx * Q4_0_BLOCK_BYTES;

Expand Down Expand Up @@ -432,6 +456,7 @@ impl Quantizer for Q4_0Quantizer {
}
}

result.truncate(total_elements);
Ok(result)
}

Expand Down
Loading
Loading