From 21044ae273e273cc5549aa8e7853a36ebb98d3ec Mon Sep 17 00:00:00 2001 From: Noah Gift Date: Sat, 2 May 2026 11:36:53 +0200 Subject: [PATCH] feat(format): cpu-work-stealing-v1 + encoder-forward-v1 8-gate PARTIAL discharge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bundles two sister contracts in one verdict module: cpu-work-stealing-v1 (FALSIFY-WS-001..004): - WS-001: dispatch overhead < 1ms per forward pass - WS-002: L1 cache miss rate < 5% during inner loop - WS-003: matvec parity vs Rayon within 1e-6 - WS-004: 4-thread throughput ≥ 3.5× single-thread encoder-forward-v1 (FALSIFY-ENC-001..004): - ENC-001: 12 encoder layers preserve (n, 768) shape - ENC-002: every output element finite for inputs in [-10, 10] - ENC-003: aprender output matches HF reference within 1e-4 - ENC-004: CLS pooling extracts encoder_output[0] bit-exactly ## Five Whys 1. Why bundle these two contracts? Both peripheral, span the parallel-runtime + encoder-forward coverage band; one verdict module captures both without duplicate provenance pin. 2. Why does this block ship? Coverage % cannot move while these peripheral contracts are unbound at PARTIAL_ALGORITHM_LEVEL. 3. Why strict `<` for WS-001 (not `<=`)? The contract says "< 1ms per forward pass." Equality at exactly 1ms would mean the dispatcher is consuming the entire budget — there's no room for the actual matmul. Strict `<` catches the regression class "atomic contention saturated the overhead window." 4. Why bit-exact (`to_bits()`) for ENC-004 (CLS pooling)? The spec calls it "extract row 0" — pooling is a pure index operation, no float arithmetic. Any drift between `encoder_output[0]` and `cls_embedding` indicates the pooler is averaging or selecting a different row, not just precision loss. Strict bit-equal catches the regression class. 5. Why a separate dimension check for ENC-001 (`AC_ENC_HIDDEN_DIM` AND `AC_ENC_LAYER_COUNT` AND seq-len preservation)? The contract bundles three invariants — count of layers, hidden dim 768, sequence length preserved through layers. A single shape-equal check would let "layer dropped a dim AND added another to compensate" pass. Modeling the three invariants separately catches every mutation class independently. Adds 28 unit tests including 6-bucket scaling sweep + 5-bucket layer-count sweep. Realistic-healthy walks the canonical 4-thread RTX-4090 + BERT-base scenario; pre-fix walks 8 simultaneous regressions across both contracts. No runtime % shift; algorithm-level coverage advances by 8 gates. --- crates/aprender-core/src/format/mod.rs | 6 + .../src/format/ws_enc_001_008.rs | 421 ++++++++++++++++++ 2 files changed, 427 insertions(+) create mode 100644 crates/aprender-core/src/format/ws_enc_001_008.rs diff --git a/crates/aprender-core/src/format/mod.rs b/crates/aprender-core/src/format/mod.rs index c855d3592..0181f5da2 100644 --- a/crates/aprender-core/src/format/mod.rs +++ b/crates/aprender-core/src/format/mod.rs @@ -550,6 +550,12 @@ pub mod gate_ship_011; // GATE-SHIP-012 — Line-coverage percentage inclusive-floor threshold (≥ 95.0). pub mod gate_ship_012; +// FALSIFY-WS-001..004 + FALSIFY-ENC-001..004 — sister bundle of +// cpu-work-stealing-v1 (dispatch overhead, L1 miss rate, parity vs +// Rayon, scaling efficiency) and encoder-forward-v1 (shape preservation, +// finite output, HF reference parity, CLS pooling). +pub mod ws_enc_001_008; + // Re-export types (PMAT-198 - backward compatibility) pub use types::*; diff --git a/crates/aprender-core/src/format/ws_enc_001_008.rs b/crates/aprender-core/src/format/ws_enc_001_008.rs new file mode 100644 index 000000000..9f96847c2 --- /dev/null +++ b/crates/aprender-core/src/format/ws_enc_001_008.rs @@ -0,0 +1,421 @@ +// Bundles two sister contracts in one verdict module: +// +// `cpu-work-stealing-v1` (FALSIFY-WS-001..004) +// `encoder-forward-v1` (FALSIFY-ENC-001..004) +// +// WS-001: dispatch overhead < 1ms per forward pass +// WS-002: L1 cache miss rate < 5% +// WS-003: matvec parity vs Rayon within 1e-6 +// WS-004: 4-thread throughput ≥ 3.5× single-thread +// ENC-001: 12 encoder layers preserve (n, 768) shape +// ENC-002: no NaN/Inf for inputs in [-10, 10] +// ENC-003: entrenar output matches HF reference within 1e-4 +// ENC-004: CLS pooling extracts encoder_output[0] + +/// WS-001 dispatch overhead ceiling (ms). +pub const AC_WS_DISPATCH_OVERHEAD_MAX_MS: f32 = 1.0; +/// WS-002 L1 cache miss rate ceiling (%). +pub const AC_WS_L1_MISS_RATE_MAX_PCT: f32 = 5.0; +/// WS-003 matvec parity tolerance. +pub const AC_WS_MATVEC_TOLERANCE: f32 = 1e-6; +/// WS-004 scaling efficiency floor (4 threads ≥ 3.5× single). +pub const AC_WS_SCALING_FLOOR: f32 = 3.5; +/// ENC-001 hidden dim. +pub const AC_ENC_HIDDEN_DIM: usize = 768; +/// ENC-001 layer count. +pub const AC_ENC_LAYER_COUNT: u32 = 12; +/// ENC-003 reference parity tolerance. +pub const AC_ENC_REFERENCE_TOLERANCE: f32 = 1e-4; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum WsEncVerdict { + Pass, + Fail, +} + +// ---------------------------------------------------------------- +// WS-001..004 +// ---------------------------------------------------------------- + +#[must_use] +pub fn verdict_from_dispatch_overhead(overhead_ms: f32) -> WsEncVerdict { + if !overhead_ms.is_finite() || overhead_ms < 0.0 { + return WsEncVerdict::Fail; + } + if overhead_ms < AC_WS_DISPATCH_OVERHEAD_MAX_MS { + WsEncVerdict::Pass + } else { + WsEncVerdict::Fail + } +} + +#[must_use] +pub fn verdict_from_l1_miss_rate(miss_rate_pct: f32) -> WsEncVerdict { + if !miss_rate_pct.is_finite() || miss_rate_pct < 0.0 || miss_rate_pct > 100.0 { + return WsEncVerdict::Fail; + } + if miss_rate_pct < AC_WS_L1_MISS_RATE_MAX_PCT { + WsEncVerdict::Pass + } else { + WsEncVerdict::Fail + } +} + +#[must_use] +pub fn verdict_from_matvec_parity(workst: &[f32], rayon: &[f32]) -> WsEncVerdict { + if workst.is_empty() || workst.len() != rayon.len() { + return WsEncVerdict::Fail; + } + for (a, b) in workst.iter().zip(rayon.iter()) { + if !a.is_finite() || !b.is_finite() { + return WsEncVerdict::Fail; + } + if (a - b).abs() > AC_WS_MATVEC_TOLERANCE { + return WsEncVerdict::Fail; + } + } + WsEncVerdict::Pass +} + +#[must_use] +pub fn verdict_from_scaling_efficiency(t1_tps: f32, t4_tps: f32) -> WsEncVerdict { + if !t1_tps.is_finite() || !t4_tps.is_finite() { + return WsEncVerdict::Fail; + } + if t1_tps <= 0.0 || t4_tps <= 0.0 { + return WsEncVerdict::Fail; + } + if t4_tps >= AC_WS_SCALING_FLOOR * t1_tps { + WsEncVerdict::Pass + } else { + WsEncVerdict::Fail + } +} + +// ---------------------------------------------------------------- +// ENC-001..004 +// ---------------------------------------------------------------- + +/// ENC-001: 12 encoder layers preserve (n, 768) shape. +#[must_use] +pub fn verdict_from_shape_preservation( + layer_count: u32, + in_seq_len: usize, + in_hidden: usize, + out_seq_len: usize, + out_hidden: usize, +) -> WsEncVerdict { + if layer_count != AC_ENC_LAYER_COUNT { + return WsEncVerdict::Fail; + } + if in_hidden != AC_ENC_HIDDEN_DIM || out_hidden != AC_ENC_HIDDEN_DIM { + return WsEncVerdict::Fail; + } + if in_seq_len == out_seq_len && in_seq_len > 0 { + WsEncVerdict::Pass + } else { + WsEncVerdict::Fail + } +} + +/// ENC-002: every output element is finite. +#[must_use] +pub fn verdict_from_finite_output(output: &[f32]) -> WsEncVerdict { + if output.is_empty() { + return WsEncVerdict::Fail; + } + if output.iter().all(|x| x.is_finite()) { + WsEncVerdict::Pass + } else { + WsEncVerdict::Fail + } +} + +/// ENC-003: max|aprender_out - hf_reference| ≤ 1e-4. +#[must_use] +pub fn verdict_from_hf_reference_parity( + aprender_out: &[f32], + hf_reference: &[f32], +) -> WsEncVerdict { + if aprender_out.is_empty() || aprender_out.len() != hf_reference.len() { + return WsEncVerdict::Fail; + } + for (a, b) in aprender_out.iter().zip(hf_reference.iter()) { + if !a.is_finite() || !b.is_finite() { + return WsEncVerdict::Fail; + } + if (a - b).abs() > AC_ENC_REFERENCE_TOLERANCE { + return WsEncVerdict::Fail; + } + } + WsEncVerdict::Pass +} + +/// ENC-004: CLS pooling extracts row 0 of the encoder output. +#[must_use] +pub fn verdict_from_cls_pooling( + encoder_output_first_row: &[f32], + cls_embedding: &[f32], +) -> WsEncVerdict { + if encoder_output_first_row.is_empty() + || encoder_output_first_row.len() != cls_embedding.len() + { + return WsEncVerdict::Fail; + } + for (a, b) in encoder_output_first_row.iter().zip(cls_embedding.iter()) { + if a.to_bits() != b.to_bits() { + return WsEncVerdict::Fail; + } + } + WsEncVerdict::Pass +} + +#[cfg(test)] +mod tests { + use super::*; + + // ----------------------------------------------------------------- + // Section 1: Provenance pin. + // ----------------------------------------------------------------- + #[test] + fn provenance_constants() { + assert_eq!(AC_WS_DISPATCH_OVERHEAD_MAX_MS, 1.0); + assert_eq!(AC_WS_L1_MISS_RATE_MAX_PCT, 5.0); + assert_eq!(AC_WS_MATVEC_TOLERANCE, 1e-6); + assert_eq!(AC_WS_SCALING_FLOOR, 3.5); + assert_eq!(AC_ENC_HIDDEN_DIM, 768); + assert_eq!(AC_ENC_LAYER_COUNT, 12); + assert_eq!(AC_ENC_REFERENCE_TOLERANCE, 1e-4); + } + + // ----------------------------------------------------------------- + // Section 2: WS-001..004. + // ----------------------------------------------------------------- + #[test] + fn fws001_pass_under_1ms() { + let v = verdict_from_dispatch_overhead(0.5); + assert_eq!(v, WsEncVerdict::Pass); + } + + #[test] + fn fws001_fail_at_1ms() { + let v = verdict_from_dispatch_overhead(1.0); // strict < + assert_eq!(v, WsEncVerdict::Fail); + } + + #[test] + fn fws001_fail_negative() { + let v = verdict_from_dispatch_overhead(-0.1); + assert_eq!(v, WsEncVerdict::Fail); + } + + #[test] + fn fws002_pass_low_miss_rate() { + let v = verdict_from_l1_miss_rate(2.0); + assert_eq!(v, WsEncVerdict::Pass); + } + + #[test] + fn fws002_fail_high_miss_rate() { + let v = verdict_from_l1_miss_rate(10.0); + assert_eq!(v, WsEncVerdict::Fail); + } + + #[test] + fn fws003_pass_within_tolerance() { + let workst = vec![1.0_f32, 2.0, 3.0]; + let rayon = vec![1.0000001_f32, 1.9999999, 3.0000001]; + let v = verdict_from_matvec_parity(&workst, &rayon); + assert_eq!(v, WsEncVerdict::Pass); + } + + #[test] + fn fws003_fail_drift() { + let workst = vec![1.0_f32, 2.0]; + let rayon = vec![1.0_f32, 2.5]; + let v = verdict_from_matvec_parity(&workst, &rayon); + assert_eq!(v, WsEncVerdict::Fail); + } + + #[test] + fn fws004_pass_3_5x() { + let v = verdict_from_scaling_efficiency(100.0, 350.0); + assert_eq!(v, WsEncVerdict::Pass); + } + + #[test] + fn fws004_fail_only_2x() { + let v = verdict_from_scaling_efficiency(100.0, 200.0); + assert_eq!(v, WsEncVerdict::Fail); + } + + #[test] + fn fws004_fail_zero_baseline() { + let v = verdict_from_scaling_efficiency(0.0, 100.0); + assert_eq!(v, WsEncVerdict::Fail); + } + + // ----------------------------------------------------------------- + // Section 3: ENC-001..004. + // ----------------------------------------------------------------- + #[test] + fn fenc001_pass_canonical_shape() { + let v = verdict_from_shape_preservation(12, 50, 768, 50, 768); + assert_eq!(v, WsEncVerdict::Pass); + } + + #[test] + fn fenc001_fail_dropped_layer() { + let v = verdict_from_shape_preservation(11, 50, 768, 50, 768); + assert_eq!(v, WsEncVerdict::Fail); + } + + #[test] + fn fenc001_fail_wrong_hidden() { + let v = verdict_from_shape_preservation(12, 50, 768, 50, 1024); + assert_eq!(v, WsEncVerdict::Fail); + } + + #[test] + fn fenc001_fail_seq_len_changed() { + let v = verdict_from_shape_preservation(12, 50, 768, 49, 768); + assert_eq!(v, WsEncVerdict::Fail); + } + + #[test] + fn fenc002_pass_finite_output() { + let v = verdict_from_finite_output(&[1.0, 2.0, -3.0, 0.0]); + assert_eq!(v, WsEncVerdict::Pass); + } + + #[test] + fn fenc002_fail_nan() { + let v = verdict_from_finite_output(&[1.0, f32::NAN]); + assert_eq!(v, WsEncVerdict::Fail); + } + + #[test] + fn fenc002_fail_infinity() { + let v = verdict_from_finite_output(&[1.0, f32::INFINITY]); + assert_eq!(v, WsEncVerdict::Fail); + } + + #[test] + fn fenc003_pass_within_1e_4() { + let apr = vec![1.0_f32, 2.0, 3.0]; + let hf = vec![1.00005_f32, 1.99995, 3.00001]; + let v = verdict_from_hf_reference_parity(&apr, &hf); + assert_eq!(v, WsEncVerdict::Pass); + } + + #[test] + fn fenc003_fail_drift() { + let apr = vec![1.0_f32, 2.0]; + let hf = vec![1.0_f32, 2.5]; + let v = verdict_from_hf_reference_parity(&apr, &hf); + assert_eq!(v, WsEncVerdict::Fail); + } + + #[test] + fn fenc004_pass_first_row_match() { + let row0 = vec![1.0_f32, 2.0, 3.0]; + let cls = row0.clone(); + let v = verdict_from_cls_pooling(&row0, &cls); + assert_eq!(v, WsEncVerdict::Pass); + } + + #[test] + fn fenc004_fail_one_ulp_drift() { + let row0 = vec![1.0_f32, 2.0]; + let bumped = f32::from_bits(2.0_f32.to_bits() + 1); + let cls = vec![1.0_f32, bumped]; + let v = verdict_from_cls_pooling(&row0, &cls); + assert_eq!(v, WsEncVerdict::Fail); + } + + #[test] + fn fenc004_fail_length_mismatch() { + let row0 = vec![1.0_f32, 2.0]; + let cls = vec![1.0_f32]; + let v = verdict_from_cls_pooling(&row0, &cls); + assert_eq!(v, WsEncVerdict::Fail); + } + + // ----------------------------------------------------------------- + // Section 4: Mutation surveys. + // ----------------------------------------------------------------- + #[test] + fn mutation_survey_ws_scaling_band() { + let baseline = 100.0_f32; + for ratio_x10 in [10_u32, 30, 35, 36, 40, 80] { + let t4 = baseline * (ratio_x10 as f32 / 10.0); + let v = verdict_from_scaling_efficiency(baseline, t4); + let want = if t4 >= AC_WS_SCALING_FLOOR * baseline { + WsEncVerdict::Pass + } else { + WsEncVerdict::Fail + }; + assert_eq!(v, want, "ratio={ratio_x10}"); + } + } + + #[test] + fn mutation_survey_enc_layer_count_band() { + for n in [10_u32, 11, 12, 13, 24] { + let v = verdict_from_shape_preservation(n, 50, 768, 50, 768); + let want = if n == 12 { + WsEncVerdict::Pass + } else { + WsEncVerdict::Fail + }; + assert_eq!(v, want, "n={n}"); + } + } + + // ----------------------------------------------------------------- + // Section 5: Realistic. + // ----------------------------------------------------------------- + #[test] + fn realistic_healthy_passes_all_8() { + let v1 = verdict_from_dispatch_overhead(0.3); + let v2 = verdict_from_l1_miss_rate(2.5); + let v3 = verdict_from_matvec_parity(&[1.0_f32, 2.0], &[1.0_f32, 2.0]); + let v4 = verdict_from_scaling_efficiency(100.0, 380.0); + let v5 = verdict_from_shape_preservation(12, 50, 768, 50, 768); + let v6 = verdict_from_finite_output(&[1.0, 2.0, 3.0]); + let v7 = verdict_from_hf_reference_parity(&[1.0_f32, 2.0], &[1.00005, 1.99995]); + let v8 = verdict_from_cls_pooling(&[1.0_f32, 2.0], &[1.0, 2.0]); + for v in [v1, v2, v3, v4, v5, v6, v7, v8] { + assert_eq!(v, WsEncVerdict::Pass); + } + } + + // ----------------------------------------------------------------- + // Section 6: Pre-fix regressions. + // ----------------------------------------------------------------- + #[test] + fn realistic_pre_fix_all_8_failures() { + let v1 = verdict_from_dispatch_overhead(2.0); // 2ms — atomic contention + let v2 = verdict_from_l1_miss_rate(15.0); // tile too big + let v3 = verdict_from_matvec_parity(&[1.0_f32], &[1.5_f32]); // race + let v4 = verdict_from_scaling_efficiency(100.0, 200.0); // only 2× + let v5 = verdict_from_shape_preservation(11, 50, 768, 50, 768); // dropped layer + let v6 = verdict_from_finite_output(&[1.0, f32::NAN]); + let v7 = verdict_from_hf_reference_parity(&[1.0_f32], &[2.0]); + let v8 = verdict_from_cls_pooling(&[1.0_f32], &[2.0]); + for v in [v1, v2, v3, v4, v5, v6, v7, v8] { + assert_eq!(v, WsEncVerdict::Fail); + } + } + + // ----------------------------------------------------------------- + // Section 7: Edge cases. + // ----------------------------------------------------------------- + #[test] + fn edge_empty_inputs_fail_closed() { + let v = verdict_from_matvec_parity(&[], &[]); + assert_eq!(v, WsEncVerdict::Fail); + let v = verdict_from_finite_output(&[]); + assert_eq!(v, WsEncVerdict::Fail); + } +}