From e00c32edef0c030d14f4dc64168506152b80e47f Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Wed, 8 Oct 2025 16:53:05 -0300
Subject: [PATCH 1/5] use par_iter and hardcode accumulators

---
 Cargo.toml                              |   4 +
 benches/sumcheck_svo.rs                 | 104 +++++++++++
 src/sumcheck/small_value_utils.rs       |  18 +-
 src/sumcheck/sumcheck_small_value.rs    |  16 +-
 src/sumcheck/sumcheck_small_value_eq.rs | 229 ++++++++++++++++++------
 5 files changed, 310 insertions(+), 61 deletions(-)
 create mode 100644 benches/sumcheck_svo.rs
diff --git a/Cargo.toml b/Cargo.toml
index 10a965b8..9a3b63b0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -103,3 +103,7 @@ harness = false
 [[bench]]
 name = "sumcheck"
 harness = false
+
+[[bench]]
+name = "sumcheck_svo"
+harness = false
diff --git a/benches/sumcheck_svo.rs b/benches/sumcheck_svo.rs
new file mode 100644
index 00000000..cefda5bb
--- /dev/null
+++ b/benches/sumcheck_svo.rs
@@ -0,0 +1,104 @@
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use p3_challenger::DuplexChallenger;
+use p3_field::extension::BinomialExtensionField;
+use p3_koala_bear::{KoalaBear, Poseidon2KoalaBear};
+use rand::{Rng, SeedableRng, rngs::StdRng};
+use std::hint::black_box;
+use whir::{
+    fiat_shamir::{domain_separator::DomainSeparator, prover::ProverState},
+    poly::{evals::EvaluationsList, multilinear::MultilinearPoint},
+    sumcheck::sumcheck_single::SumcheckSingle,
+    whir::statement::{Statement, point::ConstraintPoint},
+};
+use whir_p3 as whir;
+
+type F = KoalaBear;
+type EF = BinomialExtensionField<F, 8>;
+type Poseidon16 = Poseidon2KoalaBear<16>;
+type MyChallenger = DuplexChallenger<F, Poseidon16, 16, 8>;
+
+const NUM_CONSTRAINTS: usize = 1;
+const FOLDING_FACTOR: usize = 5;
+const POW_BITS: usize = 0;
+
+fn setup_prover() -> ProverState<F, EF, MyChallenger> {
+    let mut rng = StdRng::seed_from_u64(0);
+    let poseidon = Poseidon16::new_from_rng_128(&mut rng);
+    let challenger = MyChallenger::new(poseidon);
+    DomainSeparator::new(vec![]).to_prover_state(challenger)
+}
+
+fn generate_poly(num_vars: usize) -> EvaluationsList<F> {
+    let mut rng = StdRng::seed_from_u64(1 + num_vars as u64);
+    EvaluationsList::new((0..1 << num_vars).map(|_| rng.random()).collect())
+}
+
+fn generate_statement(
+    num_vars: usize,
+    poly: &EvaluationsList<F>,
+    num_constraints: usize,
+) -> Statement<EF> {
+    let mut rng = StdRng::seed_from_u64(42 + num_vars as u64);
+    let mut statement = Statement::new(num_vars);
+    for _ in 0..num_constraints {
+        let point = MultilinearPoint::rand(&mut rng, num_vars);
+        let eval = poly.evaluate(&point);
+        statement.add_constraint(ConstraintPoint::new(point), eval);
+    }
+    statement
+}
+
+fn bench_sumcheck_prover_svo(c: &mut Criterion) {
+    let mut group = c.benchmark_group("SumcheckProver");
+    group.sample_size(30);
+
+    for &num_vars in &[16, 18, 20] {
+        let poly = generate_poly(num_vars);
+        let statement = generate_statement(num_vars, &poly, NUM_CONSTRAINTS);
+
+        group.bench_with_input(
+            BenchmarkId::new("Classic", num_vars),
+            &num_vars,
+            |b, &_num_vars| {
+                b.iter(|| {
+                    let mut prover = setup_prover();
+                    let combination_randomness: EF = prover.sample();
+                    let result = SumcheckSingle::from_base_evals(
+                        &poly,
+                        &statement,
+                        combination_randomness,
+                        &mut prover,
+                        FOLDING_FACTOR,
+                        POW_BITS,
+                    );
+                    black_box(result);
+                });
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("SVO", num_vars),
+            &num_vars,
+            |b, &_num_vars| {
+                b.iter(|| {
+                    let mut prover = setup_prover();
+                    let combination_randomness: EF = prover.sample();
+                    let result = SumcheckSingle::from_base_evals_svo(
+                        &poly,
+                        &statement,
+                        combination_randomness,
+                        &mut prover,
+                        FOLDING_FACTOR,
+                        POW_BITS,
+                    );
+                    black_box(result);
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, bench_sumcheck_prover_svo);
+criterion_main!(benches);
diff --git a/src/sumcheck/small_value_utils.rs b/src/sumcheck/small_value_utils.rs
index f2392a52..ada27ee4 100644
--- a/src/sumcheck/small_value_utils.rs
+++ b/src/sumcheck/small_value_utils.rs
@@ -1,4 +1,5 @@
 use p3_field::Field;
+use std::ops::Add;
 
 pub const NUM_OF_ROUNDS: usize = 3;
 
@@ -49,6 +50,21 @@ where
         &self.accumulators[round]
     }
 }
+
+impl<F: Field> Add for Accumulators<F> {
+    type Output = Self;
+
+    fn add(mut self, other: Self) -> Self {
+        for i in 0..NUM_OF_ROUNDS {
+            // Element-wise sum of round `i`'s accumulators (NUM_OF_ROUNDS is 3).
+            for j in 0..self.accumulators[i].len() {
+                self.accumulators[i][j] += other.accumulators[i][j];
+            }
+        }
+        self
+    }
+}
+
 // For round i, RoundAccumulators has all the accumulators of the form A_i(u, v).
 #[derive(Debug, Clone, Eq, PartialEq)]
 pub struct RoundAccumlators<F: Field> {
@@ -129,7 +145,7 @@ pub fn idx4_v2(index_beta: usize) -> [Option<usize>; 3] {
 // Implement Procedure 6 (Page 34).
 // Fijado x'' en {0, 1}^{l-3}, dadas las evaluaciones del multilineal q(x1, x2, x3) = p(x1, x2, x3, x'') en el booleano devuelve las
 // evaluaciones de q en beta para todo beta in {0, 1, inf}^3.
-pub fn compute_p_beta<F: Field>(current_evals: Vec<F>) -> Vec<F> {
+pub fn compute_p_beta<F: Field>(current_evals: &[F; 8]) -> Vec<F> {
     let mut next_evals = vec![F::ZERO; 27];
 
     next_evals[0] = current_evals[0]; // 000
diff --git a/src/sumcheck/sumcheck_small_value.rs b/src/sumcheck/sumcheck_small_value.rs
index 2ef1f019..95479c26 100644
--- a/src/sumcheck/sumcheck_small_value.rs
+++ b/src/sumcheck/sumcheck_small_value.rs
@@ -22,22 +22,26 @@ pub fn compute_accumulators<F: Field>(
     // For x'' in {0 .. 2^{l - 3}}:
     for x in 0..1 << (l - NUM_OF_ROUNDS) {
         // We compute p_1(beta, x'') for all beta in {0, 1, inf}^3
-        let current_evals_1: Vec<F> = poly_1
+        let current_evals_1_array: [F; 8] = poly_1
             .iter()
             .skip(x)
             .step_by(1 << (l - NUM_OF_ROUNDS))
             .cloned()
-            .collect();
-        let evals_1 = compute_p_beta(current_evals_1);
+            .collect::<Vec<F>>()
+            .try_into()
+            .unwrap();
+        let evals_1 = compute_p_beta(&current_evals_1_array);
 
         // We compute p_2(beta, x'') for all beta in {0, 1, inf}^3
-        let current_evals_2: Vec<F> = poly_2
+        let current_evals_2_array: [F; 8] = poly_2
             .iter()
             .skip(x)
             .step_by(1 << (l - NUM_OF_ROUNDS))
             .cloned()
-            .collect();
-        let evals_2 = compute_p_beta(current_evals_2);
+            .collect::<Vec<F>>()
+            .try_into()
+            .unwrap();
+        let evals_2 = compute_p_beta(&current_evals_2_array);
 
         // For each beta in {0, 1, inf}^3:
         // (We have 27 = 3 ^ NUM_OF_ROUNDS number of betas)
diff --git a/src/sumcheck/sumcheck_small_value_eq.rs b/src/sumcheck/sumcheck_small_value_eq.rs
index 79082c8d..c50c77e9 100644
--- a/src/sumcheck/sumcheck_small_value_eq.rs
+++ b/src/sumcheck/sumcheck_small_value_eq.rs
@@ -10,6 +10,7 @@ use p3_challenger::{FieldChallenger, GrindingChallenger};
 use p3_field::{ExtensionField, Field};
 
 use super::sumcheck_polynomial::SumcheckPolynomial;
+use p3_maybe_rayon::prelude::*;
 use p3_multilinear_util::eq::eval_eq;
 
 // WE ASSUME THE NUMBER OF ROUNDS WE ARE DOING WITH SMALL VALUES IS 3
@@ -33,6 +34,8 @@ fn precompute_e_out<F: Field>(w: &MultilinearPoint<F>) -> [Vec<F>; NUM_OF_ROUNDS
     })
 }
 
+//
+
 // Procedure 9. Page 37.
 fn compute_accumulators_eq<F: Field, EF: ExtensionField<F>>(
     poly: &EvaluationsList<F>,
@@ -42,63 +45,178 @@ fn compute_accumulators_eq<F: Field, EF: ExtensionField<F>>(
     let l = poly.num_variables();
     let half_l = l / 2;
 
-    let mut accumulators = Accumulators::<EF>::new_empty();
-
     let x_out_num_variables = half_l - NUM_OF_ROUNDS + (l % 2);
     debug_assert_eq!(half_l + x_out_num_variables, l - NUM_OF_ROUNDS);
 
-    for x_out in 0..1 << (x_out_num_variables) {
-        let mut temp_accumulators: Vec<EF> = vec![EF::ZERO; 27];
+    // Parallelize the outer loop over `x_out`
+    (0..1 << x_out_num_variables)
+        .into_par_iter()
+        .map(|x_out| {
+            // Each `x_out` item builds its own set of local accumulators, so the
+            // closure shares no mutable state and needs no locks.
+            let mut local_accumulators = Accumulators::<EF>::new_empty();
+
+            // This inner part remains the same, but operates on local variables.
+            let mut temp_accumulators: Vec<EF> = vec![EF::ZERO; 27];
+
+            for x_in in 0..1 << half_l {
+                let start_index = (x_in << x_out_num_variables) | x_out;
+                let step_size = 1 << (l - NUM_OF_ROUNDS);
+
+                let current_evals_array: [F; 8] = poly
+                    .iter()
+                    .skip(start_index)
+                    .step_by(step_size)
+                    .copied()
+                    .collect::<Vec<F>>()
+                    .try_into()
+                    .unwrap();
+
+                let p_evals = compute_p_beta(&current_evals_array);
+                let e_in_value = e_in[x_in];
+
+                for (accumulator, &p_eval) in temp_accumulators.iter_mut().zip(&p_evals) {
+                    *accumulator += e_in_value * p_eval;
+                }
+            }
 
-        for x_in in 0..1 << half_l {
-            // We collect the evaluations of p(X_0, X_1, X_2, x_in, x_out) where
-            // x_in and x_out are fixed and X_0, X_1, X_2 are variables.
-            let start_index = (x_in << x_out_num_variables) | x_out;
-            let step_size = 1 << (l - NUM_OF_ROUNDS);
+            // hardcoded accumulator distribution
+            // This now populates the `local_accumulators` for this specific `x_out`.
+            let temp_acc = &temp_accumulators;
+            let e_out_2 = e_out[2][x_out];
 
-            let current_evals: Vec<F> = poly
-                .iter()
-                .skip(start_index)
-                .step_by(step_size)
-                .copied()
+            // Pre-fetch e_out values to avoid repeated indexing
+            let e_out_0: Vec<EF> = (0..4)
+                .map(|y| e_out[0][(y << x_out_num_variables) | x_out])
+                .collect();
+            let e_out_1: Vec<EF> = (0..2)
+                .map(|y| e_out[1][(y << x_out_num_variables) | x_out])
                 .collect();
 
-            // We compute p(beta, x_in, x_out) for all beta in {0, 1, inf}^3
-            let p_evals = compute_p_beta(current_evals);
-            let e_in_value = e_in[x_in];
+            // Now we do not use the idx4 function since we are directly computing the indices.
 
-            for (accumulator, &p_eval) in temp_accumulators.iter_mut().zip(&p_evals) {
-                *accumulator += e_in_value * p_eval;
-            }
-        }
+            // beta_index = 0; b=(0,0,0);
+            local_accumulators.accumulate(0, 0, e_out_0[0] * temp_acc[0]); // y=0<<1|0=0
+            local_accumulators.accumulate(1, 0, e_out_1[0] * temp_acc[0]); // y=0
+            local_accumulators.accumulate(2, 0, e_out_2 * temp_acc[0]);
 
-        // TODO: This can be hardcoded for better performance.
-        for beta_index in 0..27 {
-            let [index_1, index_2, index_3] = idx4_v2(beta_index);
-            let [_, beta_2, beta_3] = to_base_three_coeff(beta_index);
-            let temp_acc = temp_accumulators[beta_index];
-
-            // Accumulator 1: uses y = beta_2 || beta_3
-            if let Some(index) = index_1 {
-                let y = beta_2 << 1 | beta_3;
-                let e_out_value = e_out[0][(y << x_out_num_variables) | x_out];
-                accumulators.accumulate(0, index, e_out_value * temp_acc);
-            }
+            // beta_index = 1; b=(0,0,1);
+            local_accumulators.accumulate(0, 0, e_out_0[1] * temp_acc[1]); // y=0<<1|1=1
+            local_accumulators.accumulate(1, 0, e_out_1[1] * temp_acc[1]); // y=1
+            local_accumulators.accumulate(2, 1, e_out_2 * temp_acc[1]);
 
-            // Accumulator 2: uses y = beta_3
-            if let Some(index) = index_2 {
-                let y = beta_3;
-                let e_out_value = e_out[1][(y << x_out_num_variables) | x_out];
-                accumulators.accumulate(1, index, e_out_value * temp_acc);
-            }
+            // beta_index = 2; b=(0,0,2);
+            local_accumulators.accumulate(2, 2, e_out_2 * temp_acc[2]);
 
-            // Accumulator 3: uses x_out directly
-            if let Some(index) = index_3 {
-                accumulators.accumulate(2, index, e_out[2][x_out] * temp_acc);
-            }
-        }
-    }
-    accumulators
+            // beta_index = 3; b=(0,1,0);
+            local_accumulators.accumulate(0, 0, e_out_0[2] * temp_acc[3]); // y=1<<1|0=2
+            local_accumulators.accumulate(1, 1, e_out_1[0] * temp_acc[3]); // y=0
+            local_accumulators.accumulate(2, 3, e_out_2 * temp_acc[3]);
+
+            // beta_index = 4; b=(0,1,1);
+            local_accumulators.accumulate(0, 0, e_out_0[3] * temp_acc[4]); // y=1<<1|1=3
+            local_accumulators.accumulate(1, 1, e_out_1[1] * temp_acc[4]); // y=1
+            local_accumulators.accumulate(2, 4, e_out_2 * temp_acc[4]);
+
+            // beta_index = 5; b=(0,1,2);
+            local_accumulators.accumulate(2, 5, e_out_2 * temp_acc[5]);
+
+            // beta_index = 6; b=(0,2,0);
+            local_accumulators.accumulate(1, 2, e_out_1[0] * temp_acc[6]); // y=0
+            local_accumulators.accumulate(2, 6, e_out_2 * temp_acc[6]);
+
+            // beta_index = 7; b=(0,2,1);
+            local_accumulators.accumulate(1, 2, e_out_1[1] * temp_acc[7]); // y=1
+            local_accumulators.accumulate(2, 7, e_out_2 * temp_acc[7]);
+
+            // beta_index = 8; b=(0,2,2);
+            local_accumulators.accumulate(2, 8, e_out_2 * temp_acc[8]);
+
+            // beta_index = 9; b=(1,0,0);
+            local_accumulators.accumulate(0, 1, e_out_0[0] * temp_acc[9]); // y=0<<1|0=0
+            local_accumulators.accumulate(1, 3, e_out_1[0] * temp_acc[9]); // y=0
+            local_accumulators.accumulate(2, 9, e_out_2 * temp_acc[9]);
+
+            // beta_index = 10; b=(1,0,1);
+            local_accumulators.accumulate(0, 1, e_out_0[1] * temp_acc[10]); // y=0<<1|1=1
+            local_accumulators.accumulate(1, 3, e_out_1[1] * temp_acc[10]); // y=1
+            local_accumulators.accumulate(2, 10, e_out_2 * temp_acc[10]);
+
+            // beta_index = 11; b=(1,0,2);
+            local_accumulators.accumulate(2, 11, e_out_2 * temp_acc[11]);
+
+            // beta_index = 12; b=(1,1,0);
+            local_accumulators.accumulate(0, 1, e_out_0[2] * temp_acc[12]); // y=1<<1|0=2
+            local_accumulators.accumulate(1, 4, e_out_1[0] * temp_acc[12]); // y=0
+            local_accumulators.accumulate(2, 12, e_out_2 * temp_acc[12]);
+
+            // beta_index = 13; b=(1,1,1);
+            local_accumulators.accumulate(0, 1, e_out_0[3] * temp_acc[13]); // y=1<<1|1=3
+            local_accumulators.accumulate(1, 4, e_out_1[1] * temp_acc[13]); // y=1
+            local_accumulators.accumulate(2, 13, e_out_2 * temp_acc[13]);
+
+            // beta_index = 14; b=(1,1,2);
+            local_accumulators.accumulate(2, 14, e_out_2 * temp_acc[14]);
+
+            // beta_index = 15; b=(1,2,0);
+            local_accumulators.accumulate(1, 5, e_out_1[0] * temp_acc[15]); // y=0
+            local_accumulators.accumulate(2, 15, e_out_2 * temp_acc[15]);
+
+            // beta_index = 16; b=(1,2,1);
+            local_accumulators.accumulate(1, 5, e_out_1[1] * temp_acc[16]); // y=1
+            local_accumulators.accumulate(2, 16, e_out_2 * temp_acc[16]);
+
+            // beta_index = 17; b=(1,2,2);
+            local_accumulators.accumulate(2, 17, e_out_2 * temp_acc[17]);
+
+            // beta_index = 18; b=(2,0,0);
+            local_accumulators.accumulate(1, 6, e_out_1[0] * temp_acc[18]); // y=0
+            local_accumulators.accumulate(2, 18, e_out_2 * temp_acc[18]);
+
+            // beta_index = 19; b=(2,0,1);
+            local_accumulators.accumulate(1, 6, e_out_1[1] * temp_acc[19]); // y=1
+            local_accumulators.accumulate(2, 19, e_out_2 * temp_acc[19]);
+
+            // beta_index = 20; b=(2,0,2);
+            local_accumulators.accumulate(2, 20, e_out_2 * temp_acc[20]);
+
+            // beta_index = 21; b=(2,1,0);
+            local_accumulators.accumulate(1, 7, e_out_1[0] * temp_acc[21]); // y=0
+            local_accumulators.accumulate(2, 21, e_out_2 * temp_acc[21]);
+
+            // beta_index = 22; b=(2,1,1);
+            local_accumulators.accumulate(1, 7, e_out_1[1] * temp_acc[22]); // y=1
+            local_accumulators.accumulate(2, 22, e_out_2 * temp_acc[22]);
+
+            // beta_index = 23; b=(2,1,2);
+            local_accumulators.accumulate(2, 23, e_out_2 * temp_acc[23]);
+
+            // beta_index = 24; b=(2,2,0);
+            local_accumulators.accumulate(1, 8, e_out_1[0] * temp_acc[24]); // y=0
+            local_accumulators.accumulate(2, 24, e_out_2 * temp_acc[24]);
+
+            // beta_index = 25; b=(2,2,1);
+            local_accumulators.accumulate(1, 8, e_out_1[1] * temp_acc[25]); // y=1
+            local_accumulators.accumulate(2, 25, e_out_2 * temp_acc[25]);
+
+            // beta_index = 26; b=(2,2,2);
+            local_accumulators.accumulate(2, 26, e_out_2 * temp_acc[26]);
+
+            // Return the local accumulators computed for this `x_out`.
+            local_accumulators
+        })
+        // Reduce the per-`x_out` results element-wise into a single Accumulators struct.
+        .reduce(
+            || Accumulators::<EF>::new_empty(),
+            |mut a, b| {
+                for (round_a, round_b) in a.accumulators.iter_mut().zip(b.accumulators.iter()) {
+                    for (acc_a, acc_b) in round_a.iter_mut().zip(round_b.iter()) {
+                        *acc_a += *acc_b;
+                    }
+                }
+                a
+            },
+        )
 }
 
 pub fn eval_eq_in_hypercube<F: Field>(point: &Vec<F>) -> Vec<F> {
@@ -133,7 +251,7 @@ pub fn compute_linear_function<F: Field>(w: &[F], r: &[F]) -> [F; 2] {
 
 fn get_evals_from_l_and_t<F: Field>(l: &[F; 2], t: &[F]) -> [F; 2] {
     [
-        t[0] * l[0],          // s(0)
+        t[0] * l[0],                   // s(0)
         (t[1] - t[0]) * (l[1] - l[0]), //s(inf) -> l(inf) = l(1) - l(0)
     ]
 }
@@ -176,8 +294,10 @@ where
     // 4. Receive the challenge r_1 from the verifier.
     let r_1: EF = prover_state.sample();
 
-    let eval_1 = *sum - round_poly_evals[0] ;
-    *sum = round_poly_evals[1] * r_1.square() + (eval_1 - round_poly_evals[0] - round_poly_evals[1]) * r_1 + round_poly_evals[0];
+    let eval_1 = *sum - round_poly_evals[0];
+    *sum = round_poly_evals[1] * r_1.square()
+        + (eval_1 - round_poly_evals[0] - round_poly_evals[1]) * r_1
+        + round_poly_evals[0];
 
     // 5. Compte R_2 = [L_0(r_1), L_1(r_1), L_inf(r_1)]
     // L_0 (x) = 1 - x
@@ -231,7 +351,9 @@ where
     ];
 
     let eval_1 = *sum - round_poly_evals[0];
-    *sum = round_poly_evals[1] * r_2.square() + (eval_1 - round_poly_evals[0] - round_poly_evals[1]) * r_2 + round_poly_evals[0];
+    *sum = round_poly_evals[1] * r_2.square()
+        + (eval_1 - round_poly_evals[0] - round_poly_evals[1]) * r_2
+        + round_poly_evals[0];
 
     // Round 3
 
@@ -265,11 +387,12 @@ where
     // TODO: En realidad no hace falta mandar S_3(1) porque se dedecue usando S_3(0).
     prover_state.add_extension_scalars(&round_poly_evals);
 
-
     let r_3: EF = prover_state.sample();
 
     let eval_1 = *sum - round_poly_evals[0];
-    *sum = round_poly_evals[1] * r_3.square() + (eval_1 - round_poly_evals[0] - round_poly_evals[1]) * r_3 + round_poly_evals[0];
+    *sum = round_poly_evals[1] * r_3.square()
+        + (eval_1 - round_poly_evals[0] - round_poly_evals[1]) * r_3
+        + round_poly_evals[0];
 
     (r_1, r_2, r_3)
 }
@@ -1055,11 +1178,9 @@ mod tests {
         // We compute l_2(0) and l_2(inf)
         let linear_2_evals = compute_linear_function(&w.0[..2], &[r_1]);
 
-
         // We compute S_2(0) and S_2(inf)
         let round_poly_evals = get_evals_from_l_and_t(&linear_2_evals, &t_2_evals);
 
-
         println!("ROUND 2 EQ: {:?}", round_poly_evals);
 
         // 5. Compute R_3 = [L_00(r_1, r_2), L_01(r_1, r_2), ..., L_{inf inf}(r_1, r_2)]

From 690a2fc998f09086c177f19e0a647cae450ce50a Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Wed, 8 Oct 2025 18:23:02 -0300
Subject: [PATCH 2/5] reuse buffer in accumulator calculation

---
 benches/sumcheck_svo.rs                 | 70 +++++++++++++++----------
 src/sumcheck/small_value_utils.rs       |  4 +-
 src/sumcheck/sumcheck_small_value.rs    | 12 +++--
 src/sumcheck/sumcheck_small_value_eq.rs |  5 +-
 4 files changed, 55 insertions(+), 36 deletions(-)

diff --git a/benches/sumcheck_svo.rs b/benches/sumcheck_svo.rs
index cefda5bb..79edb72b 100644
--- a/benches/sumcheck_svo.rs
+++ b/benches/sumcheck_svo.rs
@@ -1,9 +1,10 @@
-use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use criterion::{BatchSize, BenchmarkId, Criterion, criterion_group, criterion_main};
 use p3_challenger::DuplexChallenger;
 use p3_field::extension::BinomialExtensionField;
 use p3_koala_bear::{KoalaBear, Poseidon2KoalaBear};
 use rand::{Rng, SeedableRng, rngs::StdRng};
 use std::hint::black_box;
+use std::time::Duration;
 use whir::{
     fiat_shamir::{domain_separator::DomainSeparator, prover::ProverState},
     poly::{evals::EvaluationsList, multilinear::MultilinearPoint},
@@ -50,7 +51,8 @@ fn generate_statement(
 
 fn bench_sumcheck_prover_svo(c: &mut Criterion) {
     let mut group = c.benchmark_group("SumcheckProver");
-    group.sample_size(30);
+    group.sample_size(200);
+    group.measurement_time(Duration::from_secs(5));
 
     for &num_vars in &[16, 18, 20] {
         let poly = generate_poly(num_vars);
@@ -60,19 +62,25 @@ fn bench_sumcheck_prover_svo(c: &mut Criterion) {
             BenchmarkId::new("Classic", num_vars),
             &num_vars,
             |b, &_num_vars| {
-                b.iter(|| {
-                    let mut prover = setup_prover();
-                    let combination_randomness: EF = prover.sample();
-                    let result = SumcheckSingle::from_base_evals(
-                        &poly,
-                        &statement,
-                        combination_randomness,
-                        &mut prover,
-                        FOLDING_FACTOR,
-                        POW_BITS,
-                    );
-                    black_box(result);
-                });
+                b.iter_batched(
+                    || {
+                        let mut prover = setup_prover();
+                        let combination_randomness: EF = prover.sample();
+                        (prover, combination_randomness)
+                    },
+                    |(mut prover, combination_randomness)| {
+                        let result = SumcheckSingle::from_base_evals(
+                            &poly,
+                            &statement,
+                            combination_randomness,
+                            &mut prover,
+                            FOLDING_FACTOR,
+                            POW_BITS,
+                        );
+                        black_box(result);
+                    },
+                    BatchSize::SmallInput,
+                );
             },
         );
 
@@ -80,19 +88,25 @@ fn bench_sumcheck_prover_svo(c: &mut Criterion) {
             BenchmarkId::new("SVO", num_vars),
             &num_vars,
             |b, &_num_vars| {
-                b.iter(|| {
-                    let mut prover = setup_prover();
-                    let combination_randomness: EF = prover.sample();
-                    let result = SumcheckSingle::from_base_evals_svo(
-                        &poly,
-                        &statement,
-                        combination_randomness,
-                        &mut prover,
-                        FOLDING_FACTOR,
-                        POW_BITS,
-                    );
-                    black_box(result);
-                });
+                b.iter_batched(
+                    || {
+                        let mut prover = setup_prover();
+                        let combination_randomness: EF = prover.sample();
+                        (prover, combination_randomness)
+                    },
+                    |(mut prover, combination_randomness)| {
+                        let result = SumcheckSingle::from_base_evals_svo(
+                            &poly,
+                            &statement,
+                            combination_randomness,
+                            &mut prover,
+                            FOLDING_FACTOR,
+                            POW_BITS,
+                        );
+                        black_box(result);
+                    },
+                    BatchSize::SmallInput,
+                );
             },
         );
     }
diff --git a/src/sumcheck/small_value_utils.rs b/src/sumcheck/small_value_utils.rs
index ada27ee4..3531cbb2 100644
--- a/src/sumcheck/small_value_utils.rs
+++ b/src/sumcheck/small_value_utils.rs
@@ -145,7 +145,7 @@ pub fn idx4_v2(index_beta: usize) -> [Option<usize>; 3] {
 // Implement Procedure 6 (Page 34).
 // Fijado x'' en {0, 1}^{l-3}, dadas las evaluaciones del multilineal q(x1, x2, x3) = p(x1, x2, x3, x'') en el booleano devuelve las
 // evaluaciones de q en beta para todo beta in {0, 1, inf}^3.
-pub fn compute_p_beta<F: Field>(current_evals: &[F; 8]) -> Vec<F> {
+pub fn compute_p_beta<F: Field>(current_evals: &[F; 8], next_evals: &mut [F; 27]) {
     let mut next_evals = vec![F::ZERO; 27];
 
     next_evals[0] = current_evals[0]; // 000
@@ -181,6 +181,4 @@ pub fn compute_p_beta<F: Field>(current_evals: &[F; 8]) -> Vec<F> {
     next_evals[20] = next_evals[19] - next_evals[18]; // 202
     next_evals[23] = next_evals[22] - next_evals[21]; // 212
     next_evals[26] = next_evals[25] - next_evals[24]; // 222
-
-    next_evals
 }
diff --git a/src/sumcheck/sumcheck_small_value.rs b/src/sumcheck/sumcheck_small_value.rs
index 95479c26..1d69aec1 100644
--- a/src/sumcheck/sumcheck_small_value.rs
+++ b/src/sumcheck/sumcheck_small_value.rs
@@ -19,6 +19,9 @@ pub fn compute_accumulators<F: Field>(
     let mut round_2_accumulator = RoundAccumlators::<F>::new_empty(2);
     let mut round_3_accumulator = RoundAccumlators::<F>::new_empty(3);
 
+    let mut evals_1_buffer = [F::ZERO; 27];
+    let mut evals_2_buffer = [F::ZERO; 27];
+
     // For x'' in {0 .. 2^{l - 3}}:
     for x in 0..1 << (l - NUM_OF_ROUNDS) {
         // We compute p_1(beta, x'') for all beta in {0, 1, inf}^3
@@ -30,7 +33,7 @@ pub fn compute_accumulators<F: Field>(
             .collect::<Vec<F>>()
             .try_into()
             .unwrap();
-        let evals_1 = compute_p_beta(&current_evals_1_array);
+        compute_p_beta(&current_evals_1_array, &mut evals_1_buffer);
 
         // We compute p_2(beta, x'') for all beta in {0, 1, inf}^3
         let current_evals_2_array: [F; 8] = poly_2
@@ -41,7 +44,7 @@ pub fn compute_accumulators<F: Field>(
             .collect::<Vec<F>>()
             .try_into()
             .unwrap();
-        let evals_2 = compute_p_beta(&current_evals_2_array);
+        compute_p_beta(&current_evals_2_array, &mut evals_2_buffer);
 
         // For each beta in {0, 1, inf}^3:
         // (We have 27 = 3 ^ NUM_OF_ROUNDS number of betas)
@@ -58,7 +61,10 @@ pub fn compute_accumulators<F: Field>(
                 (index_accumulator_3, &mut round_3_accumulator),
             ] {
                 if let Some(index) = index_opt {
-                    acc.accumulate_eval(evals_1[beta_index] * evals_2[beta_index], index);
+                    acc.accumulate_eval(
+                        evals_1_buffer[beta_index] * evals_2_buffer[beta_index],
+                        index,
+                    );
                 }
             }
         }
diff --git a/src/sumcheck/sumcheck_small_value_eq.rs b/src/sumcheck/sumcheck_small_value_eq.rs
index c50c77e9..b5328158 100644
--- a/src/sumcheck/sumcheck_small_value_eq.rs
+++ b/src/sumcheck/sumcheck_small_value_eq.rs
@@ -58,6 +58,7 @@ fn compute_accumulators_eq<F: Field, EF: ExtensionField<F>>(
 
             // This inner part remains the same, but operates on local variables.
             let mut temp_accumulators: Vec<EF> = vec![EF::ZERO; 27];
+            let mut p_evals_buffer = [F::ZERO; 27];
 
             for x_in in 0..1 << half_l {
                 let start_index = (x_in << x_out_num_variables) | x_out;
@@ -72,10 +73,10 @@ fn compute_accumulators_eq<F: Field, EF: ExtensionField<F>>(
                     .try_into()
                     .unwrap();
 
-                let p_evals = compute_p_beta(&current_evals_array);
+                compute_p_beta(&current_evals_array, &mut p_evals_buffer);
                 let e_in_value = e_in[x_in];
 
-                for (accumulator, &p_eval) in temp_accumulators.iter_mut().zip(&p_evals) {
+                for (accumulator, &p_eval) in temp_accumulators.iter_mut().zip(&p_evals_buffer) {
                     *accumulator += e_in_value * p_eval;
                 }
             }

From 2b5113e0a769bfa9641f65da93ef591962cf6217 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Thu, 9 Oct 2025 11:27:36 -0300
Subject: [PATCH 3/5] Revert "reuse buffer in accumulator calculation"

This reverts commit 690a2fc998f09086c177f19e0a647cae450ce50a.
---
 benches/sumcheck_svo.rs                 | 70 ++++++++++---------------
 src/sumcheck/small_value_utils.rs       |  4 +-
 src/sumcheck/sumcheck_small_value.rs    | 12 ++---
 src/sumcheck/sumcheck_small_value_eq.rs |  5 +-
 4 files changed, 36 insertions(+), 55 deletions(-)

diff --git a/benches/sumcheck_svo.rs b/benches/sumcheck_svo.rs
index 79edb72b..cefda5bb 100644
--- a/benches/sumcheck_svo.rs
+++ b/benches/sumcheck_svo.rs
@@ -1,10 +1,9 @@
-use criterion::{BatchSize, BenchmarkId, Criterion, criterion_group, criterion_main};
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
 use p3_challenger::DuplexChallenger;
 use p3_field::extension::BinomialExtensionField;
 use p3_koala_bear::{KoalaBear, Poseidon2KoalaBear};
 use rand::{Rng, SeedableRng, rngs::StdRng};
 use std::hint::black_box;
-use std::time::Duration;
 use whir::{
     fiat_shamir::{domain_separator::DomainSeparator, prover::ProverState},
     poly::{evals::EvaluationsList, multilinear::MultilinearPoint},
@@ -51,8 +50,7 @@ fn generate_statement(
 
 fn bench_sumcheck_prover_svo(c: &mut Criterion) {
     let mut group = c.benchmark_group("SumcheckProver");
-    group.sample_size(200);
-    group.measurement_time(Duration::from_secs(5));
+    group.sample_size(30);
 
     for &num_vars in &[16, 18, 20] {
         let poly = generate_poly(num_vars);
@@ -62,25 +60,19 @@ fn bench_sumcheck_prover_svo(c: &mut Criterion) {
             BenchmarkId::new("Classic", num_vars),
             &num_vars,
             |b, &_num_vars| {
-                b.iter_batched(
-                    || {
-                        let mut prover = setup_prover();
-                        let combination_randomness: EF = prover.sample();
-                        (prover, combination_randomness)
-                    },
-                    |(mut prover, combination_randomness)| {
-                        let result = SumcheckSingle::from_base_evals(
-                            &poly,
-                            &statement,
-                            combination_randomness,
-                            &mut prover,
-                            FOLDING_FACTOR,
-                            POW_BITS,
-                        );
-                        black_box(result);
-                    },
-                    BatchSize::SmallInput,
-                );
+                b.iter(|| {
+                    let mut prover = setup_prover();
+                    let combination_randomness: EF = prover.sample();
+                    let result = SumcheckSingle::from_base_evals(
+                        &poly,
+                        &statement,
+                        combination_randomness,
+                        &mut prover,
+                        FOLDING_FACTOR,
+                        POW_BITS,
+                    );
+                    black_box(result);
+                });
             },
         );
 
@@ -88,25 +80,19 @@ fn bench_sumcheck_prover_svo(c: &mut Criterion) {
             BenchmarkId::new("SVO", num_vars),
             &num_vars,
             |b, &_num_vars| {
-                b.iter_batched(
-                    || {
-                        let mut prover = setup_prover();
-                        let combination_randomness: EF = prover.sample();
-                        (prover, combination_randomness)
-                    },
-                    |(mut prover, combination_randomness)| {
-                        let result = SumcheckSingle::from_base_evals_svo(
-                            &poly,
-                            &statement,
-                            combination_randomness,
-                            &mut prover,
-                            FOLDING_FACTOR,
-                            POW_BITS,
-                        );
-                        black_box(result);
-                    },
-                    BatchSize::SmallInput,
-                );
+                b.iter(|| {
+                    let mut prover = setup_prover();
+                    let combination_randomness: EF = prover.sample();
+                    let result = SumcheckSingle::from_base_evals_svo(
+                        &poly,
+                        &statement,
+                        combination_randomness,
+                        &mut prover,
+                        FOLDING_FACTOR,
+                        POW_BITS,
+                    );
+                    black_box(result);
+                });
             },
         );
     }
diff --git a/src/sumcheck/small_value_utils.rs b/src/sumcheck/small_value_utils.rs
index 3531cbb2..ada27ee4 100644
--- a/src/sumcheck/small_value_utils.rs
+++ b/src/sumcheck/small_value_utils.rs
@@ -145,7 +145,7 @@ pub fn idx4_v2(index_beta: usize) -> [Option<usize>; 3] {
 // Implement Procedure 6 (Page 34).
 // Fijado x'' en {0, 1}^{l-3}, dadas las evaluaciones del multilineal q(x1, x2, x3) = p(x1, x2, x3, x'') en el booleano devuelve las
 // evaluaciones de q en beta para todo beta in {0, 1, inf}^3.
-pub fn compute_p_beta<F: Field>(current_evals: &[F; 8], next_evals: &mut [F; 27]) {
+pub fn compute_p_beta<F: Field>(current_evals: &[F; 8]) -> Vec<F> {
     let mut next_evals = vec![F::ZERO; 27];
 
     next_evals[0] = current_evals[0]; // 000
@@ -181,4 +181,6 @@ pub fn compute_p_beta<F: Field>(current_evals: &[F; 8], next_evals: &mut [F; 27]
     next_evals[20] = next_evals[19] - next_evals[18]; // 202
     next_evals[23] = next_evals[22] - next_evals[21]; // 212
     next_evals[26] = next_evals[25] - next_evals[24]; // 222
+
+    next_evals
 }
diff --git a/src/sumcheck/sumcheck_small_value.rs b/src/sumcheck/sumcheck_small_value.rs
index 1d69aec1..95479c26 100644
--- a/src/sumcheck/sumcheck_small_value.rs
+++ b/src/sumcheck/sumcheck_small_value.rs
@@ -19,9 +19,6 @@ pub fn compute_accumulators<F: Field>(
     let mut round_2_accumulator = RoundAccumlators::<F>::new_empty(2);
     let mut round_3_accumulator = RoundAccumlators::<F>::new_empty(3);
 
-    let mut evals_1_buffer = [F::ZERO; 27];
-    let mut evals_2_buffer = [F::ZERO; 27];
-
     // For x'' in {0 .. 2^{l - 3}}:
     for x in 0..1 << (l - NUM_OF_ROUNDS) {
         // We compute p_1(beta, x'') for all beta in {0, 1, inf}^3
@@ -33,7 +30,7 @@ pub fn compute_accumulators<F: Field>(
             .collect::<Vec<F>>()
             .try_into()
             .unwrap();
-        compute_p_beta(&current_evals_1_array, &mut evals_1_buffer);
+        let evals_1 = compute_p_beta(&current_evals_1_array);
 
         // We compute p_2(beta, x'') for all beta in {0, 1, inf}^3
         let current_evals_2_array: [F; 8] = poly_2
@@ -44,7 +41,7 @@ pub fn compute_accumulators<F: Field>(
             .collect::<Vec<F>>()
             .try_into()
             .unwrap();
-        compute_p_beta(&current_evals_2_array, &mut evals_2_buffer);
+        let evals_2 = compute_p_beta(&current_evals_2_array);
 
         // For each beta in {0, 1, inf}^3:
         // (We have 27 = 3 ^ NUM_OF_ROUNDS number of betas)
@@ -61,10 +58,7 @@ pub fn compute_accumulators<F: Field>(
                 (index_accumulator_3, &mut round_3_accumulator),
             ] {
                 if let Some(index) = index_opt {
-                    acc.accumulate_eval(
-                        evals_1_buffer[beta_index] * evals_2_buffer[beta_index],
-                        index,
-                    );
+                    acc.accumulate_eval(evals_1[beta_index] * evals_2[beta_index], index);
                 }
             }
         }
diff --git a/src/sumcheck/sumcheck_small_value_eq.rs b/src/sumcheck/sumcheck_small_value_eq.rs
index b5328158..c50c77e9 100644
--- a/src/sumcheck/sumcheck_small_value_eq.rs
+++ b/src/sumcheck/sumcheck_small_value_eq.rs
@@ -58,7 +58,6 @@ fn compute_accumulators_eq<F: Field, EF: ExtensionField<F>>(
 
             // This inner part remains the same, but operates on local variables.
             let mut temp_accumulators: Vec<EF> = vec![EF::ZERO; 27];
-            let mut p_evals_buffer = [F::ZERO; 27];
 
             for x_in in 0..1 << half_l {
                 let start_index = (x_in << x_out_num_variables) | x_out;
@@ -73,10 +72,10 @@ fn compute_accumulators_eq<F: Field, EF: ExtensionField<F>>(
                     .try_into()
                     .unwrap();
 
-                compute_p_beta(&current_evals_array, &mut p_evals_buffer);
+                let p_evals = compute_p_beta(&current_evals_array);
                 let e_in_value = e_in[x_in];
 
-                for (accumulator, &p_eval) in temp_accumulators.iter_mut().zip(&p_evals_buffer) {
+                for (accumulator, &p_eval) in temp_accumulators.iter_mut().zip(&p_evals) {
                     *accumulator += e_in_value * p_eval;
                 }
             }

From 65534dad1609677be2f7f8a362bc529ce87b21f6 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Thu, 9 Oct 2025 12:47:48 -0300
Subject: [PATCH 4/5] reuse buffer in accumulator calculation

---
 benches/sumcheck_svo.rs                 |  6 +++---
 src/sumcheck/small_value_utils.rs       |  5 +----
 src/sumcheck/sumcheck_small_value.rs    | 11 ++++++++---
 src/sumcheck/sumcheck_small_value_eq.rs |  5 +++--
 4 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/benches/sumcheck_svo.rs b/benches/sumcheck_svo.rs
index cefda5bb..695a4fee 100644
--- a/benches/sumcheck_svo.rs
+++ b/benches/sumcheck_svo.rs
@@ -4,6 +4,7 @@ use p3_field::extension::BinomialExtensionField;
 use p3_koala_bear::{KoalaBear, Poseidon2KoalaBear};
 use rand::{Rng, SeedableRng, rngs::StdRng};
 use std::hint::black_box;
+use std::time::Duration;
 use whir::{
     fiat_shamir::{domain_separator::DomainSeparator, prover::ProverState},
     poly::{evals::EvaluationsList, multilinear::MultilinearPoint},
@@ -11,7 +12,6 @@ use whir::{
     whir::statement::{Statement, point::ConstraintPoint},
 };
 use whir_p3 as whir;
-
 type F = KoalaBear;
 type EF = BinomialExtensionField<F, 8>;
 type Poseidon16 = Poseidon2KoalaBear<16>;
@@ -50,8 +50,8 @@ fn generate_statement(
 
 fn bench_sumcheck_prover_svo(c: &mut Criterion) {
     let mut group = c.benchmark_group("SumcheckProver");
-    group.sample_size(30);
-
+    group.sample_size(100);
+    group.warm_up_time(Duration::from_secs(10));
     for &num_vars in &[16, 18, 20] {
         let poly = generate_poly(num_vars);
         let statement = generate_statement(num_vars, &poly, NUM_CONSTRAINTS);
diff --git a/src/sumcheck/small_value_utils.rs b/src/sumcheck/small_value_utils.rs
index ada27ee4..f5faa57b 100644
--- a/src/sumcheck/small_value_utils.rs
+++ b/src/sumcheck/small_value_utils.rs
@@ -145,8 +145,7 @@ pub fn idx4_v2(index_beta: usize) -> [Option<usize>; 3] {
 // Implement Procedure 6 (Page 34).
 // Fijado x'' en {0, 1}^{l-3}, dadas las evaluaciones del multilineal q(x1, x2, x3) = p(x1, x2, x3, x'') en el booleano devuelve las
 // evaluaciones de q en beta para todo beta in {0, 1, inf}^3.
-pub fn compute_p_beta<F: Field>(current_evals: &[F; 8]) -> Vec<F> {
-    let mut next_evals = vec![F::ZERO; 27];
+pub fn compute_p_beta<F: Field>(current_evals: &[F; 8], next_evals: &mut [F; 27]) {
 
     next_evals[0] = current_evals[0]; // 000
     next_evals[1] = current_evals[1]; // 001
@@ -181,6 +180,4 @@ pub fn compute_p_beta<F: Field>(current_evals: &[F; 8]) -> Vec<F> {
     next_evals[20] = next_evals[19] - next_evals[18]; // 202
     next_evals[23] = next_evals[22] - next_evals[21]; // 212
     next_evals[26] = next_evals[25] - next_evals[24]; // 222
-
-    next_evals
 }
diff --git a/src/sumcheck/sumcheck_small_value.rs b/src/sumcheck/sumcheck_small_value.rs
index 95479c26..2052622e 100644
--- a/src/sumcheck/sumcheck_small_value.rs
+++ b/src/sumcheck/sumcheck_small_value.rs
@@ -19,6 +19,8 @@ pub fn compute_accumulators<F: Field>(
     let mut round_2_accumulator = RoundAccumlators::<F>::new_empty(2);
     let mut round_3_accumulator = RoundAccumlators::<F>::new_empty(3);
 
+    let mut evals_1_buffer = [F::ZERO; 27];
+    let mut evals_2_buffer = [F::ZERO; 27];
     // For x'' in {0 .. 2^{l - 3}}:
     for x in 0..1 << (l - NUM_OF_ROUNDS) {
         // We compute p_1(beta, x'') for all beta in {0, 1, inf}^3
@@ -30,7 +32,7 @@ pub fn compute_accumulators<F: Field>(
             .collect::<Vec<F>>()
             .try_into()
             .unwrap();
-        let evals_1 = compute_p_beta(&current_evals_1_array);
+        compute_p_beta(&current_evals_1_array, &mut evals_1_buffer);
 
         // We compute p_2(beta, x'') for all beta in {0, 1, inf}^3
         let current_evals_2_array: [F; 8] = poly_2
@@ -41,7 +43,7 @@ pub fn compute_accumulators<F: Field>(
             .collect::<Vec<F>>()
             .try_into()
             .unwrap();
-        let evals_2 = compute_p_beta(&current_evals_2_array);
+        compute_p_beta(&current_evals_2_array, &mut evals_2_buffer);
 
         // For each beta in {0, 1, inf}^3:
         // (We have 27 = 3 ^ NUM_OF_ROUNDS number of betas)
@@ -58,7 +60,10 @@ pub fn compute_accumulators<F: Field>(
                 (index_accumulator_3, &mut round_3_accumulator),
             ] {
                 if let Some(index) = index_opt {
-                    acc.accumulate_eval(evals_1[beta_index] * evals_2[beta_index], index);
+                    acc.accumulate_eval(
+                        evals_1_buffer[beta_index] * evals_2_buffer[beta_index],
+                        index,
+                    );
                 }
             }
         }
diff --git a/src/sumcheck/sumcheck_small_value_eq.rs b/src/sumcheck/sumcheck_small_value_eq.rs
index c50c77e9..b5328158 100644
--- a/src/sumcheck/sumcheck_small_value_eq.rs
+++ b/src/sumcheck/sumcheck_small_value_eq.rs
@@ -58,6 +58,7 @@ fn compute_accumulators_eq<F: Field, EF: ExtensionField<F>>(
 
             // This inner part remains the same, but operates on local variables.
             let mut temp_accumulators: Vec<EF> = vec![EF::ZERO; 27];
+            let mut p_evals_buffer = [F::ZERO; 27];
 
             for x_in in 0..1 << half_l {
                 let start_index = (x_in << x_out_num_variables) | x_out;
@@ -72,10 +73,10 @@ fn compute_accumulators_eq<F: Field, EF: ExtensionField<F>>(
                     .try_into()
                     .unwrap();
 
-                let p_evals = compute_p_beta(&current_evals_array);
+                compute_p_beta(&current_evals_array, &mut p_evals_buffer);
                 let e_in_value = e_in[x_in];
 
-                for (accumulator, &p_eval) in temp_accumulators.iter_mut().zip(&p_evals) {
+                for (accumulator, &p_eval) in temp_accumulators.iter_mut().zip(&p_evals_buffer) {
                     *accumulator += e_in_value * p_eval;
                 }
             }

From a3c88b78f2ba6c4083f30f02b5f9cc799215a3cf Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Thu, 9 Oct 2025 16:32:47 -0300
Subject: [PATCH 5/5] Improve memory access pattern in accumulator loop

---
 src/sumcheck/sumcheck_small_value_eq.rs | 131 +++++++++++++++---------
 1 file changed, 85 insertions(+), 46 deletions(-)

diff --git a/src/sumcheck/sumcheck_small_value_eq.rs b/src/sumcheck/sumcheck_small_value_eq.rs
index b5328158..36c6b7c3 100644
--- a/src/sumcheck/sumcheck_small_value_eq.rs
+++ b/src/sumcheck/sumcheck_small_value_eq.rs
@@ -34,8 +34,46 @@ fn precompute_e_out<F: Field>(w: &MultilinearPoint<F>) -> [Vec<F>; NUM_OF_ROUNDS
     })
 }
 
-//
+/// Reorders the polynomial evaluations to improve cache locality.
+///
+/// Instead of the original layout, this function groups the 8 values
+/// needed for each `compute_p_beta` call into contiguous blocks.
+fn transpose_poly_for_svo<F: Field>(
+    poly: &EvaluationsList<F>,
+    num_variables: usize,
+    x_out_num_vars: usize,
+    half_l: usize,
+) -> Vec<F> {
+    let num_x_in = 1 << half_l;
+    let num_x_out = 1 << x_out_num_vars;
+    let step_size = 1 << (num_variables - NUM_OF_ROUNDS);
+    let block_size = 8;
+
+    // Pre-allocate the full memory for the transposed data.
+    let mut transposed_poly = vec![F::ZERO; 1 << num_variables];
+    let x_out_block_size = num_x_in * block_size;
+
+    // Parallelize the transposition work.
+    transposed_poly
+        .par_chunks_mut(x_out_block_size)
+        .enumerate()
+        .for_each(|(x_out, chunk)| {
+            // Each thread works on a separate `x_out` chunk.
+            for x_in in 0..num_x_in {
+                let start_index = (x_in << x_out_num_vars) | x_out;
+
+                // The destination index is relative to the start of the current chunk.
+                let dest_base_index = x_in * block_size;
+
+                let mut iter = poly.iter().skip(start_index).step_by(step_size);
+                for i in 0..block_size {
+                    chunk[dest_base_index + i] = *iter.next().unwrap();
+                }
+            }
+        });
 
+    transposed_poly
+}
 // Procedure 9. Page 37.
 fn compute_accumulators_eq<F: Field, EF: ExtensionField<F>>(
     poly: &EvaluationsList<F>,
@@ -48,6 +86,10 @@ fn compute_accumulators_eq<F: Field, EF: ExtensionField<F>>(
     let x_out_num_variables = half_l - NUM_OF_ROUNDS + (l % 2);
     debug_assert_eq!(half_l + x_out_num_variables, l - NUM_OF_ROUNDS);
 
+    // Optimization number 3: Transpose the polynomial to improve cache locality.
+    // 1. Transpose the polynomial before entering the parallel loop.
+    let transposed_poly = transpose_poly_for_svo(poly, l, x_out_num_variables, half_l);
+
     // Parallelize the outer loop over `x_out`
     (0..1 << x_out_num_variables)
         .into_par_iter()
@@ -59,24 +101,21 @@ fn compute_accumulators_eq<F: Field, EF: ExtensionField<F>>(
             // This inner part remains the same, but operates on local variables.
             let mut temp_accumulators: Vec<EF> = vec![EF::ZERO; 27];
             let mut p_evals_buffer = [F::ZERO; 27];
+            let num_x_in = 1 << half_l;
 
-            for x_in in 0..1 << half_l {
-                let start_index = (x_in << x_out_num_variables) | x_out;
-                let step_size = 1 << (l - NUM_OF_ROUNDS);
-
-                let current_evals_array: [F; 8] = poly
-                    .iter()
-                    .skip(start_index)
-                    .step_by(step_size)
-                    .copied()
-                    .collect::<Vec<F>>()
+            for x_in in 0..num_x_in {
+                // 2. Read a contiguous block instead of jumping through memory.
+                let block_start = (x_out * num_x_in + x_in) * 8;
+                let current_evals_arr: [F; 8] = transposed_poly[block_start..block_start + 8]
                     .try_into()
                     .unwrap();
 
-                compute_p_beta(&current_evals_array, &mut p_evals_buffer);
+                compute_p_beta(&current_evals_arr, &mut p_evals_buffer);
                 let e_in_value = e_in[x_in];
 
-                for (accumulator, &p_eval) in temp_accumulators.iter_mut().zip(&p_evals_buffer) {
+                for (accumulator, &p_eval) in
+                    temp_accumulators.iter_mut().zip(p_evals_buffer.iter())
+                {
                     *accumulator += e_in_value * p_eval;
                 }
             }
@@ -86,118 +125,118 @@ fn compute_accumulators_eq<F: Field, EF: ExtensionField<F>>(
             let temp_acc = &temp_accumulators;
             let e_out_2 = e_out[2][x_out];
 
-            // Pre-fetch e_out values to avoid repeated indexing
-            let e_out_0: Vec<EF> = (0..4)
-                .map(|y| e_out[0][(y << x_out_num_variables) | x_out])
-                .collect();
-            let e_out_1: Vec<EF> = (0..2)
-                .map(|y| e_out[1][(y << x_out_num_variables) | x_out])
-                .collect();
+            // Pre-fetch e_out values to avoid repeated indexing and allocations.
+            let e0_0 = e_out[0][(0 << x_out_num_variables) | x_out];
+            let e0_1 = e_out[0][(1 << x_out_num_variables) | x_out];
+            let e0_2 = e_out[0][(2 << x_out_num_variables) | x_out];
+            let e0_3 = e_out[0][(3 << x_out_num_variables) | x_out];
+            let e1_0 = e_out[1][(0 << x_out_num_variables) | x_out];
+            let e1_1 = e_out[1][(1 << x_out_num_variables) | x_out];
 
             // Now we do not use the idx4 function since we are directly computing the indices.
 
             // beta_index = 0; b=(0,0,0);
-            local_accumulators.accumulate(0, 0, e_out_0[0] * temp_acc[0]); // y=0<<1|0=0
-            local_accumulators.accumulate(1, 0, e_out_1[0] * temp_acc[0]); // y=0
+            local_accumulators.accumulate(0, 0, e0_0 * temp_acc[0]); // y=0<<1|0=0
+            local_accumulators.accumulate(1, 0, e1_0 * temp_acc[0]); // y=0
             local_accumulators.accumulate(2, 0, e_out_2 * temp_acc[0]);
 
             // beta_index = 1; b=(0,0,1);
-            local_accumulators.accumulate(0, 0, e_out_0[1] * temp_acc[1]); // y=0<<1|1=1
-            local_accumulators.accumulate(1, 0, e_out_1[1] * temp_acc[1]); // y=1
+            local_accumulators.accumulate(0, 0, e0_1 * temp_acc[1]); // y=0<<1|1=1
+            local_accumulators.accumulate(1, 0, e1_1 * temp_acc[1]); // y=1
             local_accumulators.accumulate(2, 1, e_out_2 * temp_acc[1]);
 
             // beta_index = 2; b=(0,0,2);
             local_accumulators.accumulate(2, 2, e_out_2 * temp_acc[2]);
 
             // beta_index = 3; b=(0,1,0);
-            local_accumulators.accumulate(0, 0, e_out_0[2] * temp_acc[3]); // y=1<<1|0=2
-            local_accumulators.accumulate(1, 1, e_out_1[0] * temp_acc[3]); // y=0
+            local_accumulators.accumulate(0, 0, e0_2 * temp_acc[3]); // y=1<<1|0=2
+            local_accumulators.accumulate(1, 1, e1_0 * temp_acc[3]); // y=0
             local_accumulators.accumulate(2, 3, e_out_2 * temp_acc[3]);
 
             // beta_index = 4; b=(0,1,1);
-            local_accumulators.accumulate(0, 0, e_out_0[3] * temp_acc[4]); // y=1<<1|1=3
-            local_accumulators.accumulate(1, 1, e_out_1[1] * temp_acc[4]); // y=1
+            local_accumulators.accumulate(0, 0, e0_3 * temp_acc[4]); // y=1<<1|1=3
+            local_accumulators.accumulate(1, 1, e1_1 * temp_acc[4]); // y=1
             local_accumulators.accumulate(2, 4, e_out_2 * temp_acc[4]);
 
             // beta_index = 5; b=(0,1,2);
             local_accumulators.accumulate(2, 5, e_out_2 * temp_acc[5]);
 
             // beta_index = 6; b=(0,2,0);
-            local_accumulators.accumulate(1, 2, e_out_1[0] * temp_acc[6]); // y=0
+            local_accumulators.accumulate(1, 2, e1_0 * temp_acc[6]); // y=0
             local_accumulators.accumulate(2, 6, e_out_2 * temp_acc[6]);
 
             // beta_index = 7; b=(0,2,1);
-            local_accumulators.accumulate(1, 2, e_out_1[1] * temp_acc[7]); // y=1
+            local_accumulators.accumulate(1, 2, e1_1 * temp_acc[7]); // y=1
             local_accumulators.accumulate(2, 7, e_out_2 * temp_acc[7]);
 
             // beta_index = 8; b=(0,2,2);
             local_accumulators.accumulate(2, 8, e_out_2 * temp_acc[8]);
 
             // beta_index = 9; b=(1,0,0);
-            local_accumulators.accumulate(0, 1, e_out_0[0] * temp_acc[9]); // y=0<<1|0=0
-            local_accumulators.accumulate(1, 3, e_out_1[0] * temp_acc[9]); // y=0
+            local_accumulators.accumulate(0, 1, e0_0 * temp_acc[9]); // y=0<<1|0=0
+            local_accumulators.accumulate(1, 3, e1_0 * temp_acc[9]); // y=0
             local_accumulators.accumulate(2, 9, e_out_2 * temp_acc[9]);
 
             // beta_index = 10; b=(1,0,1);
-            local_accumulators.accumulate(0, 1, e_out_0[1] * temp_acc[10]); // y=0<<1|1=1
-            local_accumulators.accumulate(1, 3, e_out_1[1] * temp_acc[10]); // y=1
+            local_accumulators.accumulate(0, 1, e0_1 * temp_acc[10]); // y=0<<1|1=1
+            local_accumulators.accumulate(1, 3, e1_1 * temp_acc[10]); // y=1
             local_accumulators.accumulate(2, 10, e_out_2 * temp_acc[10]);
 
             // beta_index = 11; b=(1,0,2);
             local_accumulators.accumulate(2, 11, e_out_2 * temp_acc[11]);
 
             // beta_index = 12; b=(1,1,0);
-            local_accumulators.accumulate(0, 1, e_out_0[2] * temp_acc[12]); // y=1<<1|0=2
-            local_accumulators.accumulate(1, 4, e_out_1[0] * temp_acc[12]); // y=0
+            local_accumulators.accumulate(0, 1, e0_2 * temp_acc[12]); // y=1<<1|0=2
+            local_accumulators.accumulate(1, 4, e1_0 * temp_acc[12]); // y=0
             local_accumulators.accumulate(2, 12, e_out_2 * temp_acc[12]);
 
             // beta_index = 13; b=(1,1,1);
-            local_accumulators.accumulate(0, 1, e_out_0[3] * temp_acc[13]); // y=1<<1|1=3
-            local_accumulators.accumulate(1, 4, e_out_1[1] * temp_acc[13]); // y=1
+            local_accumulators.accumulate(0, 1, e0_3 * temp_acc[13]); // y=1<<1|1=3
+            local_accumulators.accumulate(1, 4, e1_1 * temp_acc[13]); // y=1
             local_accumulators.accumulate(2, 13, e_out_2 * temp_acc[13]);
 
             // beta_index = 14; b=(1,1,2);
             local_accumulators.accumulate(2, 14, e_out_2 * temp_acc[14]);
 
             // beta_index = 15; b=(1,2,0);
-            local_accumulators.accumulate(1, 5, e_out_1[0] * temp_acc[15]); // y=0
+            local_accumulators.accumulate(1, 5, e1_0 * temp_acc[15]); // y=0
             local_accumulators.accumulate(2, 15, e_out_2 * temp_acc[15]);
 
             // beta_index = 16; b=(1,2,1);
-            local_accumulators.accumulate(1, 5, e_out_1[1] * temp_acc[16]); // y=1
+            local_accumulators.accumulate(1, 5, e1_1 * temp_acc[16]); // y=1
             local_accumulators.accumulate(2, 16, e_out_2 * temp_acc[16]);
 
             // beta_index = 17; b=(1,2,2);
             local_accumulators.accumulate(2, 17, e_out_2 * temp_acc[17]);
 
             // beta_index = 18; b=(2,0,0);
-            local_accumulators.accumulate(1, 6, e_out_1[0] * temp_acc[18]); // y=0
+            local_accumulators.accumulate(1, 6, e1_0 * temp_acc[18]); // y=0
             local_accumulators.accumulate(2, 18, e_out_2 * temp_acc[18]);
 
             // beta_index = 19; b=(2,0,1);
-            local_accumulators.accumulate(1, 6, e_out_1[1] * temp_acc[19]); // y=1
+            local_accumulators.accumulate(1, 6, e1_1 * temp_acc[19]); // y=1
             local_accumulators.accumulate(2, 19, e_out_2 * temp_acc[19]);
 
             // beta_index = 20; b=(2,0,2);
             local_accumulators.accumulate(2, 20, e_out_2 * temp_acc[20]);
 
             // beta_index = 21; b=(2,1,0);
-            local_accumulators.accumulate(1, 7, e_out_1[0] * temp_acc[21]); // y=0
+            local_accumulators.accumulate(1, 7, e1_0 * temp_acc[21]); // y=0
             local_accumulators.accumulate(2, 21, e_out_2 * temp_acc[21]);
 
             // beta_index = 22; b=(2,1,1);
-            local_accumulators.accumulate(1, 7, e_out_1[1] * temp_acc[22]); // y=1
+            local_accumulators.accumulate(1, 7, e1_1 * temp_acc[22]); // y=1
             local_accumulators.accumulate(2, 22, e_out_2 * temp_acc[22]);
 
             // beta_index = 23; b=(2,1,2);
             local_accumulators.accumulate(2, 23, e_out_2 * temp_acc[23]);
 
             // beta_index = 24; b=(2,2,0);
-            local_accumulators.accumulate(1, 8, e_out_1[0] * temp_acc[24]); // y=0
+            local_accumulators.accumulate(1, 8, e1_0 * temp_acc[24]); // y=0
             local_accumulators.accumulate(2, 24, e_out_2 * temp_acc[24]);
 
             // beta_index = 25; b=(2,2,1);
-            local_accumulators.accumulate(1, 8, e_out_1[1] * temp_acc[25]); // y=1
+            local_accumulators.accumulate(1, 8, e1_1 * temp_acc[25]); // y=1
             local_accumulators.accumulate(2, 25, e_out_2 * temp_acc[25]);
 
             // beta_index = 26; b=(2,2,2);