Skip to content

Commit 2b1f7a4

Browse files
Velaciela and hero78119 authored
feat: integrate GPU prover (#987)
### Description ### Performance - fibonacci $\text{fib}(2^{20})$: runs `12M` steps (warmup + rerun) - machine(node-19): `AMD EPYC 7702 64-Core`, `1 x RTX 3090` ``` # CPU RUST_LOG=debug cargo run --release --package ceno_zkvm --bin e2e -- --platform=ceno --hints=20 --public-io=3130 --profiling=2 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci 2>&1 | tee "fib_perf_cpu.log" ``` ``` # GPU make enable-gpu RUST_LOG=debug cargo run --release --features gpu --package ceno_zkvm --bin e2e -- --platform=ceno --hints=20 --public-io=3130 --profiling=2 --field=goldilocks examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee "fib_perf_gpu.log" make disable-gpu ``` - **phase1: basefold commit & open** | metric | CPU time | GPU time | speedup | |------|----------|----------|--------| | ZKVM_create_proof | 13.5s | 8.0s | 1.68x | | batch commit to traces | 6.07s | 1.15s | 5.27x | | transfer MLEs to host | 0 | 550ms | N/A | | transfer pk to device | 0 | 4.53ms | N/A | | main_proofs (no GPU yet) | 6.04s | 6.15s | 0.98x | | pcs_opening | 1.42s | 168ms | 8.45x | - **(WIP) phase2: main & tower prove** | metric | CPU time | GPU time | speedup | |------|----------|----------|--------| | ZKVM_create_proof | 13.6s | 6.72s | 2.02x | | batch commit to traces | 6.07s | 1.15s | 5.27x | | transfer MLEs to host | 0 | 550ms (next: eliminate) | N/A | | transfer pk to device | 0 | 4.53ms | N/A | | main & tower prove | 6.04s | 4.85s | N/A | | -- main (todo: update) | - | - | - | | -- tower | 4.12s | 2.72s (next: optimize) | 1.51x | | pcs_opening | 1.42s | 168ms | 8.45x | ### Current Status - [x] Add GPU Prover Hal - [x] Manually checked that `batch_commit` and `batch_open` produce consistent results between CPU and GPU provers - `cargo test --release test_single_add_instance_e2e -- --nocapture` - [x] API & IO, to pass ceno verifier - [x] full GPU pipeline for basefold commit -> open - [x] tower: witness, prove - [x] main: layer witness, prove ### Future 
improvements - code quality - performance issues - optimize GPU memory usage --------- Co-authored-by: sm.wu <[email protected]>
1 parent ec4f614 commit 2b1f7a4

37 files changed

+2825
-422
lines changed

Cargo.lock

Lines changed: 46 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ tracing-forest = { version = "0.1.6" }
6464
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
6565
uint = "0.8"
6666

67+
ceno_gpu = { path = "utils/cuda_hal", package = "cuda_hal" }
68+
6769
[profile.dev]
6870
lto = "thin"
6971
# We are running our tests with optimizations turned on to make them faster.
@@ -76,3 +78,17 @@ opt-level = 3
7678

7779
[profile.release]
7880
lto = "thin"
81+
82+
#[patch."ssh://[email protected]/scroll-tech/ceno-gpu.git"]
83+
#ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal" }
84+
85+
#[patch."https://github.com/scroll-tech/gkr-backend"]
86+
#ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" }
87+
#mpcs = { path = "../gkr-backend/crates/mpcs", package = "mpcs" }
88+
#multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" }
89+
#p3 = { path = "../gkr-backend/crates/p3", package = "p3" }
90+
#poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" }
91+
#sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" }
92+
#transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" }
93+
#whir = { path = "../gkr-backend/crates/whir", package = "whir" }
94+
#witness = { path = "../gkr-backend/crates/witness", package = "witness" }

Makefile

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Makefile for conditional GPU builds
2+
3+
.PHONY: enable-gpu disable-gpu clean help
4+
5+
help:
6+
@echo "Available targets:"
7+
@echo " enable-gpu - Switch to GPU mode (uses remote implementation, requires private repo access)"
8+
@echo " disable-gpu - Switch to CPU mode (uses local placeholder, default state)"
9+
@echo " clean - Clean build artifacts and reset to CPU mode"
10+
@echo ""
11+
@echo "Normal usage:"
12+
@echo " cargo build # CPU build (default, no private repo fetch)"
13+
@echo " make enable-gpu && cargo build # GPU build (requires private repo access)"
14+
15+
enable-gpu:
16+
@./build-scripts/conditional-patch.sh enable-gpu
17+
18+
disable-gpu:
19+
@./build-scripts/conditional-patch.sh disable-gpu
20+
21+
clean:
22+
@cargo clean
23+
@./build-scripts/conditional-patch.sh disable-gpu
24+
@echo "Cleaned build artifacts and reset to CPU mode"

build-scripts/conditional-patch.sh

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#!/bin/bash
2+
3+
# Script to switch between local placeholder and remote GPU implementation
4+
# Usage: ./build-scripts/conditional-patch.sh [enable-gpu|disable-gpu]
5+
6+
WORKSPACE_CARGO="Cargo.toml"
7+
8+
# Workspace dependency declarations
9+
LOCAL_DEP='ceno_gpu = { path = "utils/cuda_hal", package = "cuda_hal" }'
10+
REMOTE_DEP='ceno_gpu = { git = "ssh://[email protected]/scroll-tech/ceno-gpu.git", package = "cuda_hal", branch = "dev/integrate-into-ceno-as-dep" }'
11+
12+
if [ "$1" = "enable-gpu" ]; then
13+
echo "Switching to GPU mode (using remote implementation)..."
14+
15+
# Replace local path with remote git in workspace dependencies
16+
if [[ "$OSTYPE" == "darwin"* ]]; then
17+
# macOS sed
18+
sed -i '' "s|${LOCAL_DEP}|${REMOTE_DEP}|g" "$WORKSPACE_CARGO"
19+
else
20+
# Linux sed
21+
sed -i "s|${LOCAL_DEP}|${REMOTE_DEP}|g" "$WORKSPACE_CARGO"
22+
fi
23+
24+
echo "✅ Switched to remote GPU implementation"
25+
echo "Now you can run: cargo build -p ceno_zkvm -F gpu"
26+
27+
elif [ "$1" = "disable-gpu" ]; then
28+
echo "Switching to CPU mode (using local placeholder)..."
29+
30+
# Replace remote git with local path in workspace dependencies
31+
if [[ "$OSTYPE" == "darwin"* ]]; then
32+
# macOS sed
33+
sed -i '' "s|${REMOTE_DEP}|${LOCAL_DEP}|g" "$WORKSPACE_CARGO"
34+
else
35+
# Linux sed
36+
sed -i "s|${REMOTE_DEP}|${LOCAL_DEP}|g" "$WORKSPACE_CARGO"
37+
fi
38+
39+
echo "✅ Switched to local placeholder implementation"
40+
echo "Now you can run: cargo build -p ceno_zkvm --no-default-features"
41+
42+
else
43+
echo "Usage: $0 [enable-gpu|disable-gpu]"
44+
echo " enable-gpu - Switch to remote GPU implementation (requires private repo access)"
45+
echo " disable-gpu - Switch to local placeholder (default, no private repo access)"
46+
exit 1
47+
fi
48+
49+
echo "Done."

ceno_cli/src/commands/common_args/ceno.rs

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,20 @@ use ceno_host::{CenoStdin, memory_from_file};
66
use ceno_zkvm::{
77
e2e::*,
88
scheme::{
9-
constants::MAX_NUM_VARIABLES, mock_prover::LkMultiplicityKey, verifier::ZKVMVerifier,
9+
constants::MAX_NUM_VARIABLES, create_backend, create_prover,
10+
mock_prover::LkMultiplicityKey, verifier::ZKVMVerifier,
1011
},
1112
};
1213
use clap::Args;
1314
use ff_ext::{BabyBearExt4, ExtensionField, GoldilocksExt2};
14-
use gkr_iop::cpu::{CpuBackend, CpuProver};
15+
1516
use mpcs::{
1617
Basefold, BasefoldRSParams, PolynomialCommitmentScheme, SecurityLevel, Whir, WhirDefaultSpec,
1718
};
1819
use serde::Serialize;
1920
use std::{
2021
fs::File,
2122
path::{Path, PathBuf},
22-
rc::Rc,
2323
};
2424

2525
/// Ceno options
@@ -373,12 +373,9 @@ fn run_elf_inner<
373373
platform.hints.len()
374374
);
375375

376-
// TODO support GPU backend + prover
377-
let backend: Rc<_> =
378-
CpuBackend::<E, PCS>::new(options.max_num_variables, options.security_level).into();
379-
376+
let backend = create_backend(options.max_num_variables, options.security_level);
380377
Ok(run_e2e_with_checkpoint::<E, PCS, _, _>(
381-
CpuProver::new(backend.clone()),
378+
create_prover(backend.clone()),
382379
program,
383380
platform,
384381
&hints,

ceno_zkvm/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,15 @@ version.workspace = true
1313
base64 = "0.22"
1414
ceno-examples = { path = "../examples-builder" }
1515
ceno_emul = { path = "../ceno_emul" }
16+
ceno_gpu = { workspace = true, optional = true }
1617
ceno_host = { path = "../ceno_host" }
18+
cudarc = { version = "0.13.0", features = ["driver", "cuda-version-from-build-system"], optional = true }
1719
either.workspace = true
1820
ff_ext.workspace = true
1921
gkr_iop = { path = "../gkr_iop" }
2022
mpcs.workspace = true
2123
multilinear_extensions.workspace = true
24+
once_cell = "1.21.3"
2225
p3.workspace = true
2326
rayon.workspace = true
2427
serde.workspace = true
@@ -64,6 +67,7 @@ default = ["forbid_overflow", "nightly-features", "u16limb_circuit", "parallel"]
6467
flamegraph = ["pprof2/flamegraph", "pprof2/criterion"]
6568
forbid_overflow = []
6669
goldilocks = ["forbid_overflow", "nightly-features"]
70+
gpu = ["gkr_iop/gpu", "dep:ceno_gpu"]
6771
jemalloc = ["dep:tikv-jemallocator", "dep:tikv-jemalloc-ctl"]
6872
jemalloc-prof = ["jemalloc", "tikv-jemallocator?/profiling"]
6973
nightly-features = [

ceno_zkvm/benches/bitwise_keccakf.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,9 @@ fn keccak_f_fn(c: &mut Criterion) {
3636

3737
#[allow(clippy::unit_arg)]
3838
run_bitwise_keccakf::<GoldilocksExt2, BasefoldDefault<GoldilocksExt2>>(
39-
setup_bitwise_keccak_gkr_circuit().expect("setup circuit error"),
39+
setup_bitwise_keccak_gkr_circuit()
40+
.expect("setup circuit error")
41+
.1,
4042
black_box(states),
4143
false,
4244
false,

ceno_zkvm/benches/fibonacci.rs

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
1-
use std::{rc::Rc, time::Duration};
1+
use std::time::Duration;
22

33
use ceno_emul::{Platform, Program};
44
use ceno_host::CenoStdin;
55
use ceno_zkvm::{
66
self,
77
e2e::{Checkpoint, Preset, run_e2e_with_checkpoint, setup_platform},
8-
scheme::verifier::ZKVMVerifier,
8+
scheme::{create_backend, create_prover},
99
};
1010
mod alloc;
1111
use criterion::*;
1212

1313
use ff_ext::BabyBearExt4;
14-
use gkr_iop::cpu::{CpuBackend, CpuProver};
14+
use gkr_iop::cpu::default_backend_config;
15+
16+
use ceno_zkvm::scheme::verifier::ZKVMVerifier;
1517
use mpcs::BasefoldDefault;
1618
use transcript::BasicTranscript;
1719

@@ -40,14 +42,16 @@ fn setup() -> (Program, Platform) {
4042

4143
fn fibonacci_prove(c: &mut Criterion) {
4244
let (program, platform) = setup();
43-
let backend: Rc<_> = CpuBackend::<E, Pcs>::default().into();
45+
let (max_num_variables, security_level) = default_backend_config();
46+
let backend = create_backend::<E, Pcs>(max_num_variables, security_level);
47+
4448
for max_steps in [1usize << 20, 1usize << 21, 1usize << 22] {
4549
// retrive 1 << 20th fibonacci element >> max_steps
4650
let mut hints = CenoStdin::default();
4751
let _ = hints.write(&20);
4852
// estimate proof size data first
4953
let result = run_e2e_with_checkpoint::<E, Pcs, _, _>(
50-
CpuProver::new(backend.clone()),
54+
create_prover(backend.clone()),
5155
program.clone(),
5256
platform.clone(),
5357
&Vec::from(&hints),
@@ -84,7 +88,7 @@ fn fibonacci_prove(c: &mut Criterion) {
8488
let mut time = Duration::new(0, 0);
8589
for _ in 0..iters {
8690
let result = run_e2e_with_checkpoint::<E, Pcs, _, _>(
87-
CpuProver::new(backend.clone()),
91+
create_prover(backend.clone()),
8892
program.clone(),
8993
platform.clone(),
9094
&Vec::from(&hints),

ceno_zkvm/benches/fibonacci_witness.rs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,14 @@ use ceno_host::CenoStdin;
33
use ceno_zkvm::{
44
self,
55
e2e::{Checkpoint, Preset, run_e2e_with_checkpoint, setup_platform},
6+
scheme::{create_backend, create_prover},
67
};
7-
use std::{fs, path::PathBuf, rc::Rc, time::Duration};
8+
use std::{fs, path::PathBuf, time::Duration};
89
mod alloc;
910
use criterion::*;
11+
1012
use ff_ext::BabyBearExt4;
11-
use gkr_iop::cpu::{CpuBackend, CpuProver};
13+
use gkr_iop::cpu::default_backend_config;
1214
use mpcs::BasefoldDefault;
1315

1416
criterion_group! {
@@ -38,7 +40,8 @@ fn setup() -> (Program, Platform) {
3840

3941
fn fibonacci_witness(c: &mut Criterion) {
4042
let (program, platform) = setup();
41-
let backend: Rc<_> = CpuBackend::<E, Pcs>::default().into();
43+
let (max_num_variables, security_level) = default_backend_config();
44+
let backend = create_backend::<E, Pcs>(max_num_variables, security_level);
4245

4346
let max_steps = usize::MAX;
4447
let mut group = c.benchmark_group(format!("fib_wit_max_steps_{}", max_steps));
@@ -59,7 +62,7 @@ fn fibonacci_witness(c: &mut Criterion) {
5962
let mut time = Duration::new(0, 0);
6063
for _ in 0..iters {
6164
let result = run_e2e_with_checkpoint::<E, Pcs, _, _>(
62-
CpuProver::new(backend.clone()),
65+
create_prover(backend.clone()),
6366
program.clone(),
6467
platform.clone(),
6568
&Vec::from(&hints),

ceno_zkvm/benches/is_prime.rs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
1-
use std::{rc::Rc, time::Duration};
1+
use std::time::Duration;
22

33
use ceno_emul::{Platform, Program};
44
use ceno_host::CenoStdin;
55
use ceno_zkvm::{
66
self,
77
e2e::{Checkpoint, Preset, run_e2e_with_checkpoint, setup_platform},
8+
scheme::{create_backend, create_prover},
89
};
910
mod alloc;
1011
use criterion::*;
1112
use ff_ext::BabyBearExt4;
12-
use gkr_iop::cpu::{CpuBackend, CpuProver};
13+
use gkr_iop::cpu::default_backend_config;
1314
use mpcs::BasefoldDefault;
1415

1516
criterion_group! {
@@ -36,7 +37,9 @@ fn setup() -> (Program, Platform) {
3637

3738
fn is_prime_1(c: &mut Criterion) {
3839
let (program, platform) = setup();
39-
let backend: Rc<_> = CpuBackend::<E, Pcs>::default().into();
40+
41+
let (max_num_variables, security_level) = default_backend_config();
42+
let backend = create_backend::<E, Pcs>(max_num_variables, security_level);
4043

4144
for n in [100u32, 10000u32, 50000u32] {
4245
let max_steps = usize::MAX;
@@ -56,7 +59,7 @@ fn is_prime_1(c: &mut Criterion) {
5659

5760
for _ in 0..iters {
5861
let result = run_e2e_with_checkpoint::<E, Pcs, _, _>(
59-
CpuProver::new(backend.clone()),
62+
create_prover(backend.clone()),
6063
program.clone(),
6164
platform.clone(),
6265
&hints,

0 commit comments

Comments
 (0)