debug: illegal memory access (share one lazily initialized CUDA device/HAL and bind the device to the calling thread)

Velaciela · Velaciela · commit 9588dd9580ec · 2025-08-04T09:45:13.000+08:00
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -88,8 +88,8 @@ uint = "0.8"
 
 ceno_gpu = { git = "ssh://git@github.com/scroll-tech/ceno-gpu.git", package = "cuda_hal", branch = "dev/integrate-into-ceno-as-dep" }
 
-[patch."ssh://git@github.com/scroll-tech/ceno-gpu.git"]
-ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal" }
+# [patch."ssh://git@github.com/scroll-tech/ceno-gpu.git"]
+# ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal" }
 
 [profile.dev]
 lto = "thin"
diff --git a/ceno_zkvm/Cargo.toml b/ceno_zkvm/Cargo.toml
@@ -29,6 +29,8 @@ serde_json.workspace = true
 sumcheck.workspace = true
 transcript = { git = "https://github.com/scroll-tech/ceno", package = "transcript", rev = "7adb306" }
 witness = { git = "https://github.com/scroll-tech/ceno", package = "witness", rev = "7adb306" }
+once_cell = "1.21.3"
+cudarc = { version = "0.13.0", features = ["driver", "cuda-version-from-build-system"] }
 
 itertools.workspace = true
 ndarray.workspace = true
diff --git a/ceno_zkvm/src/scheme/gpu/mod.rs b/ceno_zkvm/src/scheme/gpu/mod.rs
@@ -43,6 +43,26 @@ use transcript::{BasicTranscript, Transcript};
 use witness::next_pow2_instance_padding;
 
 use ceno_gpu::gl64::CudaHalGL64;
+use cudarc::driver::{CudaDevice, DriverError};
+
+use once_cell::sync::Lazy;
+use std::sync::Mutex;
+// static CUDA_HAL: Lazy<Mutex<CudaHalGL64>> = Lazy::new(|| {
+//     Mutex::new(CudaHalGL64::new().unwrap())
+// });
+
+static CUDA_DEVICE: Lazy<Result<Arc<CudaDevice>, DriverError>> = Lazy::new(|| {
+    CudaDevice::new(0)
+});
+static CUDA_HAL: Lazy<Result<Arc<Mutex<CudaHalGL64>>, Box<dyn std::error::Error + Send + Sync>>> = Lazy::new(|| {
+    let device = CUDA_DEVICE.as_ref().map_err(|e| format!("Device init failed: {:?}", e))?;
+    device.bind_to_thread()?;
+    
+    CudaHalGL64::new()
+        .map(|hal| Arc::new(Mutex::new(hal)))
+        .map_err(|e| Box::new(e) as Box<dyn std::error::Error + Send + Sync>)
+});
+
 
 pub struct GpuTowerProver;
 
@@ -295,7 +315,12 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> TraceCommitter<GpuBa
             //     panic!("error: type conversion failed");
             // };
 
-            let cuda_hal = CudaHalGL64::new().unwrap();
+            // let cuda_hal = CUDA_HAL.lock().unwrap(); // CudaHalGL64::new().unwrap();
+            let device = CUDA_DEVICE.as_ref().map_err(|e| format!("Device not available: {:?}", e)).unwrap();
+            device.bind_to_thread().unwrap();
+            let hal_arc = CUDA_HAL.as_ref().map_err(|e| format!("HAL not available: {:?}", e)).unwrap();
+            let cuda_hal = hal_arc.lock().unwrap();
+
             let traces_gl64: Vec<witness::RowMajorMatrix<p3::goldilocks::Goldilocks>> = 
                 unsafe { std::mem::transmute(vec_traces.clone()) };
             let pcs_data = cuda_hal.basefold.batch_commit(traces_gl64).unwrap();
@@ -863,6 +888,10 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> MainSumcheckProver<G
     }
 }
 
+use p3::field::extension::BinomialExtensionField;
+type GL64 = p3::goldilocks::Goldilocks;
+type EGL64 = BinomialExtensionField<GL64, 2>;
+
 impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> OpeningProver<GpuBackend<E, PCS>>
     for GpuProver<GpuBackend<E, PCS>>
 {
@@ -880,9 +909,14 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> OpeningProver<GpuBac
             panic!("GPU backend only supports Goldilocks base field");
         }
 
-        use p3::field::extension::BinomialExtensionField;
-        type EGL64 = BinomialExtensionField<p3::goldilocks::Goldilocks, 2>;
-        let cuda_hal = CudaHalGL64::new().unwrap();
+        // use p3::field::extension::BinomialExtensionField;
+        // type GL64 = p3::goldilocks::Goldilocks;
+        // type EGL64 = BinomialExtensionField<GL64, 2>;
+        // let cuda_hal = CUDA_HAL.lock().unwrap(); //CudaHalGL64::new().unwrap();
+        let device = CUDA_DEVICE.as_ref().map_err(|e| format!("Device not available: {:?}", e)).unwrap();
+        device.bind_to_thread().unwrap();
+        let hal_arc = CUDA_HAL.as_ref().map_err(|e| format!("HAL not available: {:?}", e)).unwrap();
+        let cuda_hal = hal_arc.lock().unwrap();
 
         let mut rounds = vec![];
         rounds.push((
@@ -913,13 +947,17 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> OpeningProver<GpuBac
             ));
         }
 
+
+        use ceno_gpu::gl64::buffer::BufferImpl;
+        use ceno_gpu::BasefoldCommitmentWithWitness as BasefoldCommitmentWithWitnessGpu;
+
         // Type conversions using unsafe transmute
         let pp_gl64: &mpcs::basefold::structure::BasefoldProverParams<EGL64, mpcs::BasefoldRSParams> = 
             unsafe { std::mem::transmute(self.pp.as_ref().unwrap()) };
         let rounds_gl64: Vec<_> = rounds
             .iter()
             .map(|(commitment, point_eval_pairs)| {
-                let commitment_gl64: &mpcs::BasefoldCommitmentWithWitness<EGL64> = 
+                let commitment_gl64: &BasefoldCommitmentWithWitnessGpu<GL64, BufferImpl<GL64>> = 
                     unsafe { std::mem::transmute(*commitment) };
                 let point_eval_pairs_gl64: Vec<_> = point_eval_pairs
                     .iter()