From 81cacbcc32dfb8285a9ed9c07dea9b2c06232fc5 Mon Sep 17 00:00:00 2001 From: Sethu Iyer Date: Sat, 7 Feb 2026 09:11:43 +0530 Subject: [PATCH] Fix CUDA initialization safety and checks --- include/baha/baha.cuh | 19 ++++++++++++++----- src/baha_gpu.cu | 40 ++++++++++++++++++++++++---------------- 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/include/baha/baha.cuh b/include/baha/baha.cuh index d80b850..0154ba9 100644 --- a/include/baha/baha.cuh +++ b/include/baha/baha.cuh @@ -15,6 +15,7 @@ * - Launch bounds for optimal occupancy */ +#include #include #include @@ -155,11 +156,13 @@ ramsey_baha_kernel_optimized( } __syncthreads(); - // Cooperative state initialization - if (tid < words_per_state) { - const unsigned int rand_val = curand(&s_rng); - s_state[tid] = rand_val ^ (tid * 12345u); - s_best_state[tid] = s_state[tid]; + // Cooperative state initialization (single RNG to avoid races) + if (tid == 0) { + for (int i = 0; i < words_per_state; ++i) { + const unsigned int rand_val = curand(&s_rng); + s_state[i] = rand_val ^ (static_cast(i) * 12345u); + s_best_state[i] = s_state[i]; + } } __syncthreads(); @@ -325,6 +328,12 @@ inline cudaError_t launch_ramsey_optimizer( int words_per_state, cudaStream_t stream = 0 ) { + if (num_blocks <= 0) { + return cudaErrorInvalidValue; + } + if (words_per_state > 64 || words_per_state <= 0) { + return cudaErrorInvalidValue; + } // Calculate optimal grid size int device; cudaGetDevice(&device); diff --git a/src/baha_gpu.cu b/src/baha_gpu.cu index 192b09f..c86b133 100644 --- a/src/baha_gpu.cu +++ b/src/baha_gpu.cu @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include #include #include @@ -179,19 +181,25 @@ int main() { int threads_per_block = 256; int total_threads = num_blocks * threads_per_block; - int* d_sequence; cudaMalloc(&d_sequence, n * sizeof(int)); - cudaMemcpy(d_sequence, h_sequence.data(), n * sizeof(int), cudaMemcpyHostToDevice); + int* d_sequence; + 
cudaCheckError(cudaMalloc(&d_sequence, n * sizeof(int))); + cudaCheckError(cudaMemcpy(d_sequence, h_sequence.data(), n * sizeof(int), cudaMemcpyHostToDevice)); - curandState* d_states; cudaMalloc(&d_states, total_threads * sizeof(curandState)); + curandState* d_states; + cudaCheckError(cudaMalloc(&d_states, total_threads * sizeof(curandState))); - int* d_pop_moves; cudaMalloc(&d_pop_moves, total_threads * MAX_N * sizeof(int)); - int* d_pop_enes; cudaMalloc(&d_pop_enes, total_threads * sizeof(int)); + int* d_pop_moves; + cudaCheckError(cudaMalloc(&d_pop_moves, total_threads * MAX_N * sizeof(int))); + int* d_pop_enes; + cudaCheckError(cudaMalloc(&d_pop_enes, total_threads * sizeof(int))); - int* d_best_moves; cudaMalloc(&d_best_moves, MAX_N * sizeof(int)); - int* d_best_energy; cudaMalloc(&d_best_energy, sizeof(int)); + int* d_best_moves; + cudaCheckError(cudaMalloc(&d_best_moves, MAX_N * sizeof(int))); + int* d_best_energy; + cudaCheckError(cudaMalloc(&d_best_energy, sizeof(int))); int h_init_best = 999999; - cudaMemcpy(d_best_energy, &h_init_best, sizeof(int), cudaMemcpyHostToDevice); + cudaCheckError(cudaMemcpy(d_best_energy, &h_init_best, sizeof(int), cudaMemcpyHostToDevice)); // Log setup std::ofstream log("protein_log.csv"); @@ -220,10 +228,10 @@ int main() { // Read best int current_global_best_e; - cudaMemcpy(&current_global_best_e, d_best_energy, sizeof(int), cudaMemcpyDeviceToHost); + cudaCheckError(cudaMemcpy(&current_global_best_e, d_best_energy, sizeof(int), cudaMemcpyDeviceToHost)); // Only verify moves if energy improved or periodically - cudaMemcpy(h_best_moves.data(), d_best_moves, (n-1)*sizeof(int), cudaMemcpyDeviceToHost); + cudaCheckError(cudaMemcpy(h_best_moves.data(), d_best_moves, (n-1)*sizeof(int), cudaMemcpyDeviceToHost)); std::cout << "Frame " << frame << " (Beta=" << std::fixed << std::setprecision(2) << beta << "): Best E=" << current_global_best_e << "\r"; std::cout.flush(); @@ -239,12 +247,12 @@ int main() { log.close(); std::cout << "\nDone! 
Log saved to protein_log.csv\n"; - cudaFree(d_sequence); - cudaFree(d_states); - cudaFree(d_pop_moves); - cudaFree(d_pop_enes); - cudaFree(d_best_moves); - cudaFree(d_best_energy); + cudaCheckError(cudaFree(d_sequence)); + cudaCheckError(cudaFree(d_states)); + cudaCheckError(cudaFree(d_pop_moves)); + cudaCheckError(cudaFree(d_pop_enes)); + cudaCheckError(cudaFree(d_best_moves)); + cudaCheckError(cudaFree(d_best_energy)); return 0; }