From 81cacbcc32dfb8285a9ed9c07dea9b2c06232fc5 Mon Sep 17 00:00:00 2001
From: Sethu Iyer <sethuiyer95@gmail.com>
Date: Sat, 7 Feb 2026 09:11:43 +0530
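Subject: [PATCH] Fix CUDA initialization safety and checks

- baha.cuh: initialize s_state/s_best_state from thread 0 only, so the
  single s_rng generator is advanced by one thread instead of by every
  thread with tid < words_per_state.
- baha.cuh: launch_ramsey_optimizer now returns cudaErrorInvalidValue
  when num_blocks <= 0 or words_per_state is outside 1..64.
- baha_gpu.cu: wrap the cudaMalloc/cudaMemcpy/cudaFree calls in main()
  with cudaCheckError so failures are no longer ignored.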

---
 include/baha/baha.cuh | 19 ++++++++++++++-----
 src/baha_gpu.cu       | 40 ++++++++++++++++++++++++----------------
 2 files changed, 38 insertions(+), 21 deletions(-)
diff --git a/include/baha/baha.cuh b/include/baha/baha.cuh
index d80b850..0154ba9 100644
--- a/include/baha/baha.cuh
+++ b/include/baha/baha.cuh
@@ -15,6 +15,7 @@
  * - Launch bounds for optimal occupancy
  */
 
+#include <climits>
 #include <cuda_runtime.h>
 #include <curand_kernel.h>
 
@@ -155,11 +156,13 @@ ramsey_baha_kernel_optimized(
     }
     __syncthreads();
     
-    // Cooperative state initialization
-    if (tid < words_per_state) {
-        const unsigned int rand_val = curand(&s_rng);
-        s_state[tid] = rand_val ^ (tid * 12345u);
-        s_best_state[tid] = s_state[tid];
+    // State initialization by thread 0 only (single RNG, serialized to avoid races)
+    if (tid == 0) {
+        for (int i = 0; i < words_per_state; ++i) {
+            const unsigned int rand_val = curand(&s_rng);
+            s_state[i] = rand_val ^ (static_cast<unsigned int>(i) * 12345u);
+            s_best_state[i] = s_state[i];
+        }
     }
     __syncthreads();
     
@@ -325,6 +328,12 @@ inline cudaError_t launch_ramsey_optimizer(
     int words_per_state,
     cudaStream_t stream = 0
 ) {
+    if (num_blocks <= 0) {
+        return cudaErrorInvalidValue;
+    }
+    if (words_per_state > 64 || words_per_state <= 0) {
+        return cudaErrorInvalidValue;
+    }
     // Calculate optimal grid size
     int device;
     cudaGetDevice(&device);
diff --git a/src/baha_gpu.cu b/src/baha_gpu.cu
index 192b09f..c86b133 100644
--- a/src/baha_gpu.cu
+++ b/src/baha_gpu.cu
@@ -4,6 +4,8 @@
 #include <string>
 #include <random>
 #include <chrono>
+#include <cmath>
+#include <ctime>
 #include <cuda_runtime.h>
 #include <curand_kernel.h>
 #include <iomanip>
@@ -179,19 +181,25 @@ int main() {
     int threads_per_block = 256;
     int total_threads = num_blocks * threads_per_block;
     
-    int* d_sequence; cudaMalloc(&d_sequence, n * sizeof(int));
-    cudaMemcpy(d_sequence, h_sequence.data(), n * sizeof(int), cudaMemcpyHostToDevice);
+    int* d_sequence;
+    cudaCheckError(cudaMalloc(&d_sequence, n * sizeof(int)));
+    cudaCheckError(cudaMemcpy(d_sequence, h_sequence.data(), n * sizeof(int), cudaMemcpyHostToDevice));
     
-    curandState* d_states; cudaMalloc(&d_states, total_threads * sizeof(curandState));
+    curandState* d_states;
+    cudaCheckError(cudaMalloc(&d_states, total_threads * sizeof(curandState)));
     
-    int* d_pop_moves; cudaMalloc(&d_pop_moves, total_threads * MAX_N * sizeof(int));
-    int* d_pop_enes; cudaMalloc(&d_pop_enes, total_threads * sizeof(int));
+    int* d_pop_moves;
+    cudaCheckError(cudaMalloc(&d_pop_moves, total_threads * MAX_N * sizeof(int)));
+    int* d_pop_enes;
+    cudaCheckError(cudaMalloc(&d_pop_enes, total_threads * sizeof(int)));
     
-    int* d_best_moves; cudaMalloc(&d_best_moves, MAX_N * sizeof(int));
-    int* d_best_energy; cudaMalloc(&d_best_energy, sizeof(int));
+    int* d_best_moves;
+    cudaCheckError(cudaMalloc(&d_best_moves, MAX_N * sizeof(int)));
+    int* d_best_energy;
+    cudaCheckError(cudaMalloc(&d_best_energy, sizeof(int)));
     
     int h_init_best = 999999;
-    cudaMemcpy(d_best_energy, &h_init_best, sizeof(int), cudaMemcpyHostToDevice);
+    cudaCheckError(cudaMemcpy(d_best_energy, &h_init_best, sizeof(int), cudaMemcpyHostToDevice));
     
     // Log setup
     std::ofstream log("protein_log.csv");
@@ -220,10 +228,10 @@ int main() {
         
         // Read best
         int current_global_best_e;
-        cudaMemcpy(&current_global_best_e, d_best_energy, sizeof(int), cudaMemcpyDeviceToHost);
+        cudaCheckError(cudaMemcpy(&current_global_best_e, d_best_energy, sizeof(int), cudaMemcpyDeviceToHost));
         
         // Only verify moves if energy improved or periodically
-        cudaMemcpy(h_best_moves.data(), d_best_moves, (n-1)*sizeof(int), cudaMemcpyDeviceToHost);
+        cudaCheckError(cudaMemcpy(h_best_moves.data(), d_best_moves, (n-1)*sizeof(int), cudaMemcpyDeviceToHost));
         
         std::cout << "Frame " << frame << " (Beta=" << std::fixed << std::setprecision(2) << beta << "): Best E=" << current_global_best_e << "\r";
         std::cout.flush();
@@ -239,12 +247,12 @@ int main() {
     log.close();
     std::cout << "\nDone! Log saved to protein_log.csv\n";
     
-    cudaFree(d_sequence);
-    cudaFree(d_states);
-    cudaFree(d_pop_moves);
-    cudaFree(d_pop_enes);
-    cudaFree(d_best_moves);
-    cudaFree(d_best_energy);
+    cudaCheckError(cudaFree(d_sequence));
+    cudaCheckError(cudaFree(d_states));
+    cudaCheckError(cudaFree(d_pop_moves));
+    cudaCheckError(cudaFree(d_pop_enes));
+    cudaCheckError(cudaFree(d_best_moves));
+    cudaCheckError(cudaFree(d_best_energy));
     
     return 0;
 }