Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions include/baha/baha.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
* - Launch bounds for optimal occupancy
*/

#include <climits>
#include <cuda_runtime.h>
#include <curand_kernel.h>

Expand Down Expand Up @@ -155,11 +156,13 @@ ramsey_baha_kernel_optimized(
}
__syncthreads();

// Cooperative state initialization
if (tid < words_per_state) {
const unsigned int rand_val = curand(&s_rng);
s_state[tid] = rand_val ^ (tid * 12345u);
s_best_state[tid] = s_state[tid];
// Cooperative state initialization (single RNG to avoid races)
if (tid == 0) {
for (int i = 0; i < words_per_state; ++i) {
const unsigned int rand_val = curand(&s_rng);
s_state[i] = rand_val ^ (static_cast<unsigned int>(i) * 12345u);
s_best_state[i] = s_state[i];
}
}
__syncthreads();

Expand Down Expand Up @@ -325,6 +328,12 @@ inline cudaError_t launch_ramsey_optimizer(
int words_per_state,
cudaStream_t stream = 0
) {
if (num_blocks <= 0) {
return cudaErrorInvalidValue;
}
if (words_per_state > 64 || words_per_state <= 0) {
return cudaErrorInvalidValue;
}
// Calculate optimal grid size
int device;
cudaGetDevice(&device);
Expand Down
40 changes: 24 additions & 16 deletions src/baha_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#include <string>
#include <random>
#include <chrono>
#include <cmath>
#include <ctime>
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <iomanip>
Expand Down Expand Up @@ -179,19 +181,25 @@ int main() {
int threads_per_block = 256;
int total_threads = num_blocks * threads_per_block;

int* d_sequence; cudaMalloc(&d_sequence, n * sizeof(int));
cudaMemcpy(d_sequence, h_sequence.data(), n * sizeof(int), cudaMemcpyHostToDevice);
int* d_sequence;
cudaCheckError(cudaMalloc(&d_sequence, n * sizeof(int)));
cudaCheckError(cudaMemcpy(d_sequence, h_sequence.data(), n * sizeof(int), cudaMemcpyHostToDevice));

curandState* d_states; cudaMalloc(&d_states, total_threads * sizeof(curandState));
curandState* d_states;
cudaCheckError(cudaMalloc(&d_states, total_threads * sizeof(curandState)));

int* d_pop_moves; cudaMalloc(&d_pop_moves, total_threads * MAX_N * sizeof(int));
int* d_pop_enes; cudaMalloc(&d_pop_enes, total_threads * sizeof(int));
int* d_pop_moves;
cudaCheckError(cudaMalloc(&d_pop_moves, total_threads * MAX_N * sizeof(int)));
int* d_pop_enes;
cudaCheckError(cudaMalloc(&d_pop_enes, total_threads * sizeof(int)));

int* d_best_moves; cudaMalloc(&d_best_moves, MAX_N * sizeof(int));
int* d_best_energy; cudaMalloc(&d_best_energy, sizeof(int));
int* d_best_moves;
cudaCheckError(cudaMalloc(&d_best_moves, MAX_N * sizeof(int)));
int* d_best_energy;
cudaCheckError(cudaMalloc(&d_best_energy, sizeof(int)));

int h_init_best = 999999;
cudaMemcpy(d_best_energy, &h_init_best, sizeof(int), cudaMemcpyHostToDevice);
cudaCheckError(cudaMemcpy(d_best_energy, &h_init_best, sizeof(int), cudaMemcpyHostToDevice));

// Log setup
std::ofstream log("protein_log.csv");
Expand Down Expand Up @@ -220,10 +228,10 @@ int main() {

// Read best
int current_global_best_e;
cudaMemcpy(&current_global_best_e, d_best_energy, sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckError(cudaMemcpy(&current_global_best_e, d_best_energy, sizeof(int), cudaMemcpyDeviceToHost));

// Only verify moves if energy improved or periodically
cudaMemcpy(h_best_moves.data(), d_best_moves, (n-1)*sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckError(cudaMemcpy(h_best_moves.data(), d_best_moves, (n-1)*sizeof(int), cudaMemcpyDeviceToHost));

std::cout << "Frame " << frame << " (Beta=" << std::fixed << std::setprecision(2) << beta << "): Best E=" << current_global_best_e << "\r";
std::cout.flush();
Expand All @@ -239,12 +247,12 @@ int main() {
log.close();
std::cout << "\nDone! Log saved to protein_log.csv\n";

cudaFree(d_sequence);
cudaFree(d_states);
cudaFree(d_pop_moves);
cudaFree(d_pop_enes);
cudaFree(d_best_moves);
cudaFree(d_best_energy);
cudaCheckError(cudaFree(d_sequence));
cudaCheckError(cudaFree(d_states));
cudaCheckError(cudaFree(d_pop_moves));
cudaCheckError(cudaFree(d_pop_enes));
cudaCheckError(cudaFree(d_best_moves));
cudaCheckError(cudaFree(d_best_energy));

return 0;
}
Loading