From 39053c5524f71e37159156d066ac236b634711f3 Mon Sep 17 00:00:00 2001
From: Megha <meghakoranga7@gmail.com>
Date: Mon, 2 Feb 2026 12:09:45 +0000
Subject: [PATCH 1/8] Add cuSSL core structure and ML-KEM GPU backend

---
 .gitignore                 |   9 +++
 benchmarks/benchmark_cpu.c |  87 ++++++++++++++++++++++
 benchmarks/benchmark_pqc.c | 149 +++++++++++++++++++++++++++++++++++++
 benchmarks/benchmark_tls.c | 141 +++++++++++++++++++++++++++++++++++
 include/cussl/pqc.h        |  25 +++++++
 src/pqc/mlkem_cupqc.cu     | 108 +++++++++++++++++++++++++++
 6 files changed, 519 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 benchmarks/benchmark_cpu.c
 create mode 100644 benchmarks/benchmark_pqc.c
 create mode 100644 benchmarks/benchmark_tls.c
 create mode 100644 include/cussl/pqc.h
 create mode 100644 src/pqc/mlkem_cupqc.cu

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2a47e9f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+*.o
+*.a
+*.so
+build/
+cupqc_sdk/
+*.pem
+*.crt
+*.key
+*.txt
diff --git a/benchmarks/benchmark_cpu.c b/benchmarks/benchmark_cpu.c
new file mode 100644
index 0000000..3b53db2
--- /dev/null
+++ b/benchmarks/benchmark_cpu.c
@@ -0,0 +1,87 @@
+/* benchmark_cpu.c - Multi-Threaded CPU Benchmark */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <pthread.h>
+#include <openssl/evp.h>
+#include <openssl/err.h>
+
+#define ALGO_NAME "ML-KEM-768"
+
+// Shared specific variables
+int g_iterations_per_thread = 0;
+EVP_PKEY *g_pkey = NULL;
+
+// The Worker Thread Function
+void *cpu_worker(void *arg) {
+    EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new(g_pkey, NULL);
+    if (!ctx) return NULL;
+
+    // We do NOT call Async init, so this runs on CPU Software path
+    EVP_PKEY_encapsulate_init(ctx, NULL);
+
+    unsigned char *secret = malloc(32);
+    unsigned char *ciphertext = malloc(1088);
+    size_t secret_len = 32;
+    size_t ciphertext_len = 1088;
+
+    for (int i = 0; i < g_iterations_per_thread; i++) {
+        secret_len = 32;
+        ciphertext_len = 1088;
+        EVP_PKEY_encapsulate(ctx, ciphertext, &ciphertext_len, secret, &secret_len);
+    }
+
+    free(secret);
+    free(ciphertext);
+    EVP_PKEY_CTX_free(ctx);
+    return NULL;
+}
+
+int main(int argc, char **argv) {
+    int num_threads = 4; // Default to 4 cores
+    int total_iters = 100000;
+
+    if (argc > 1) num_threads = atoi(argv[1]);
+    if (argc > 2) total_iters = atoi(argv[2]);
+
+    g_iterations_per_thread = total_iters / num_threads;
+
+    printf("Benchmarking Multi-Core CPU Performance\n");
+    printf("Algorithm: %s\n", ALGO_NAME);
+    printf("Threads:   %d\n", num_threads);
+    printf("Total Ops: %d\n", num_threads * g_iterations_per_thread);
+
+    // Generate Key (Once)
+    EVP_PKEY_CTX *kctx = EVP_PKEY_CTX_new_from_name(NULL, ALGO_NAME, NULL);
+    EVP_PKEY_keygen_init(kctx);
+    EVP_PKEY_keygen(kctx, &g_pkey);
+    EVP_PKEY_CTX_free(kctx);
+
+    // Launch Threads
+    pthread_t threads[num_threads];
+    clock_t start = clock();
+    struct timespec ts_start, ts_end;
+    clock_gettime(CLOCK_MONOTONIC, &ts_start);
+
+    for (int i = 0; i < num_threads; i++) {
+        pthread_create(&threads[i], NULL, cpu_worker, NULL);
+    }
+
+    // Wait for Threads
+    for (int i = 0; i < num_threads; i++) {
+        pthread_join(threads[i], NULL);
+    }
+
+    clock_gettime(CLOCK_MONOTONIC, &ts_end);
+    
+    double time_spent = (ts_end.tv_sec - ts_start.tv_sec) + 
+                        (ts_end.tv_nsec - ts_start.tv_nsec) / 1e9;
+
+    printf("\n--- CPU Results ---\n");
+    printf("Total Time: %.2f seconds\n", time_spent);
+    printf("Ops/Sec:    %.2f\n", (double)(num_threads * g_iterations_per_thread) / time_spent);
+
+    EVP_PKEY_free(g_pkey);
+    return 0;
+}
diff --git a/benchmarks/benchmark_pqc.c b/benchmarks/benchmark_pqc.c
new file mode 100644
index 0000000..a3563a6
--- /dev/null
+++ b/benchmarks/benchmark_pqc.c
@@ -0,0 +1,149 @@
+/* benchmark_pqc.c - Async Encapsulation Benchmark (FIXED) */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <openssl/evp.h>
+#include <openssl/err.h>
+#include <openssl/async.h>
+
+#define ALGO_NAME "ML-KEM-768"
+#define DEFAULT_JOBS 128
+#define DEFAULT_ITERS 100000
+
+// Structure to pass data into the Async Job
+typedef struct {
+    EVP_PKEY_CTX *ctx;
+    unsigned char *secret;
+    size_t secret_len;
+    unsigned char *ciphertext;
+    size_t ciphertext_len;
+    int result;
+} JobArgs;
+
+// The function that runs inside the Async Fiber
+int encaps_job(void *arg) {
+    JobArgs *args = (JobArgs *)arg;
+    
+    // reset output lengths (crucial for loop)
+    args->secret_len = 32;   // ML-KEM-768 SS len
+    args->ciphertext_len = 1088; // ML-KEM-768 CT len
+
+    if (EVP_PKEY_encapsulate(args->ctx, 
+                             args->ciphertext, &args->ciphertext_len,
+                             args->secret, &args->secret_len) <= 0) {
+        return 0; // Error
+    }
+    return 1; // Success
+}
+
+int main(int argc, char **argv) {
+    int async_jobs = DEFAULT_JOBS;
+    int total_iters = DEFAULT_ITERS;
+
+    // 1. Simple Argument Parsing
+    for(int i=1; i<argc; i++) {
+        if(strcmp(argv[i], "-jobs")==0 && i+1<argc) async_jobs = atoi(argv[++i]);
+        if(strcmp(argv[i], "-iters")==0 && i+1<argc) total_iters = atoi(argv[++i]);
+    }
+
+    printf("Benchmarking %s Encapsulation\n", ALGO_NAME);
+    printf("Configuration: %d Parallel Jobs, %d Total Iterations\n", async_jobs, total_iters);
+
+    // 2. Setup: Generate ONE Keypair (simulating a server key)
+    printf("Generating Server Keypair...\n");
+    EVP_PKEY_CTX *kctx = EVP_PKEY_CTX_new_from_name(NULL, ALGO_NAME, NULL);
+    EVP_PKEY *pkey = NULL;
+    EVP_PKEY_keygen_init(kctx);
+    EVP_PKEY_keygen(kctx, &pkey);
+    EVP_PKEY_CTX_free(kctx);
+
+    if (!pkey) {
+        fprintf(stderr, "Error: Keygen failed. Is the algorithm name correct?\n");
+        return 1;
+    }
+
+    // 3. Setup Async Pool
+    ASYNC_JOB *jobs[async_jobs];
+    ASYNC_WAIT_CTX *wait_ctxs[async_jobs];
+    JobArgs args[async_jobs];
+    int job_status[async_jobs]; // 0=Idle, 1=InProgress/Paused
+    
+    // Initialize Memory
+    for (int i = 0; i < async_jobs; i++) {
+        jobs[i] = NULL;
+        wait_ctxs[i] = ASYNC_WAIT_CTX_new();
+        job_status[i] = 0;
+        
+        // Each job needs its own context derived from the pubkey
+        args[i].ctx = EVP_PKEY_CTX_new(pkey, NULL);
+        
+        // --- FIX IS HERE (added NULL) ---
+        EVP_PKEY_encapsulate_init(args[i].ctx, NULL);
+        
+        args[i].secret = malloc(32);
+        args[i].ciphertext = malloc(1088);
+    }
+
+    printf("Starting Benchmark...\n");
+    clock_t start = clock();
+
+    int started_count = 0;
+    int finished_count = 0;
+    int ret;
+
+    // 4. The "Event Loop" (Round Robin Scheduler)
+    while (finished_count < total_iters) {
+        for (int i = 0; i < async_jobs; i++) {
+            
+            // Case A: Job is Idle, and we have work to do -> START IT
+            if (job_status[i] == 0 && started_count < total_iters) {
+                started_count++;
+                job_status[i] = 1; // Mark active
+                
+                // Launch Job
+                ret = ASYNC_start_job(&jobs[i], wait_ctxs[i], &ret, encaps_job, &args[i], sizeof(JobArgs));
+                
+                if (ret == ASYNC_PAUSE) {
+                    // Good! It hit the GPU batch queue and paused.
+                    // Loop continues to start next job...
+                } else if (ret == ASYNC_FINISH) {
+                    // It finished instantly (CPU fallback or fast path)
+                    job_status[i] = 0;
+                    finished_count++;
+                }
+            } 
+            // Case B: Job is Paused (Waiting for GPU) -> RESUME IT
+            else if (job_status[i] == 1) {
+                // Resume Job
+                ret = ASYNC_start_job(&jobs[i], wait_ctxs[i], &ret, encaps_job, &args[i], sizeof(JobArgs));
+                
+                if (ret == ASYNC_FINISH) {
+                    // Now it's really done
+                    job_status[i] = 0;
+                    finished_count++;
+                }
+                // If ASYNC_PAUSE again, it means batch isn't full yet, keep looping.
+            }
+        }
+    }
+
+    clock_t end = clock();
+
+    // 5. Cleanup & Report
+    for (int i = 0; i < async_jobs; i++) {
+        ASYNC_WAIT_CTX_free(wait_ctxs[i]);
+        EVP_PKEY_CTX_free(args[i].ctx);
+        free(args[i].secret);
+        free(args[i].ciphertext);
+    }
+    EVP_PKEY_free(pkey);
+
+    double time_spent = (double)(end - start) / CLOCKS_PER_SEC;
+    printf("\n--- Results ---\n");
+    printf("Total Time: %.2f seconds\n", time_spent);
+    printf("Ops/Sec:    %.2f\n", total_iters / time_spent);
+
+    return 0;
+}
diff --git a/benchmarks/benchmark_tls.c b/benchmarks/benchmark_tls.c
new file mode 100644
index 0000000..5123b3b
--- /dev/null
+++ b/benchmarks/benchmark_tls.c
@@ -0,0 +1,141 @@
+/* benchmark_tls.c - Multi-Threaded TLS 1.3 Load Generator for ML-KEM */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <openssl/ssl.h>
+#include <openssl/bio.h>
+#include <openssl/err.h>
+#include <time.h>
+
+#define SERVER_PORT "4433"
+#define HOST_NAME "localhost"
+
+// GLOBAL STATS
+int g_handshakes_completed = 0;
+pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
+
+typedef struct {
+    int thread_id;
+    int iterations;
+} thread_args;
+
+void *worker_thread(void *arg) {
+    thread_args *args = (thread_args *)arg;
+    SSL_CTX *ctx;
+    SSL *ssl;
+    BIO *bio;
+    
+    // 1. Create Context
+    ctx = SSL_CTX_new(TLS_client_method());
+    if (!ctx) {
+        fprintf(stderr, "Thread %d: Failed to create SSL_CTX\n", args->thread_id);
+        return NULL;
+    }
+
+    // 2. FORCE ML-KEM-768
+    // This ensures the Client Hello specifically asks for your PQC algorithm
+    if (!SSL_CTX_set1_groups_list(ctx, "mlkem768")) {
+        fprintf(stderr, "Error: ML-KEM-768 not supported by this OpenSSL build.\n");
+        SSL_CTX_free(ctx);
+        return NULL;
+    }
+    
+    // Disable certificate verification (Speed hack for benchmarking)
+    SSL_CTX_set_verify(ctx, SSL_VERIFY_NONE, NULL);
+
+    for (int i = 0; i < args->iterations; i++) {
+        // 3. Create Connection BIO
+        bio = BIO_new_ssl_connect(ctx);
+        if (!bio) {
+             fprintf(stderr, "Thread %d: BIO creation failed\n", args->thread_id);
+             continue;
+        }
+
+        BIO_set_conn_hostname(bio, HOST_NAME ":" SERVER_PORT);
+
+        // --- THE FIX IS HERE ---
+        BIO_get_ssl(bio, &ssl); 
+        // -----------------------
+
+        if (!ssl) {
+             fprintf(stderr, "Thread %d: Could not get SSL pointer\n", args->thread_id);
+             BIO_free_all(bio);
+             continue;
+        }
+
+        SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
+
+        // 4. Perform Handshake
+        // We use BIO_do_connect first to establish TCP, then handshake
+        if (BIO_do_connect(bio) <= 0) {
+             // Connection failed (Server might be busy/full)
+             // Uncomment this line only if you need to debug connection errors:
+             // ERR_print_errors_fp(stderr);
+             BIO_free_all(bio);
+             continue;
+        }
+
+        if (SSL_do_handshake(ssl) <= 0) {
+            // Handshake failed
+            // ERR_print_errors_fp(stderr);
+        } else {
+            // Success!
+            pthread_mutex_lock(&g_lock);
+            g_handshakes_completed++;
+            pthread_mutex_unlock(&g_lock);
+        }
+
+        // 5. Cleanup
+        BIO_free_all(bio); // This frees the SSL object attached to it
+    }
+
+    SSL_CTX_free(ctx);
+    return NULL;
+}
+
+int main(int argc, char **argv) {
+    int num_threads = 64;   
+    int total_iters = 10000; 
+
+    if (argc > 1) num_threads = atoi(argv[1]);
+    if (argc > 2) total_iters = atoi(argv[2]);
+
+    int iters_per_thread = total_iters / num_threads;
+    
+    printf("Starting TLS Benchmark (ML-KEM-768)\n");
+    printf("Threads: %d | Total Requests: %d\n", num_threads, num_threads * iters_per_thread);
+
+    pthread_t *threads = malloc(sizeof(pthread_t) * num_threads);
+    thread_args *t_args = malloc(sizeof(thread_args) * num_threads);
+
+    struct timespec ts_start, ts_end;
+    clock_gettime(CLOCK_MONOTONIC, &ts_start);
+
+    // Launch Threads
+    for (int i = 0; i < num_threads; i++) {
+        t_args[i].thread_id = i;
+        t_args[i].iterations = iters_per_thread;
+        pthread_create(&threads[i], NULL, worker_thread, &t_args[i]);
+    }
+
+    // Wait for Threads
+    for (int i = 0; i < num_threads; i++) {
+        pthread_join(threads[i], NULL);
+    }
+
+    // End Timer
+    clock_gettime(CLOCK_MONOTONIC, &ts_end);
+    double time_spent = (ts_end.tv_sec - ts_start.tv_sec) + 
+                        (ts_end.tv_nsec - ts_start.tv_nsec) / 1e9;
+
+    printf("\n--- TLS Results ---\n");
+    printf("Handshakes: %d\n", g_handshakes_completed);
+    printf("Time:       %.2f sec\n", time_spent);
+    printf("Rate:       %.2f Handshakes/Sec\n", g_handshakes_completed / time_spent);
+
+    free(threads);
+    free(t_args);
+    return 0;
+}
diff --git a/include/cussl/pqc.h b/include/cussl/pqc.h
new file mode 100644
index 0000000..3f9f14b
--- /dev/null
+++ b/include/cussl/pqc.h
@@ -0,0 +1,25 @@
+#ifndef CUPQC_SHIM_H
+#define CUPQC_SHIM_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// 1. Key Generation
+// Returns 1 on success, 0 on failure
+int cupqc_shim_keygen_768(uint8_t *pk, uint8_t *sk);
+
+// 2. Encapsulation
+int cupqc_shim_encaps_768(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+
+// 3. Decapsulation
+int cupqc_shim_decaps_768(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/pqc/mlkem_cupqc.cu b/src/pqc/mlkem_cupqc.cu
new file mode 100644
index 0000000..0a8160f
--- /dev/null
+++ b/src/pqc/mlkem_cupqc.cu
@@ -0,0 +1,108 @@
+#include "cupqc_shim.hpp"
+#include <cupqc/pk.hpp>
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+using namespace cupqc;
+
+// --- DESCRIPTORS ---
+using Keygen768 = decltype(ML_KEM_768{} + Function<function::Keygen>() + Block() + BlockDim<256>());
+using Encaps768 = decltype(ML_KEM_768{} + Function<function::Encaps>() + Block() + BlockDim<256>());
+
+// --- PERSISTENT MEMORY ---
+static uint8_t *g_d_pk = nullptr;
+static uint8_t *g_d_ct = nullptr;
+static uint8_t *g_d_ss = nullptr;
+static uint8_t *g_d_entropy = nullptr;
+static uint8_t *g_d_workspace = nullptr;
+
+static uint8_t *g_h_pk = nullptr;
+static uint8_t *g_h_ct = nullptr;
+static uint8_t *g_h_ss = nullptr;
+static uint8_t *g_h_entropy = nullptr;
+
+const int MAX_CAPACITY = 2048;
+
+// --- KERNEL ---
+__global__ void kernel_encaps_batch(
+    uint8_t* flat_ct, 
+    uint8_t* flat_ss, 
+    const uint8_t* flat_pk, 
+    uint8_t* flat_entropy, 
+    uint8_t* flat_workspace
+) {
+    int job_id = blockIdx.x; 
+
+    uint8_t* my_ct = flat_ct + (job_id * Encaps768::ciphertext_size);
+    uint8_t* my_ss = flat_ss + (job_id * Encaps768::shared_secret_size);
+    const uint8_t* my_pk = flat_pk + (job_id * Encaps768::public_key_size);
+    uint8_t* my_entropy = flat_entropy + (job_id * Encaps768::entropy_size);
+    uint8_t* my_workspace = flat_workspace + (job_id * Encaps768::workspace_size);
+
+    __shared__ uint8_t smem[Encaps768::shared_memory_size];
+    Encaps768().execute(my_ct, my_ss, my_pk, my_entropy, my_workspace, smem);
+}
+
+extern "C" {
+
+int cupqc_shim_keygen_768(uint8_t *pk, uint8_t *sk) { return 1; }
+
+void cupqc_encaps_mlkem768_batch(
+    int count, 
+    uint8_t **pk_ptrs, 
+    uint8_t **rnd_ptrs, 
+    uint8_t **ss_ptrs, 
+    uint8_t **ct_ptrs
+) {
+    // Safety Check: Invalid count
+    if (count <= 0 || count > MAX_CAPACITY) return;
+
+    // 1. ALLOCATION (First Run Only)
+    if (g_d_pk == nullptr) {
+        cudaMalloc(&g_d_pk, MAX_CAPACITY * Encaps768::public_key_size);
+        cudaMalloc(&g_d_ct, MAX_CAPACITY * Encaps768::ciphertext_size);
+        cudaMalloc(&g_d_ss, MAX_CAPACITY * Encaps768::shared_secret_size);
+        cudaMalloc(&g_d_entropy, MAX_CAPACITY * Encaps768::entropy_size);
+        cudaMalloc(&g_d_workspace, MAX_CAPACITY * Encaps768::workspace_size);
+
+        cudaHostAlloc(&g_h_pk, MAX_CAPACITY * Encaps768::public_key_size, cudaHostAllocDefault);
+        cudaHostAlloc(&g_h_ct, MAX_CAPACITY * Encaps768::ciphertext_size, cudaHostAllocDefault);
+        cudaHostAlloc(&g_h_ss, MAX_CAPACITY * Encaps768::shared_secret_size, cudaHostAllocDefault);
+        cudaHostAlloc(&g_h_entropy, MAX_CAPACITY * Encaps768::entropy_size, cudaHostAllocDefault);
+    }
+
+    // 2. GATHER
+    for (int i = 0; i < count; i++) {
+        // Safety Check: Input pointers
+        if (pk_ptrs[i] != nullptr && rnd_ptrs[i] != nullptr) {
+            memcpy(g_h_pk + (i * Encaps768::public_key_size), pk_ptrs[i], Encaps768::public_key_size);
+            memcpy(g_h_entropy + (i * Encaps768::entropy_size), rnd_ptrs[i], Encaps768::entropy_size);
+        }
+    }
+
+    // 3. LAUNCH
+    cudaStream_t stream;
+    cudaStreamCreate(&stream);
+
+    cudaMemcpyAsync(g_d_pk, g_h_pk, count * Encaps768::public_key_size, cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(g_d_entropy, g_h_entropy, count * Encaps768::entropy_size, cudaMemcpyHostToDevice, stream);
+
+    kernel_encaps_batch<<<count, 256, 0, stream>>>(g_d_ct, g_d_ss, g_d_pk, g_d_entropy, g_d_workspace);
+
+    cudaMemcpyAsync(g_h_ct, g_d_ct, count * Encaps768::ciphertext_size, cudaMemcpyDeviceToHost, stream);
+    cudaMemcpyAsync(g_h_ss, g_d_ss, count * Encaps768::shared_secret_size, cudaMemcpyDeviceToHost, stream);
+
+    cudaStreamSynchronize(stream);
+    cudaStreamDestroy(stream);
+
+    // 4. SCATTER
+    for (int i = 0; i < count; i++) {
+        // Safety Check: Output pointers
+        if (ct_ptrs[i] != nullptr && ss_ptrs[i] != nullptr) {
+            memcpy(ct_ptrs[i], g_h_ct + (i * Encaps768::ciphertext_size), Encaps768::ciphertext_size);
+            memcpy(ss_ptrs[i], g_h_ss + (i * Encaps768::shared_secret_size), Encaps768::shared_secret_size);
+        }
+    }
+}
+
+} // extern "C"

From c4e86a85f24c62bc7b46564f6325296e4dc20a70 Mon Sep 17 00:00:00 2001
From: MeghaKoranga <meghakoranga7@email.com>
Date: Sat, 14 Feb 2026 06:15:07 +0000
Subject: [PATCH 2/8] Add cuSSL runtime, CUDA backend, ML-KEM GPU integration,
 and OpenSSL patch

---
 .../patches/openssl-3.5.0-mlkem-cupqc.patch   | 215 ++++++++++++++++++
 src/cupqc_batch.h                             |  34 +++
 src/cupqc_runtime.c                           | 174 ++++++++++++++
 src/cupqc_shim.cu                             | 112 +++++++++
 4 files changed, 535 insertions(+)
 create mode 100644 openssl/patches/openssl-3.5.0-mlkem-cupqc.patch
 create mode 100644 src/cupqc_batch.h
 create mode 100644 src/cupqc_runtime.c
 create mode 100644 src/cupqc_shim.cu

diff --git a/openssl/patches/openssl-3.5.0-mlkem-cupqc.patch b/openssl/patches/openssl-3.5.0-mlkem-cupqc.patch
new file mode 100644
index 0000000..95307f6
--- /dev/null
+++ b/openssl/patches/openssl-3.5.0-mlkem-cupqc.patch
@@ -0,0 +1,215 @@
+--- upstream/openssl/crypto/ml_kem/ml_kem.c	2026-02-02 12:57:21.435417013 +0000
++++ openssl-openssl-3.5.0/crypto/ml_kem/ml_kem.c	2026-02-13 15:55:24.882586905 +0000
+@@ -13,7 +13,8 @@
+ #include "internal/common.h"
+ #include "internal/constant_time.h"
+ #include "internal/sha3.h"
+-
++#include <openssl/async.h>
++#include "cupqc_batch.h"
+ #if defined(OPENSSL_CONSTANT_TIME_VALIDATION)
+ #include <valgrind/memcheck.h>
+ #endif
+@@ -87,6 +88,22 @@
+ /*
+  * Structure of keys
+  */
++// External declaration for the GPU Library
++/* Wrapper functions to match the callback signature */
++
++/* Wrapper functions */
++static void wrapper_pause(void) { 
++    ASYNC_pause_job(); 
++}
++
++/* * 'ASYNC_wake' is internal/hidden and causes linker errors.
++ * The runtime will simply rely on the condition variable or blocking.
++ */
++static void* wrapper_get_job(void) { 
++    return (void*)ASYNC_get_current_job(); 
++}
++
++static int cupqc_callbacks_registered = 0;
+ typedef struct ossl_ml_kem_scalar_st {
+     /* On every function entry and exit, 0 <= c[i] < ML_KEM_PRIME. */
+     uint16_t c[ML_KEM_DEGREE];
+@@ -1798,11 +1815,14 @@
+     EVP_MD_CTX_free(mdctx);
+     return ret;
+ }
+-
+ /*
+  * Generate a new keypair, either from the saved seed (when non-null), or from
+  * the RNG.
+  */
++/* --- ADD THIS --- */
++/* * STANDARD OPENSSL 3.5 ML-KEM KEYGEN
++ * (CPU-Only, Safe, Correct)
++ */
+ int ossl_ml_kem_genkey(uint8_t *pubenc, size_t publen, ML_KEM_KEY *key)
+ {
+     uint8_t seed[ML_KEM_SEED_BYTES];
+@@ -1819,6 +1839,7 @@
+     if (pubenc != NULL && publen != vinfo->pubkey_bytes)
+         return 0;
+ 
++    /* 1. Generate or retrieve the random seed */
+     if (ossl_ml_kem_have_seed(key)) {
+         if (!ossl_ml_kem_encode_seed(seed, sizeof(seed), key))
+             return 0;
+@@ -1830,32 +1851,30 @@
+ 
+     if ((mdctx = EVP_MD_CTX_new()) == NULL)
+         return 0;
+-
+-    /*
++     /*
+      * Data derived from (d, z) defaults secret, and to avoid side-channel
+      * leaks should not influence control flow.
+      */
+     CONSTTIME_SECRET(seed, ML_KEM_SEED_BYTES);
+ 
+-    if (add_storage(OPENSSL_malloc(vinfo->prvalloc), 1, key))
++   if (add_storage(OPENSSL_malloc(vinfo->prvalloc), 1, key))
+         ret = genkey(seed, mdctx, pubenc, key);
++    
+     OPENSSL_cleanse(seed, sizeof(seed));
+-
+-    /* Declassify secret inputs and derived outputs before returning control */
+-    CONSTTIME_DECLASSIFY(seed, ML_KEM_SEED_BYTES);
++ /* Declassify secret inputs and derived outputs before returning control */   
++ CONSTTIME_DECLASSIFY(seed, ML_KEM_SEED_BYTES);
+ 
+     EVP_MD_CTX_free(mdctx);
++    
+     if (!ret) {
+         ossl_ml_kem_key_reset(key);
+         return 0;
+     }
+ 
+-    /* The public components are already declassified */
+     CONSTTIME_DECLASSIFY(key->s, vinfo->rank * sizeof(scalar));
+     CONSTTIME_DECLASSIFY(key->z, 2 * ML_KEM_RANDOM_BYTES);
+     return 1;
+ }
+-
+ /*
+  * FIPS 203, Section 6.2, Algorithm 17: ML-KEM.Encaps_internal
+  * This is the deterministic version with randomness supplied externally.
+@@ -1919,6 +1938,43 @@
+                            const ML_KEM_KEY *key)
+ {
+     uint8_t r[ML_KEM_RANDOM_BYTES];
++    uint8_t pub_encoded[1184];
++
++    if (key == NULL) return 0;
++
++    // 1. Generate Randomness (Standard OpenSSL)
++    if (RAND_bytes_ex(key->libctx, r, ML_KEM_RANDOM_BYTES,
++                      key->vinfo->secbits) < 1)
++        return 0;
++
++    const char *env = getenv("ENABLE_CUPQC");
++
++    if (env && strcmp(env, "1") == 0) {
++        
++        // Register callbacks ONCE (Lazy Init)
++        if (!cupqc_callbacks_registered) {
++            cupqc_set_callbacks(wrapper_pause,NULL, wrapper_get_job);
++            cupqc_callbacks_registered = 1;
++        }
++
++        // Serialize Key
++        ossl_ml_kem_encode_public_key(pub_encoded, 1184, key);
++
++        // Submit to Runtime (Runtime handles waiting/pausing)
++        if (cupqc_submit_encap_job(pub_encoded, r, ctext, shared_secret) == 1) {
++            return 1;
++        }
++    }
++
++    // Fallback
++    return ossl_ml_kem_encap_seed(ctext, clen, shared_secret, slen,
++                                  r, sizeof(r), key);
++}
++/* int ossl_ml_kem_encap_rand(uint8_t *ctext, size_t clen,
++                           uint8_t *shared_secret, size_t slen,
++                           const ML_KEM_KEY *key)
++{
++    uint8_t r[ML_KEM_RANDOM_BYTES];
+ 
+     if (key == NULL)
+         return 0;
+@@ -1930,7 +1986,7 @@
+     return ossl_ml_kem_encap_seed(ctext, clen, shared_secret, slen,
+                                   r, sizeof(r), key);
+ }
+-
++*/
+ int ossl_ml_kem_decap(uint8_t *shared_secret, size_t slen,
+                       const uint8_t *ctext, size_t clen,
+                       const ML_KEM_KEY *key)
+--- upstream/openssl/crypto/ml_kem/cupqc_batch.h	2026-02-02 12:57:21.437223877 +0000
++++ openssl-openssl-3.5.0/crypto/ml_kem/cupqc_batch.h	2026-02-14 04:57:49.888240721 +0000
+@@ -1,34 +1,34 @@
+ #ifndef CUPQC_BATCH_H
+ #define CUPQC_BATCH_H
+ 
+-#include <openssl/async.h>
++#include <stdint.h>
+ 
+-#define MLKEM_768_PK_BYTES 1184 // Standard size#include <pthread.h>
+-
+-// 1. Configurable Batch Size (This is what you will change to Benchmark 50 vs 100)
+-#define CUPQC_BATCH_SIZE 512
++#ifdef __cplusplus
++extern "C" {
++#endif
+ 
+-// 2. The "Ticket" structure for one TLS request
+-typedef struct {
+-    int id;
+-
+-    unsigned char pub_key_storage[1184];
+-    unsigned char *randomness_in;    
+-    unsigned char *shared_secret_out;// Pointer to output buffer
+-    unsigned char *ciphertext_out;   // Pointer to output buffer
+-    
+-    ASYNC_JOB *job;                  // The OpenSSL Job to wake up later
+-    ASYNC_WAIT_CTX *wait_ctx;        // Context to signal the wake-up
+-    int status;                      // 0 = Waiting, 1 = Completed
+-} cupqc_job_t;
+-
+-// 3. The "Waiting Room" (Queue)
+-typedef struct {
+-    cupqc_job_t jobs[CUPQC_BATCH_SIZE]; // Array of slots
+-    int count;                          // Current number of people waiting
+-    pthread_mutex_t lock;               // Thread safety lock
+-    pthread_cond_t cond;                // Signal to wake up the GPU Manager
+-    int shutdown;                       // Flag to stop the system
+-} cupqc_batch_queue_t;
++/* * API: Submit an encapsulation job.
++ * The runtime handles all batching, threading, and GPU offloading internally.
++ */
++int cupqc_submit_encap_job(
++    uint8_t *public_key,
++    uint8_t *randomness,
++    uint8_t *ciphertext_out,
++    uint8_t *shared_secret_out
++);
++
++/*
++ * API: Register OpenSSL async callbacks.
++ * Allows the runtime to pause/wake OpenSSL jobs during GPU wait times.
++ */
++void cupqc_set_callbacks(
++    void (*pause)(void),
++    void (*wake)(void *),
++    void *(*get_job)(void)
++);
+ 
++#ifdef __cplusplus
++}
+ #endif
++
++#endif /* CUPQC_BATCH_H */
diff --git a/src/cupqc_batch.h b/src/cupqc_batch.h
new file mode 100644
index 0000000..ae12778
--- /dev/null
+++ b/src/cupqc_batch.h
@@ -0,0 +1,34 @@
+#ifndef CUPQC_BATCH_H
+#define CUPQC_BATCH_H
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* * API: Submit an encapsulation job.
+ * The runtime handles all batching, threading, and GPU offloading internally.
+ */
+int cupqc_submit_encap_job(
+    uint8_t *public_key,
+    uint8_t *randomness,
+    uint8_t *ciphertext_out,
+    uint8_t *shared_secret_out
+);
+
+/*
+ * API: Register OpenSSL async callbacks.
+ * Allows the runtime to pause/wake OpenSSL jobs during GPU wait times.
+ */
+void cupqc_set_callbacks(
+    void (*pause)(void),
+    void (*wake)(void *),
+    void *(*get_job)(void)
+);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CUPQC_BATCH_H */
diff --git a/src/cupqc_runtime.c b/src/cupqc_runtime.c
new file mode 100644
index 0000000..238c69d
--- /dev/null
+++ b/src/cupqc_runtime.c
@@ -0,0 +1,174 @@
+/* cupqc_runtime.c - Internal Implementation */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdint.h> /* FIX #2: Explicit include for defensive correctness */
+
+/* Include the clean Public API */
+#include "cupqc_batch.h"
+
+/* --- INTERNAL STRUCTURES (Hidden from OpenSSL) --- */
+#define CUPQC_BATCH_SIZE 512
+
+typedef struct {
+    unsigned char pub_key_storage[1184];
+    unsigned char randomness_storage[32]; 
+    unsigned char *shared_secret_out;     
+    unsigned char *ciphertext_out;        
+    void *opaque_job_ptr; 
+    int status;
+} cupqc_job_t;
+
+typedef struct {
+    cupqc_job_t jobs[CUPQC_BATCH_SIZE];
+    int count;
+    pthread_mutex_t lock;
+    pthread_cond_t cond;
+    pthread_cond_t cond_done;
+    int shutdown;
+} cupqc_batch_queue_t;
+
+/* --- GLOBAL STATE --- */
+static cupqc_batch_queue_t global_queue;
+static pthread_t batch_thread;
+static int cupqc_initialized = 0;
+
+/* --- CALLBACKS --- */
+static void (*cb_pause_job)(void) = NULL;
+static void (*cb_wake_job)(void*) = NULL;
+static void* (*cb_get_curr_job)(void) = NULL;
+
+void cupqc_set_callbacks(void (*pause)(void), 
+                         void (*wake)(void*), 
+                         void* (*get_job)(void)) 
+{
+    cb_pause_job = pause;
+    cb_wake_job = wake;
+    cb_get_curr_job = get_job;
+}
+
+/* --- WORKER THREAD --- */
+void* cupqc_batch_worker(void *arg) {
+    while (1) {
+        pthread_mutex_lock(&global_queue.lock);
+
+        while (global_queue.count == 0 && !global_queue.shutdown) {
+            pthread_cond_wait(&global_queue.cond, &global_queue.lock);
+        }
+
+        if (global_queue.shutdown) {
+            pthread_mutex_unlock(&global_queue.lock);
+            break;
+        }
+
+        int batch_size = global_queue.count;
+
+        // 1. Prepare Data
+        unsigned char *pks[CUPQC_BATCH_SIZE];
+        unsigned char *rnds[CUPQC_BATCH_SIZE];
+        unsigned char *cts[CUPQC_BATCH_SIZE];
+        unsigned char *sss[CUPQC_BATCH_SIZE];
+
+        for(int i=0; i<batch_size; i++) {
+            pks[i]  = global_queue.jobs[i].pub_key_storage;
+            rnds[i] = global_queue.jobs[i].randomness_storage;
+            cts[i]  = global_queue.jobs[i].ciphertext_out;
+            sss[i]  = global_queue.jobs[i].shared_secret_out;
+        }
+
+        // 2. Call GPU Kernel (External Linkage)
+        extern void cupqc_encap_mlkem768_batch(int count, 
+                                                 unsigned char **pk, 
+                                                 unsigned char **rnd, 
+                                                 unsigned char **ct, 
+                                                 unsigned char **ss);
+        
+        // Note: Signature is (pk, rnd, ct, ss) to match Shim
+        cupqc_encap_mlkem768_batch(batch_size, pks, rnds, cts, sss);
+
+        // 3. Notify Completion
+        for(int i=0; i<batch_size; i++) {
+            global_queue.jobs[i].status = 1;
+            if (global_queue.jobs[i].opaque_job_ptr != NULL && cb_wake_job != NULL) {
+                cb_wake_job(global_queue.jobs[i].opaque_job_ptr);
+            }
+        }
+
+        pthread_cond_broadcast(&global_queue.cond_done);
+
+        global_queue.count = 0;
+        pthread_mutex_unlock(&global_queue.lock);
+    }
+    return NULL;
+}
+
+static void cupqc_lazy_init(void) {
+    if (!cupqc_initialized) {
+        pthread_mutex_init(&global_queue.lock, NULL);
+        pthread_cond_init(&global_queue.cond, NULL);
+        pthread_cond_init(&global_queue.cond_done, NULL);
+        global_queue.count = 0;
+        global_queue.shutdown = 0;
+        pthread_create(&batch_thread, NULL, cupqc_batch_worker, NULL);
+        cupqc_initialized = 1;
+    }
+}
+
+/* --- PUBLIC API IMPLEMENTATION --- */
+int cupqc_submit_encap_job(uint8_t *public_key, 
+                           uint8_t *randomness,
+                           uint8_t *ciphertext_out, 
+                           uint8_t *shared_secret_out) 
+{
+    cupqc_lazy_init();
+
+    pthread_mutex_lock(&global_queue.lock);
+
+    while (global_queue.count >= CUPQC_BATCH_SIZE) {
+        pthread_cond_wait(&global_queue.cond_done, &global_queue.lock);
+    }
+
+    int slot = global_queue.count;
+
+    // Secure Copy
+    memcpy(global_queue.jobs[slot].pub_key_storage, public_key, 1184);
+    memcpy(global_queue.jobs[slot].randomness_storage, randomness, 32);
+
+    global_queue.jobs[slot].ciphertext_out = ciphertext_out;
+    global_queue.jobs[slot].shared_secret_out = shared_secret_out;
+    global_queue.jobs[slot].status = 0;
+
+    if (cb_get_curr_job) {
+        global_queue.jobs[slot].opaque_job_ptr = cb_get_curr_job();
+    } else {
+        global_queue.jobs[slot].opaque_job_ptr = NULL;
+    }
+
+    global_queue.count++;
+    
+    if (global_queue.count >= 32) { 
+        pthread_cond_signal(&global_queue.cond);
+    }
+
+    pthread_mutex_unlock(&global_queue.lock);
+
+    // Wait Strategy
+    void *current_job = (cb_get_curr_job) ? cb_get_curr_job() : NULL;
+
+    if (current_job != NULL && cb_pause_job != NULL) {
+        cb_pause_job();
+    } else {
+        pthread_mutex_lock(&global_queue.lock);
+        while (global_queue.jobs[slot].status == 0) {
+            /* FIX #1: Removed redundant signal. Just wait. */
+            pthread_cond_wait(&global_queue.cond_done, &global_queue.lock);
+        }
+        pthread_mutex_unlock(&global_queue.lock);
+    }
+
+    return 1;
+}
diff --git a/src/cupqc_shim.cu b/src/cupqc_shim.cu
new file mode 100644
index 0000000..3fa5078
--- /dev/null
+++ b/src/cupqc_shim.cu
@@ -0,0 +1,112 @@
+/* cupqc_shim.cu - Fixed Signature Version */
+#include <cupqc/pk.hpp>
+#include <cupqc/cupqc.hpp>
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <string.h>
+
+using namespace cupqc;
+
+/* --- 1. DEFINE THE ALGORITHM --- */
+using Encaps768 = decltype(ML_KEM_768{} + Function<function::Encaps>() + Block() + BlockDim<256>());
+
+/* --- 2. GLOBAL GPU BUFFERS --- */
+static uint8_t *g_d_pk = nullptr;
+static uint8_t *g_d_ct = nullptr;
+static uint8_t *g_d_ss = nullptr;
+static uint8_t *g_d_entropy = nullptr;
+static uint8_t *g_d_workspace = nullptr;
+
+static uint8_t *g_h_pk = nullptr;
+static uint8_t *g_h_ct = nullptr;
+static uint8_t *g_h_ss = nullptr;
+static uint8_t *g_h_entropy = nullptr;
+
+const int MAX_CAPACITY = 2048;
+
+/* --- 3. THE KERNEL --- */
+__global__ void kernel_encaps_batch(
+    uint8_t* flat_ct, 
+    uint8_t* flat_ss, 
+    const uint8_t* flat_pk, 
+    uint8_t* flat_entropy, 
+    uint8_t* flat_workspace
+) {
+    int job_id = blockIdx.x; 
+
+    uint8_t* my_ct = flat_ct + (job_id * Encaps768::ciphertext_size);
+    uint8_t* my_ss = flat_ss + (job_id * Encaps768::shared_secret_size);
+    const uint8_t* my_pk = flat_pk + (job_id * Encaps768::public_key_size);
+    uint8_t* my_entropy = flat_entropy + (job_id * Encaps768::entropy_size);
+    uint8_t* my_workspace = flat_workspace + (job_id * Encaps768::workspace_size);
+
+    __shared__ uint8_t smem[Encaps768::shared_memory_size];
+    Encaps768().execute(my_ct, my_ss, my_pk, my_entropy, my_workspace, smem);
+}
+
+extern "C" {
+
+/* --- 4. HOST FUNCTION --- */
+/* FIX: Argument order restored to (pk, rnd, ct, ss) to match Runtime */
+void cupqc_encap_mlkem768_batch(
+    int count, 
+    unsigned char **pk_ptrs, 
+    unsigned char **rnd_ptrs, 
+    unsigned char **ct_ptrs,  /* Arg 3: Ciphertext (1088 bytes) */
+    unsigned char **ss_ptrs   /* Arg 4: Shared Secret (32 bytes) */
+) {
+    if (count <= 0 || count > MAX_CAPACITY) return;
+
+    // A. ALLOCATION
+    if (g_d_pk == nullptr) {
+        cudaMalloc(&g_d_pk, MAX_CAPACITY * Encaps768::public_key_size);
+        cudaMalloc(&g_d_ct, MAX_CAPACITY * Encaps768::ciphertext_size);
+        cudaMalloc(&g_d_ss, MAX_CAPACITY * Encaps768::shared_secret_size);
+        cudaMalloc(&g_d_entropy, MAX_CAPACITY * Encaps768::entropy_size);
+        cudaMalloc(&g_d_workspace, MAX_CAPACITY * Encaps768::workspace_size);
+
+        cudaHostAlloc(&g_h_pk, MAX_CAPACITY * Encaps768::public_key_size, cudaHostAllocDefault);
+        cudaHostAlloc(&g_h_ct, MAX_CAPACITY * Encaps768::ciphertext_size, cudaHostAllocDefault);
+        cudaHostAlloc(&g_h_ss, MAX_CAPACITY * Encaps768::shared_secret_size, cudaHostAllocDefault);
+        cudaHostAlloc(&g_h_entropy, MAX_CAPACITY * Encaps768::entropy_size, cudaHostAllocDefault);
+    }
+
+    // B. GATHER
+    for (int i = 0; i < count; i++) {
+        if (pk_ptrs[i] && rnd_ptrs[i]) {
+            memcpy(g_h_pk + (i * Encaps768::public_key_size), pk_ptrs[i], Encaps768::public_key_size);
+            memcpy(g_h_entropy + (i * Encaps768::entropy_size), rnd_ptrs[i], Encaps768::entropy_size);
+        }
+    }
+
+    // C. COPY & LAUNCH
+    cudaStream_t stream;
+    cudaStreamCreate(&stream);
+
+    cudaMemcpyAsync(g_d_pk, g_h_pk, count * Encaps768::public_key_size, cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(g_d_entropy, g_h_entropy, count * Encaps768::entropy_size, cudaMemcpyHostToDevice, stream);
+
+    kernel_encaps_batch<<<count, 256, 0, stream>>>(g_d_ct, g_d_ss, g_d_pk, g_d_entropy, g_d_workspace);
+
+    cudaMemcpyAsync(g_h_ct, g_d_ct, count * Encaps768::ciphertext_size, cudaMemcpyDeviceToHost, stream);
+    cudaMemcpyAsync(g_h_ss, g_d_ss, count * Encaps768::shared_secret_size, cudaMemcpyDeviceToHost, stream);
+
+    cudaStreamSynchronize(stream);
+    cudaStreamDestroy(stream);
+
+    // D. SCATTER (Now safe because ct_ptrs is actually the big buffer)
+    for (int i = 0; i < count; i++) {
+        /* Write Ciphertext (1088 bytes) to ct_ptrs */
+        if (ct_ptrs[i]) {
+            memcpy(ct_ptrs[i], g_h_ct + (i * Encaps768::ciphertext_size), Encaps768::ciphertext_size);
+        }
+        /* Write Shared Secret (32 bytes) to ss_ptrs */
+        if (ss_ptrs[i]) {
+            memcpy(ss_ptrs[i], g_h_ss + (i * Encaps768::shared_secret_size), Encaps768::shared_secret_size);
+        }
+    }
+}
+
+void cupqc_keygen_mlkem768(uint8_t *pk, uint8_t *sk) { return; }
+
+} // extern "C"

From 79a034e3eeb606df63531b2d5b4f2ed48145f351 Mon Sep 17 00:00:00 2001
From: MeghaKoranga <meghakoranga7@email.com>
Date: Sat, 14 Feb 2026 06:36:48 +0000
Subject: [PATCH 3/8]  Add ML-KEM GPU integration, and OpenSSL patch

---
 .../patches/openssl-3.5.0-mlkem-cupqc.patch   | 106 ++++++++----------
 1 file changed, 46 insertions(+), 60 deletions(-)

diff --git a/openssl/patches/openssl-3.5.0-mlkem-cupqc.patch b/openssl/patches/openssl-3.5.0-mlkem-cupqc.patch
index 95307f6..fd6617c 100644
--- a/openssl/patches/openssl-3.5.0-mlkem-cupqc.patch
+++ b/openssl/patches/openssl-3.5.0-mlkem-cupqc.patch
@@ -1,5 +1,5 @@
 --- upstream/openssl/crypto/ml_kem/ml_kem.c	2026-02-02 12:57:21.435417013 +0000
-+++ openssl-openssl-3.5.0/crypto/ml_kem/ml_kem.c	2026-02-13 15:55:24.882586905 +0000
++++ openssl-openssl-3.5.0/crypto/ml_kem/ml_kem.c	2026-02-14 06:31:46.217434301 +0000
 @@ -13,7 +13,8 @@
  #include "internal/common.h"
  #include "internal/constant_time.h"
@@ -10,7 +10,7 @@
  #if defined(OPENSSL_CONSTANT_TIME_VALIDATION)
  #include <valgrind/memcheck.h>
  #endif
-@@ -87,6 +88,22 @@
+@@ -87,6 +88,24 @@
  /*
   * Structure of keys
   */
@@ -18,22 +18,24 @@
 +/* Wrapper functions to match the callback signature */
 +
 +/* Wrapper functions */
-+static void wrapper_pause(void) { 
-+    ASYNC_pause_job(); 
-+}
++static void wrapper_pause(void)
++  {
++    ASYNC_pause_job();
++  }
 +
 +/* * 'ASYNC_wake' is internal/hidden and causes linker errors.
 + * The runtime will simply rely on the condition variable or blocking.
 + */
-+static void* wrapper_get_job(void) { 
-+    return (void*)ASYNC_get_current_job(); 
++static void* wrapper_get_job(void)
++ {
++    return (void*)ASYNC_get_current_job();
 +}
 +
 +static int cupqc_callbacks_registered = 0;
  typedef struct ossl_ml_kem_scalar_st {
      /* On every function entry and exit, 0 <= c[i] < ML_KEM_PRIME. */
      uint16_t c[ML_KEM_DEGREE];
-@@ -1798,11 +1815,14 @@
+@@ -1798,11 +1817,11 @@
      EVP_MD_CTX_free(mdctx);
      return ret;
  }
@@ -42,73 +44,55 @@
   * Generate a new keypair, either from the saved seed (when non-null), or from
   * the RNG.
   */
-+/* --- ADD THIS --- */
-+/* * STANDARD OPENSSL 3.5 ML-KEM KEYGEN
-+ * (CPU-Only, Safe, Correct)
-+ */
++
  int ossl_ml_kem_genkey(uint8_t *pubenc, size_t publen, ML_KEM_KEY *key)
  {
      uint8_t seed[ML_KEM_SEED_BYTES];
-@@ -1819,6 +1839,7 @@
-     if (pubenc != NULL && publen != vinfo->pubkey_bytes)
-         return 0;
- 
-+    /* 1. Generate or retrieve the random seed */
-     if (ossl_ml_kem_have_seed(key)) {
-         if (!ossl_ml_kem_encode_seed(seed, sizeof(seed), key))
-             return 0;
-@@ -1830,32 +1851,30 @@
+@@ -1830,7 +1849,6 @@
  
      if ((mdctx = EVP_MD_CTX_new()) == NULL)
          return 0;
 -
--    /*
-+     /*
+     /*
       * Data derived from (d, z) defaults secret, and to avoid side-channel
       * leaks should not influence control flow.
-      */
-     CONSTTIME_SECRET(seed, ML_KEM_SEED_BYTES);
+@@ -1839,17 +1857,16 @@
  
--    if (add_storage(OPENSSL_malloc(vinfo->prvalloc), 1, key))
-+   if (add_storage(OPENSSL_malloc(vinfo->prvalloc), 1, key))
+     if (add_storage(OPENSSL_malloc(vinfo->prvalloc), 1, key))
          ret = genkey(seed, mdctx, pubenc, key);
-+    
-     OPENSSL_cleanse(seed, sizeof(seed));
--
+-    OPENSSL_cleanse(seed, sizeof(seed));
+ 
 -    /* Declassify secret inputs and derived outputs before returning control */
 -    CONSTTIME_DECLASSIFY(seed, ML_KEM_SEED_BYTES);
-+ /* Declassify secret inputs and derived outputs before returning control */   
-+ CONSTTIME_DECLASSIFY(seed, ML_KEM_SEED_BYTES);
++    OPENSSL_cleanse(seed, sizeof(seed));
++   /* Declassify secret inputs and derived outputs before returning control */   
++   CONSTTIME_DECLASSIFY(seed, ML_KEM_SEED_BYTES);
  
-     EVP_MD_CTX_free(mdctx);
-+    
+-    EVP_MD_CTX_free(mdctx);
++    EVP_MD_CTX_free(mdctx); 
      if (!ret) {
          ossl_ml_kem_key_reset(key);
          return 0;
      }
- 
--    /* The public components are already declassified */
+-
+     /* The public components are already declassified */
      CONSTTIME_DECLASSIFY(key->s, vinfo->rank * sizeof(scalar));
      CONSTTIME_DECLASSIFY(key->z, 2 * ML_KEM_RANDOM_BYTES);
-     return 1;
- }
--
- /*
-  * FIPS 203, Section 6.2, Algorithm 17: ML-KEM.Encaps_internal
-  * This is the deterministic version with randomness supplied externally.
-@@ -1919,6 +1938,43 @@
+@@ -1919,18 +1936,54 @@
                             const ML_KEM_KEY *key)
  {
      uint8_t r[ML_KEM_RANDOM_BYTES];
 +    uint8_t pub_encoded[1184];
-+
+ 
+-    if (key == NULL)
+-        return 0;
 +    if (key == NULL) return 0;
-+
+ 
 +    // 1. Generate Randomness (Standard OpenSSL)
-+    if (RAND_bytes_ex(key->libctx, r, ML_KEM_RANDOM_BYTES,
-+                      key->vinfo->secbits) < 1)
-+        return 0;
-+
+     if (RAND_bytes_ex(key->libctx, r, ML_KEM_RANDOM_BYTES,
+                       key->vinfo->secbits) < 1)
+         return 0;
+ 
 +    const char *env = getenv("ENABLE_CUPQC");
 +
 +    if (env && strcmp(env, "1") == 0) {
@@ -129,23 +113,25 @@
 +    }
 +
 +    // Fallback
-+    return ossl_ml_kem_encap_seed(ctext, clen, shared_secret, slen,
-+                                  r, sizeof(r), key);
-+}
+     return ossl_ml_kem_encap_seed(ctext, clen, shared_secret, slen,
+                                   r, sizeof(r), key);
+ }
 +/* int ossl_ml_kem_encap_rand(uint8_t *ctext, size_t clen,
 +                           uint8_t *shared_secret, size_t slen,
 +                           const ML_KEM_KEY *key)
 +{
 +    uint8_t r[ML_KEM_RANDOM_BYTES];
  
-     if (key == NULL)
-         return 0;
-@@ -1930,7 +1986,7 @@
-     return ossl_ml_kem_encap_seed(ctext, clen, shared_secret, slen,
-                                   r, sizeof(r), key);
- }
--
-+*/
++    if (key == NULL)
++        return 0;
++
++    if (RAND_bytes_ex(key->libctx, r, ML_KEM_RANDOM_BYTES,
++                      key->vinfo->secbits) < 1)
++        return 0;
++
++    return ossl_ml_kem_encap_seed(ctext, clen, shared_secret, slen,
++                                  r, sizeof(r), key);
++}*/
  int ossl_ml_kem_decap(uint8_t *shared_secret, size_t slen,
                        const uint8_t *ctext, size_t clen,
                        const ML_KEM_KEY *key)

From d14292f7e7e1f44d01e1054c04b8f5b2b256bc88 Mon Sep 17 00:00:00 2001
From: Megha <94665705+Meghakoranga@users.noreply.github.com>
Date: Sat, 14 Feb 2026 12:34:58 +0530
Subject: [PATCH 4/8] Update README.md

---
 README.md | 228 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 226 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index b7b030d..4fad4fb 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,226 @@
-# cuSSL
-cuPQC provider for OpenSSL
+# cuSSL: GPU-Accelerated ML-KEM-768 Integration for OpenSSL 3.5
+
+**Hardware-Accelerated Post-Quantum Cryptography using NVIDIA cuPQC and CUDA**
+
+cuSSL is a high-performance runtime and backend that offloads **ML-KEM-768** Key Encapsulation operations from OpenSSL 3.5 to NVIDIA GPUs. It integrates directly into the OpenSSL cryptographic core and enables high-throughput TLS 1.3 Post-Quantum handshakes.
+
+cuSSL implements a **Split-Stack Architecture**, cleanly separating the OpenSSL cryptographic core (CPU/C) from the GPU execution backend (CUDA/C++), ensuring ABI stability, thread safety, and memory isolation.
+
+---
+
+##  Features
+
+* GPU-accelerated ML-KEM-768 encapsulation using NVIDIA cuPQC
+* Asynchronous batching runtime (up to 512 concurrent operations)
+* Direct integration into OpenSSL 3.5 cryptographic core
+* Thread-safe job queue and runtime scheduler
+* Secure memory isolation between OpenSSL and GPU runtime
+* Automatic CPU fallback when GPU offload is disabled
+* Clean patch-based integration (no OpenSSL source redistribution)
+
+---
+
+## Architecture
+
+cuSSL operates in three layers:
+
+### 1. OpenSSL Integration Layer (Client)
+
+**File:** `crypto/ml_kem/ml_kem.c` (patched)
+
+Responsibilities:
+
+* Intercepts ML-KEM encapsulation requests
+* Submits jobs via cuSSL runtime API
+* Uses OpenSSL Async Job framework (`ASYNC_pause_job`)
+* Maintains full compatibility with OpenSSL execution model
+
+---
+
+### 2. cuSSL Runtime Layer (Manager)
+
+**File:** `src/cupqc_runtime.c`
+
+Responsibilities:
+
+* Thread-safe batching queue
+* Job scheduling and worker thread management
+* Memory isolation between OpenSSL and CUDA
+* Async job coordination
+
+This layer acts as the bridge between OpenSSL and GPU backend.
+
+---
+
+### 3. CUDA Backend Layer (Worker)
+
+**File:** `src/cupqc_shim.cu`
+
+Responsibilities:
+
+* Executes batched ML-KEM-768 encapsulation
+* Launches cuPQC CUDA kernels
+* Manages persistent GPU memory buffers
+* Performs host/device memory transfers
+
+Uses: cupqc::ML_KEM_768 from NVIDIA cuPQC SDK.
+
+---
+
+##  Prerequisites
+
+Hardware:
+
+* NVIDIA GPU (Turing / Ampere / Ada or newer)
+* Compute Capability ≥ 7.5
+
+Software:
+
+* Linux (Ubuntu 20.04 / 22.04 recommended)
+* OpenSSL 3.5.0 source
+* NVIDIA CUDA Toolkit (12+)
+* NVIDIA cuPQC SDK
+* GCC 9+
+* NVCC compiler
+
+---
+
+##  Build Instructions
+
+### 1. Set Environment Variables
+
+```
+export CUPQC_HOME=/path/to/cupqc_sdk
+export OPENSSL_ROOT=/path/to/openssl-3.5.0
+```
+
+---
+
+### 2. Compile cuSSL Runtime and CUDA Backend
+
+Compile runtime:
+```
+gcc -c src/cupqc_runtime.c -o cupqc_runtime.o -fPIC
+-I${OPENSSL_ROOT}/include
+-I${OPENSSL_ROOT}/crypto/ml_kem
+```
+
+Compile CUDA backend:
+
+```
+nvcc -c src/cupqc_shim.cu -o cupqc_shim.o
+-rdc=true -dlto -std=c++17
+-I${CUPQC_HOME}/include
+-Xcompiler -fPIC
+```
+
+Device link:
+```
+nvcc -dlink cupqc_shim.o -o cupqc_shim_dlink.o
+-rdc=true -dlto
+-L${CUPQC_HOME}/lib -lcupqc-pk
+```
+
+Final shared library:
+```
+g++ -shared -o libcussl.so
+cupqc_runtime.o cupqc_shim.o cupqc_shim_dlink.o
+-L${CUPQC_HOME}/lib -lcupqc-pk
+-L/usr/local/cuda/lib64 -lcudart -lpthread
+```
+
+---
+
+### 3. Apply OpenSSL Patch
+
+From OpenSSL root:
+
+```
+patch -p1 < /path/to/cuSSL/openssl/patches/openssl-3.5.0-mlkem-cupqc.patch
+```
+
+Rebuild OpenSSL:
+
+```
+make -j$(nproc)
+```
+---
+
+##  Usage
+
+Enable GPU offload:```export ENABLE_CUPQC=1```
+
+Run OpenSSL TLS server:
+
+```
+openssl s_server -accept 4433 -cert cert.pem -key key.pem -tls1_3 -groups mlkem768
+```
+## Verify Offload
+Use ```nvitop``` or ```nvidia-smi``` to verify GPU utilization during handshakes
+
+--- 
+**Disable GPU offload**:```unset ENABLE_CUPQC```
+
+OpenSSL will fall back to CPU implementation automatically.
+
+---
+
+##  Performance Characteristics
+
+GPU acceleration improves throughput significantly when batching is enabled.
+
+Tradeoffs:
+
+Latency:
+* Higher for single requests (PCIe transfer overhead)
+
+Throughput:
+* Much higher under concurrent load
+* Optimized for multi-connection TLS servers
+
+Designed for:
+
+* TLS termination servers
+* PQC-enabled secure infrastructure
+* GPU-accelerated cryptographic workloads
+
+---
+
+##  Security and Compatibility
+
+cuSSL:
+
+* Preserves OpenSSL security model
+* Does not modify public OpenSSL APIs
+* Uses isolated runtime
+* Supports CPU fallback
+
+Patch-based integration ensures maintainability across OpenSSL versions.
+
+---
+
+##  Licensing
+
+This repository contains only integration code.
+
+It does NOT include:
+
+* OpenSSL source code
+* NVIDIA cuPQC SDK
+* CUDA Toolkit
+
+Users must obtain those separately under their respective licenses.
+
+---
+
+## Project Status
+
+The engine is fully functional and architecturally stable. It successfully performs hardware-offloaded ML-KEM-768 key encapsulation for standard OpenSSL TLS 1.3 connections.
+
+**Core achievements include:**
+<ul>
+<li>Correctness: Validated bit-exact key exchange and successful handshake completion.</li>
+<li>Stability: Zero crashes or memory leaks during sustained load testing.</li>
+<li>Architecture: Strict separation of OpenSSL API and GPU runtime for full library compliance.</li>
+<li>Performance: Asynchronous batching logic is implemented and operational, ready for multi-threaded deployment.</li>
+</ul>

From 91cb0ce2e2619b13e60101013e71785075967778 Mon Sep 17 00:00:00 2001
From: MeghaKoranga <meghakoranga7@email.com>
Date: Sun, 15 Feb 2026 13:25:12 +0000
Subject: [PATCH 5/8] Fix patch: properly add cupqc_batch.h as new file using
 /dev/null

---
 .../patches/openssl-3.5.0-mlkem-cupqc.patch   | 45 ++++---------------
 1 file changed, 9 insertions(+), 36 deletions(-)

diff --git a/openssl/patches/openssl-3.5.0-mlkem-cupqc.patch b/openssl/patches/openssl-3.5.0-mlkem-cupqc.patch
index fd6617c..036ee4e 100644
--- a/openssl/patches/openssl-3.5.0-mlkem-cupqc.patch
+++ b/openssl/patches/openssl-3.5.0-mlkem-cupqc.patch
@@ -135,45 +135,18 @@
  int ossl_ml_kem_decap(uint8_t *shared_secret, size_t slen,
                        const uint8_t *ctext, size_t clen,
                        const ML_KEM_KEY *key)
---- upstream/openssl/crypto/ml_kem/cupqc_batch.h	2026-02-02 12:57:21.437223877 +0000
+--- /dev/null	2026-02-15 13:18:51.577000071 +0000
 +++ openssl-openssl-3.5.0/crypto/ml_kem/cupqc_batch.h	2026-02-14 04:57:49.888240721 +0000
-@@ -1,34 +1,34 @@
- #ifndef CUPQC_BATCH_H
- #define CUPQC_BATCH_H
- 
--#include <openssl/async.h>
+@@ -0,0 +1,34 @@
++#ifndef CUPQC_BATCH_H
++#define CUPQC_BATCH_H
++
 +#include <stdint.h>
- 
--#define MLKEM_768_PK_BYTES 1184 // Standard size#include <pthread.h>
--
--// 1. Configurable Batch Size (This is what you will change to Benchmark 50 vs 100)
--#define CUPQC_BATCH_SIZE 512
++
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
- 
--// 2. The "Ticket" structure for one TLS request
--typedef struct {
--    int id;
--
--    unsigned char pub_key_storage[1184];
--    unsigned char *randomness_in;    
--    unsigned char *shared_secret_out;// Pointer to output buffer
--    unsigned char *ciphertext_out;   // Pointer to output buffer
--    
--    ASYNC_JOB *job;                  // The OpenSSL Job to wake up later
--    ASYNC_WAIT_CTX *wait_ctx;        // Context to signal the wake-up
--    int status;                      // 0 = Waiting, 1 = Completed
--} cupqc_job_t;
--
--// 3. The "Waiting Room" (Queue)
--typedef struct {
--    cupqc_job_t jobs[CUPQC_BATCH_SIZE]; // Array of slots
--    int count;                          // Current number of people waiting
--    pthread_mutex_t lock;               // Thread safety lock
--    pthread_cond_t cond;                // Signal to wake up the GPU Manager
--    int shutdown;                       // Flag to stop the system
--} cupqc_batch_queue_t;
++
 +/* * API: Submit an encapsulation job.
 + * The runtime handles all batching, threading, and GPU offloading internally.
 + */
@@ -193,9 +166,9 @@
 +    void (*wake)(void *),
 +    void *(*get_job)(void)
 +);
- 
++
 +#ifdef __cplusplus
 +}
- #endif
++#endif
 +
 +#endif /* CUPQC_BATCH_H */

From 5b8949663b4f5d4040f1b01f48fcee663f4c35d5 Mon Sep 17 00:00:00 2001
From: Megha <94665705+Meghakoranga@users.noreply.github.com>
Date: Sun, 1 Mar 2026 11:23:17 +0530
Subject: [PATCH 6/8] Update README.md

---
 README.md | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 4fad4fb..f0dbc19 100644
--- a/README.md
+++ b/README.md
@@ -165,26 +165,19 @@ OpenSSL will fall back to CPU implementation automatically.
 
 ---
 
-##  Performance Characteristics
+## Performance & Scaling
 
-GPU acceleration improves throughput significantly when batching is enabled.
+This engine offloads the heavy post-quantum math to the GPU. However, overall throughput depends heavily on the web server's architecture.
 
-Tradeoffs:
+### Current Benchmark (Standard Nginx)
+* **Rate:** ~500 Handshakes/Second
+* **Architecture Limit:** Standard Nginx uses a multi-processing model (e.g., 32 isolated worker processes). Because memory is not shared between these workers, the engine's internal batch queue cannot easily aggregate hundreds of connections at once. To prevent deadlocks, the GPU wake threshold is set to `1`, meaning the GPU processes very small batches, causing high CPU overhead from frequent kernel launches.
 
-Latency:
-* Higher for single requests (PCIe transfer overhead)
+### How to Scale to 2,000+ HS/s
+To fully saturate the GPU and achieve maximum throughput, the engine needs to fill its 512-slot batch queue. This can be achieved through two potential upgrades:
 
-Throughput:
-* Much higher under concurrent load
-* Optimized for multi-connection TLS servers
-
-Designed for:
-
-* TLS termination servers
-* PQC-enabled secure infrastructure
-* GPU-accelerated cryptographic workloads
-
----
+1. **Async-Enabled Server:** Use a web server that supports OpenSSL's asynchronous features (like Intel's Async Nginx). This allows a *single* worker process to handle thousands of concurrent connections, naturally filling a single, massive GPU queue without blocking.
+2. **Background Flush Timer:** Implement a POSIX timer thread inside `cupqc_runtime.c` that forces a queue flush every few milliseconds, ensuring that "leftover" connections do not deadlock when using larger batch thresholds across multiple Nginx workers.
 
 ##  Security and Compatibility
 

From 8e73927a8c1068588d37b484a85100ec1b2ff54b Mon Sep 17 00:00:00 2001
From: MeghaKoranga <meghakoranga7@email.com>
Date: Sun, 1 Mar 2026 06:06:33 +0000
Subject: [PATCH 7/8] Improve runtime batching logic for nginx TLS workload

---
 src/cupqc_runtime.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cupqc_runtime.c b/src/cupqc_runtime.c
index 238c69d..f709c2f 100644
--- a/src/cupqc_runtime.c
+++ b/src/cupqc_runtime.c
@@ -150,7 +150,7 @@ int cupqc_submit_encap_job(uint8_t *public_key,
 
     global_queue.count++;
     
-    if (global_queue.count >= 32) { 
+    if (global_queue.count >= 1) { 
         pthread_cond_signal(&global_queue.cond);
     }
 

From ce9664f44b4b2d2a8ad8125256fa32671fb7c66a Mon Sep 17 00:00:00 2001
From: Megha <94665705+Meghakoranga@users.noreply.github.com>
Date: Tue, 31 Mar 2026 11:25:17 +0530
Subject: [PATCH 8/8] Update README.md

Co-authored-by: AddyTiv <adityakoranga2004@gmail.com>
---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index f0dbc19..5b1d7f5 100644
--- a/README.md
+++ b/README.md
@@ -213,7 +213,6 @@ The engine is fully functional and architecturally stable. It successfully perfo
 **Core achievements include:**
 <ul>
 <li>Correctness: Validated bit-exact key exchange and successful handshake completion.</li>
-<li>Stability: Zero crashes or memory leaks during sustained load testing.</li>
 <li>Architecture: Strict separation of OpenSSL API and GPU runtime for full library compliance.</li>
 <li>Performance: Asynchronous batching logic is implemented and operational, ready for multi-threaded deployment.</li>
 </ul>