Merge pull request #2 from chamikasudusinghe/patch-1

virnarula · web-flow · commit 64fa52e0c8e8 · 2024-04-16T10:01:17.000-05:00
sample_test_02 (initial pull rq)
diff --git a/llvm/test/Transforms/NVPTXMemOpts/driver_test_02.cpp b/llvm/test/Transforms/NVPTXMemOpts/driver_test_02.cpp
@@ -0,0 +1,102 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>   // for fabs
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+// Driver-API harness for the vectorScalarMultiply kernel: loads
+// test_02_scalar.fatbin, multiplies 50000 floats by a scalar on the device
+// and compares every element with the host-computed product.
+
+// Wraps a CUDA driver API call; on failure reports the call site and exits.
+#define CUDA_CHECK(result) { gpuAssert((result), __FILE__, __LINE__); }
+
+// Prints a diagnostic for a failed driver API call.
+//   code  - CUresult returned by the call (driver calls return CUresult,
+//           not the runtime API's cudaError_t)
+//   file  - source file of the call site
+//   line  - source line of the call site
+//   abort - when true (the default) the process exits with `code`
+inline void gpuAssert(CUresult code, const char *file, int line, bool abort=true) {
+   if (code != CUDA_SUCCESS) {
+      const char *errorString = NULL;
+      // cuGetErrorName leaves the string NULL for an unrecognized code
+      if (cuGetErrorName(code, &errorString) != CUDA_SUCCESS || errorString == NULL) {
+         errorString = "unknown CUDA driver error";
+      }
+      fprintf(stderr,"GPUassert: %s %s %d\n", errorString, file, line);
+      if (abort) exit(code);
+   }
+}
+
+int main(void) {
+    int numElements = 50000;
+    size_t size = numElements * sizeof(float);
+    float scalar = 3.0f;
+    float *h_input = (float *)malloc(size);
+    float *h_output = (float *)malloc(size);
+    if (h_input == NULL || h_output == NULL) {
+        fprintf(stderr, "Failed to allocate host vectors!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Initialize the input data
+    for (int i = 0; i < numElements; ++i) {
+        h_input[i] = i;
+    }
+
+    // Initialize CUDA; each step is checked so a missing device, fatbin or
+    // kernel symbol is reported at the call that failed
+    CUdevice cuDevice;
+    CUcontext cuContext;
+    CUmodule cuModule;
+    CUfunction vectorScalarMultiply;
+    CUDA_CHECK(cuInit(0));
+    CUDA_CHECK(cuDeviceGet(&cuDevice, 0));
+    CUDA_CHECK(cuCtxCreate(&cuContext, 0, cuDevice));
+    CUDA_CHECK(cuModuleLoad(&cuModule, "test_02_scalar.fatbin"));
+    CUDA_CHECK(cuModuleGetFunction(&vectorScalarMultiply, cuModule, "_Z20vectorScalarMultiplyPKfPffi"));
+
+    // Allocate vectors in device memory
+    CUdeviceptr d_input, d_output;
+    CUDA_CHECK(cuMemAlloc(&d_input, size));
+    CUDA_CHECK(cuMemAlloc(&d_output, size));
+
+    // Copy vector from host memory to device memory
+    CUDA_CHECK(cuMemcpyHtoD(d_input, (void *)h_input, size));
+
+    // Prepare kernel launch: round the grid up so every element is covered
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    // Kernel parameters are passed as pointers to each argument, in order
+    void *args[] = { &d_input, &d_output, &scalar, &numElements };
+
+    // Invoke kernel
+    CUDA_CHECK(cuLaunchKernel(vectorScalarMultiply, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, 0, args, 0));
+
+    // Copy result from device memory to host memory
+    CUDA_CHECK(cuMemcpyDtoH((void *)h_output, d_output, size));
+
+    // Verify result
+    for (int i = 0; i < numElements; ++i) {
+        if (fabs(h_input[i] * scalar - h_output[i]) > 1e-5) {
+            fprintf(stderr, "Result verification failed at element %d!\n", i);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    printf("Test PASSED\n");
+
+    // Free device memory
+    CUDA_CHECK(cuMemFree(d_input));
+    CUDA_CHECK(cuMemFree(d_output));
+
+    // Free host memory
+    free(h_input);
+    free(h_output);
+
+    // Cleanup CUDA
+    CUDA_CHECK(cuCtxDestroy(cuContext));
+
+    return 0;
+}
diff --git a/llvm/test/Transforms/NVPTXMemOpts/driver_test_03.cpp b/llvm/test/Transforms/NVPTXMemOpts/driver_test_03.cpp
@@ -0,0 +1,105 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>   // for fabs
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+// Driver-API harness for the vectorMultiply kernel: loads
+// test_03_multiply.fatbin, multiplies two 50000-element float vectors on the
+// device and compares every element with the host-computed product.
+
+// Wraps a CUDA driver API call; on failure reports the call site and exits.
+#define CUDA_CHECK(result) { gpuAssert((result), __FILE__, __LINE__); }
+
+// Prints a diagnostic for a failed driver API call.
+//   code  - CUresult returned by the call
+//   file  - source file of the call site
+//   line  - source line of the call site
+//   abort - when true (the default) the process exits with `code`
+inline void gpuAssert(CUresult code, const char *file, int line, bool abort=true) {
+   if (code != CUDA_SUCCESS) {
+      const char *errorString = NULL;
+      // cuGetErrorName leaves the string NULL for an unrecognized code
+      if (cuGetErrorName(code, &errorString) != CUDA_SUCCESS || errorString == NULL) {
+         errorString = "unknown CUDA driver error";
+      }
+      fprintf(stderr,"GPUassert: %s %s %d\n", errorString, file, line);
+      if (abort) exit(code);
+   }
+}
+
+int main(void) {
+    int numElements = 50000;
+    size_t size = numElements * sizeof(float);
+    float *h_input1 = (float *)malloc(size);
+    float *h_input2 = (float *)malloc(size);
+    float *h_output = (float *)malloc(size);
+    if (h_input1 == NULL || h_input2 == NULL || h_output == NULL) {
+        fprintf(stderr, "Failed to allocate host vectors!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Initialize the input data; without this the buffers hold indeterminate
+    // values and the comparison below verifies nothing
+    for (int i = 0; i < numElements; ++i) {
+        h_input1[i] = i;
+        h_input2[i] = 2 * i;  // Different values for the second vector
+    }
+
+    // Initialize CUDA
+    CUdevice cuDevice;
+    CUcontext cuContext;
+    CUmodule cuModule;
+    CUfunction vectorMultiply;
+    CUDA_CHECK(cuInit(0));
+    CUDA_CHECK(cuDeviceGet(&cuDevice, 0));
+    CUDA_CHECK(cuCtxCreate(&cuContext, 0, cuDevice));
+    CUDA_CHECK(cuModuleLoad(&cuModule, "test_03_multiply.fatbin"));
+    CUDA_CHECK(cuModuleGetFunction(&vectorMultiply, cuModule, "_Z14vectorMultiplyPKfS0_Pfi"));
+
+    // Allocate vectors in device memory
+    CUdeviceptr d_input1, d_input2, d_output;
+    CUDA_CHECK(cuMemAlloc(&d_input1, size));
+    CUDA_CHECK(cuMemAlloc(&d_input2, size));
+    CUDA_CHECK(cuMemAlloc(&d_output, size));
+
+    // Copy vectors from host memory to device memory
+    CUDA_CHECK(cuMemcpyHtoD(d_input1, (void *)h_input1, size));
+    CUDA_CHECK(cuMemcpyHtoD(d_input2, (void *)h_input2, size));
+
+    // Invoke kernel; the grid is rounded up so every element is covered
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    void *args[] = { &d_input1, &d_input2, &d_output, &numElements };
+    CUDA_CHECK(cuLaunchKernel(vectorMultiply, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, 0, args, 0));
+
+    // Copy result from device memory to host memory
+    CUDA_CHECK(cuMemcpyDtoH((void *)h_output, d_output, size));
+
+    // Verify result. The products reach about 5e9, where neighbouring floats
+    // are hundreds apart, so the tolerance scales with the expected value
+    for (int i = 0; i < numElements; ++i) {
+        float expected = h_input1[i] * h_input2[i];
+        if (fabs(expected - h_output[i]) > 1e-5 * fabs(expected) + 1e-5) {
+            fprintf(stderr, "Result verification failed at element %d!\n", i);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    printf("Test PASSED\n");
+
+    // Free device memory
+    CUDA_CHECK(cuMemFree(d_input1));
+    CUDA_CHECK(cuMemFree(d_input2));
+    CUDA_CHECK(cuMemFree(d_output));
+
+    // Free host memory
+    free(h_input1);
+    free(h_input2);
+    free(h_output);
+
+    // Cleanup CUDA
+    CUDA_CHECK(cuCtxDestroy(cuContext));
+
+    return 0;
+}
diff --git a/llvm/test/Transforms/NVPTXMemOpts/driver_test_04.cpp b/llvm/test/Transforms/NVPTXMemOpts/driver_test_04.cpp
@@ -0,0 +1,113 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+// Driver-API harness for the transposeMatrix kernel: loads
+// test_04_transpose.fatbin, transposes a 1024x1024 float matrix on the device
+// and compares the result element by element with the input.
+
+// Wraps a CUDA driver API call; on failure reports the call site and exits.
+#define CUDA_CHECK(result) { gpuAssert((result), __FILE__, __LINE__); }
+
+// Prints a diagnostic for a failed driver API call.
+//   code  - CUresult returned by the call (driver calls return CUresult,
+//           not the runtime API's cudaError_t)
+//   file  - source file of the call site
+//   line  - source line of the call site
+//   abort - when true (the default) the process exits with `code`
+inline void gpuAssert(CUresult code, const char *file, int line, bool abort=true) {
+   if (code != CUDA_SUCCESS) {
+      const char *errorString = NULL;
+      // cuGetErrorName leaves the string NULL for an unrecognized code
+      if (cuGetErrorName(code, &errorString) != CUDA_SUCCESS || errorString == NULL) {
+         errorString = "unknown CUDA driver error";
+      }
+      fprintf(stderr,"GPUassert: %s %s %d\n", errorString, file, line);
+      if (abort) exit(code);
+   }
+}
+
+int main(void) {
+    int width = 1024;
+    int height = 1024;
+    // Widen before multiplying so the byte count is computed in size_t
+    size_t size = (size_t)width * height * sizeof(float);
+    float *h_input = (float *)malloc(size);
+    float *h_output = (float *)malloc(size);
+    if (h_input == NULL || h_output == NULL) {
+        fprintf(stderr, "Failed to allocate host matrices!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Initialize the input matrix
+    for (int i = 0; i < height; ++i) {
+        for (int j = 0; j < width; ++j) {
+            h_input[i * width + j] = i * width + j;
+        }
+    }
+
+    // Initialize CUDA; each step is checked so a missing device, fatbin or
+    // kernel symbol is reported at the call that failed
+    CUdevice cuDevice;
+    CUcontext cuContext;
+    CUmodule cuModule;
+    CUfunction transposeMatrix;
+    CUDA_CHECK(cuInit(0));
+    CUDA_CHECK(cuDeviceGet(&cuDevice, 0));
+    CUDA_CHECK(cuCtxCreate(&cuContext, 0, cuDevice));
+    CUDA_CHECK(cuModuleLoad(&cuModule, "test_04_transpose.fatbin"));
+    CUDA_CHECK(cuModuleGetFunction(&transposeMatrix, cuModule, "_Z15transposeMatrixPKfPfii"));
+
+    // Allocate matrices in device memory
+    CUdeviceptr d_input, d_output;
+    CUDA_CHECK(cuMemAlloc(&d_input, size));
+    CUDA_CHECK(cuMemAlloc(&d_output, size));
+
+    // Copy matrix from host memory to device memory
+    CUDA_CHECK(cuMemcpyHtoD(d_input, (void *)h_input, size));
+
+    // Set up the execution configuration: 16x16 threads per block and enough
+    // blocks in each dimension to cover the whole matrix
+    dim3 threadsPerBlock(16, 16);
+    dim3 blocksPerGrid((width + threadsPerBlock.x - 1) / threadsPerBlock.x,
+                       (height + threadsPerBlock.y - 1) / threadsPerBlock.y);
+
+    // Invoke kernel
+    void *args[] = { &d_input, &d_output, &width, &height };
+    CUDA_CHECK(cuLaunchKernel(transposeMatrix, blocksPerGrid.x, blocksPerGrid.y, 1,
+                              threadsPerBlock.x, threadsPerBlock.y, 1, 0, 0, args, 0));
+
+    // Copy result from device memory to host memory
+    CUDA_CHECK(cuMemcpyDtoH((void *)h_output, d_output, size));
+
+    // Verify result: input element (i, j) must appear at output (j, i)
+    bool success = true;
+    for (int i = 0; i < height && success; ++i) {
+        for (int j = 0; j < width; ++j) {
+            if (h_output[j * height + i] != h_input[i * width + j]) {
+                fprintf(stderr, "Result verification failed at element (%d, %d)!\n", i, j);
+                success = false;
+                break;
+            }
+        }
+    }
+
+    if (success) {
+        printf("Test PASSED\n");
+    } else {
+        printf("Test FAILED\n");
+    }
+
+    // Free device and host memory
+    CUDA_CHECK(cuMemFree(d_input));
+    CUDA_CHECK(cuMemFree(d_output));
+    free(h_input);
+    free(h_output);
+
+    // Cleanup CUDA
+    CUDA_CHECK(cuCtxDestroy(cuContext));
+
+    // Report a verification failure to the caller through the exit status
+    return success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/llvm/test/Transforms/NVPTXMemOpts/test_02_scalar.cu b/llvm/test/Transforms/NVPTXMemOpts/test_02_scalar.cu
@@ -0,0 +1,82 @@
+#include <stdio.h>
+#include <stdlib.h>  // for malloc, free, exit
+#include <math.h>    // for fabs
+
+// Wraps a CUDA runtime API call; on failure reports the call site and exits.
+#define CUDA_CHECK(result) { gpuAssert((result), __FILE__, __LINE__); }
+
+// Prints a diagnostic for a failed runtime API call.
+//   code  - cudaError_t returned by the call
+//   file  - source file of the call site
+//   line  - source line of the call site
+//   abort - when true (the default) the process exits with `code`
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
+   if (code != cudaSuccess) {
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
+}
+
+// CUDA kernel for vector scalar multiplication: output[i] = input[i] * scalar.
+// Expects a 1D launch with at least numElements threads in total; threads
+// whose index falls past the end do nothing.
+__global__ void vectorScalarMultiply(const float *input, float *output, float scalar, int numElements) {
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements) {
+        output[i] = input[i] * scalar;
+    }
+}
+
+int main(void) {
+    int numElements = 50000;
+    size_t size = numElements * sizeof(float);
+    float *h_input = (float *)malloc(size);
+    float *h_output = (float *)malloc(size);
+    if (h_input == NULL || h_output == NULL) {
+        fprintf(stderr, "Failed to allocate host vectors!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Initialize the input data
+    for (int i = 0; i < numElements; ++i) {
+        h_input[i] = i;
+    }
+
+    // Allocate vectors in device memory
+    float *d_input;
+    CUDA_CHECK(cudaMalloc(&d_input, size));
+    float *d_output;
+    CUDA_CHECK(cudaMalloc(&d_output, size));
+
+    // Copy vector from host memory to device memory
+    CUDA_CHECK(cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice));
+
+    // Invoke kernel; the grid is rounded up so every element is covered
+    float scalar = 3.0f;
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    vectorScalarMultiply<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, scalar, numElements);
+    // A launch returns no status itself; fetch it explicitly
+    CUDA_CHECK(cudaGetLastError());
+
+    // Copy result from device memory to host memory
+    CUDA_CHECK(cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost));
+
+    // Verify result
+    for (int i = 0; i < numElements; ++i) {
+        if (fabs(h_input[i] * scalar - h_output[i]) > 1e-5) {
+            fprintf(stderr, "Result verification failed at element %d!\n", i);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    printf("Test PASSED\n");
+
+    // Free device and host memory
+    CUDA_CHECK(cudaFree(d_input));
+    CUDA_CHECK(cudaFree(d_output));
+    free(h_input);
+    free(h_output);
+
+    return 0;
+}
diff --git a/llvm/test/Transforms/NVPTXMemOpts/test_03_multiply.cu b/llvm/test/Transforms/NVPTXMemOpts/test_03_multiply.cu
@@ -0,0 +1,88 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+// Wraps a CUDA runtime API call; on failure reports the call site and exits.
+#define CUDA_CHECK(result) { gpuAssert((result), __FILE__, __LINE__); }
+
+// Prints a diagnostic for a failed runtime API call.
+//   code  - cudaError_t returned by the call
+//   file  - source file of the call site
+//   line  - source line of the call site
+//   abort - when true (the default) the process exits with `code`
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
+   if (code != cudaSuccess) {
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
+}
+
+// CUDA kernel for vector multiplication: output[i] = input1[i] * input2[i].
+// Expects a 1D launch with at least numElements threads in total; threads
+// whose index falls past the end do nothing.
+__global__ void vectorMultiply(const float *input1, const float *input2, float *output, int numElements) {
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements) {
+        output[i] = input1[i] * input2[i];
+    }
+}
+
+int main(void) {
+    int numElements = 50000;
+    size_t size = numElements * sizeof(float);
+    float *h_input1 = (float *)malloc(size);
+    float *h_input2 = (float *)malloc(size);
+    float *h_output = (float *)malloc(size);
+    if (h_input1 == NULL || h_input2 == NULL || h_output == NULL) {
+        fprintf(stderr, "Failed to allocate host vectors!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Initialize the input data
+    for (int i = 0; i < numElements; ++i) {
+        h_input1[i] = i;
+        h_input2[i] = 2 * i;  // Different values for the second vector
+    }
+
+    // Allocate vectors in device memory
+    float *d_input1, *d_input2, *d_output;
+    CUDA_CHECK(cudaMalloc(&d_input1, size));
+    CUDA_CHECK(cudaMalloc(&d_input2, size));
+    CUDA_CHECK(cudaMalloc(&d_output, size));
+
+    // Copy vectors from host memory to device memory
+    CUDA_CHECK(cudaMemcpy(d_input1, h_input1, size, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_input2, h_input2, size, cudaMemcpyHostToDevice));
+
+    // Invoke kernel; the grid is rounded up so every element is covered
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    vectorMultiply<<<blocksPerGrid, threadsPerBlock>>>(d_input1, d_input2, d_output, numElements);
+    // A launch returns no status itself; fetch it explicitly
+    CUDA_CHECK(cudaGetLastError());
+
+    // Copy result from device memory to host memory
+    CUDA_CHECK(cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost));
+
+    // Verify result. The products reach about 5e9, where neighbouring floats
+    // are hundreds apart, so the tolerance scales with the expected value
+    for (int i = 0; i < numElements; ++i) {
+        float expected = h_input1[i] * h_input2[i];
+        if (fabs(expected - h_output[i]) > 1e-5 * fabs(expected) + 1e-5) {
+            fprintf(stderr, "Result verification failed at element %d!\n", i);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    printf("Test PASSED\n");
+
+    // Free device and host memory
+    CUDA_CHECK(cudaFree(d_input1));
+    CUDA_CHECK(cudaFree(d_input2));
+    CUDA_CHECK(cudaFree(d_output));
+    free(h_input1);
+    free(h_input2);
+    free(h_output);
+
+    return 0;
+}
diff --git a/llvm/test/Transforms/NVPTXMemOpts/test_04_transpose.cu b/llvm/test/Transforms/NVPTXMemOpts/test_04_transpose.cu