|
| 1 | +#include <stdio.h> |
| 2 | +#include <stdlib.h> |
| 3 | +#include <cuda.h> |
| 4 | +#include <cuda_runtime.h> |
| 5 | + |
/*
 * CUDA_CHECK(result)
 *   Passes a CUDA *runtime* API status (cudaError_t) to gpuAssert together
 *   with the call site's file and line.
 *
 *   The body is wrapped in do { ... } while (0) so the macro expands to a
 *   single statement: `if (c) CUDA_CHECK(x); else ...` would not compile with
 *   a bare `{ ... }` expansion because of the trailing semicolon.
 */
#define CUDA_CHECK(result) do { gpuAssert((result), __FILE__, __LINE__); } while (0)

/*
 * Reports a failed CUDA runtime call.
 *
 *   code  - status returned by a CUDA runtime API function.
 *   file  - source file of the call site (normally __FILE__).
 *   line  - source line of the call site (normally __LINE__).
 *   abort - when true (the default) the process exits with `code` as its
 *           exit status after the message is printed.
 *
 * Does nothing when `code` is cudaSuccess. Otherwise writes
 * "GPUassert: <error string> <file> <line>" to stderr.
 *
 * NOTE: this takes cudaError_t; driver API functions (cu*) return CUresult
 * and need their own checker.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code == cudaSuccess) {
        return;
    }
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) {
        exit(code);
    }
}
| 13 | + |
/*
 * Reports a failed CUDA driver API call and terminates the process.
 *
 *   code - status returned by a cu* driver API function.
 *   expr - text of the call that produced `code` (for the diagnostic).
 *   file - source file of the call site.
 *   line - source line of the call site.
 *
 * Returns normally only when `code` is CUDA_SUCCESS.
 */
static void cuAssert(CUresult code, const char *expr, const char *file, int line) {
    if (code == CUDA_SUCCESS) {
        return;
    }
    const char *msg = NULL;
    /* cuGetErrorString can itself fail for an unrecognised code. */
    if (cuGetErrorString(code, &msg) != CUDA_SUCCESS || msg == NULL) {
        msg = "unknown error";
    }
    fprintf(stderr, "CUDA driver error: %s (%d) in '%s' at %s:%d\n",
            msg, (int)code, expr, file, line);
    exit(EXIT_FAILURE);
}

/* Single-statement wrapper that checks the CUresult of a driver API call. */
#define CU_CHECK(call) cuAssert((call), #call, __FILE__, __LINE__)

/*
 * Host-side test driver for a matrix transpose kernel.
 *
 * Fills a width x height float matrix with its own linear indices, loads the
 * kernel "_Z15transposeMatrixPKfPfii" (mangled form of
 * transposeMatrix(const float*, float*, int, int)) from
 * "test_04_transpose.fatbin" through the driver API, launches it over a
 * 2D grid of 16x16-thread blocks, copies the result back and checks that
 * output[j * height + i] == input[i * width + j] for every element.
 *
 * Prints "Test PASSED" or "Test FAILED". Returns EXIT_SUCCESS only when the
 * verification passes; any driver API failure or host allocation failure
 * terminates with EXIT_FAILURE.
 */
int main(void) {
    /* Not const: their addresses are handed to cuLaunchKernel as void*. */
    int width = 1024;
    int height = 1024;

    /* Widen before multiplying so the element count is computed in size_t. */
    const size_t count = (size_t)width * (size_t)height;
    const size_t size = count * sizeof(float);

    float *h_input = (float *)malloc(size);
    float *h_output = (float *)malloc(size);
    if (h_input == NULL || h_output == NULL) {
        fprintf(stderr, "Failed to allocate host matrices (%zu bytes each)\n", size);
        free(h_input);
        free(h_output);
        return EXIT_FAILURE;
    }

    // Initialize the input matrix: each element holds its own linear index
    for (int i = 0; i < height; ++i) {
        for (int j = 0; j < width; ++j) {
            h_input[i * width + j] = (float)(i * width + j);
        }
    }

    // Initialize CUDA (driver API): device 0, fresh context, kernel module
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction transposeMatrix;
    CU_CHECK(cuInit(0));
    CU_CHECK(cuDeviceGet(&cuDevice, 0));
    CU_CHECK(cuCtxCreate(&cuContext, 0, cuDevice));
    CU_CHECK(cuModuleLoad(&cuModule, "test_04_transpose.fatbin"));
    CU_CHECK(cuModuleGetFunction(&transposeMatrix, cuModule, "_Z15transposeMatrixPKfPfii"));

    // Allocate matrices in device memory
    CUdeviceptr d_input, d_output;
    CU_CHECK(cuMemAlloc(&d_input, size));
    CU_CHECK(cuMemAlloc(&d_output, size));

    // Copy matrix from host memory to device memory
    CU_CHECK(cuMemcpyHtoD(d_input, (void *)h_input, size));

    // Set up the execution configuration: ceil-divide so the grid covers
    // the whole matrix even when a dimension is not a multiple of 16
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((width + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (height + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Invoke kernel; argument order follows the mangled signature
    // (const float*, float*, int, int)
    void *args[] = { &d_input, &d_output, &width, &height };
    CU_CHECK(cuLaunchKernel(transposeMatrix,
                            blocksPerGrid.x, blocksPerGrid.y, 1,
                            threadsPerBlock.x, threadsPerBlock.y, 1,
                            0, NULL, args, NULL));
    // Wait for the kernel so execution errors are reported here, at the
    // launch site, instead of at a later unrelated call
    CU_CHECK(cuCtxSynchronize());

    // Copy result from device memory to host memory
    CU_CHECK(cuMemcpyDtoH((void *)h_output, d_output, size));

    // Verify result: stop at the first mismatching element
    bool success = true;
    for (int i = 0; i < height && success; ++i) {
        for (int j = 0; j < width; ++j) {
            if (h_output[j * height + i] != h_input[i * width + j]) {
                fprintf(stderr, "Result verification failed at element (%d, %d)!\n", i, j);
                success = false;
                break;
            }
        }
    }

    if (success) {
        printf("Test PASSED\n");
    } else {
        printf("Test FAILED\n");
    }

    // Free device and host memory
    CU_CHECK(cuMemFree(d_input));
    CU_CHECK(cuMemFree(d_output));
    free(h_input);
    free(h_output);

    // Cleanup CUDA
    CU_CHECK(cuCtxDestroy(cuContext));

    // Propagate the verification outcome to the caller / test runner
    return success ? EXIT_SUCCESS : EXIT_FAILURE;
}
0 commit comments