diff --git a/CMakeLists.txt b/CMakeLists.txt index dd0c793..6b71725 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,8 +66,20 @@ if(ENABLE_GPU_DEVICE) # https://en.wikipedia.org/w/index.php?title=CUDA§ion=5#GPUs_supported # https://raw.githubusercontent.com/PointCloudLibrary/pcl/master/cmake/pcl_find_cuda.cmake - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.0") - set(CMAKE_CUDA_ARCHITECTURES 35 37 50 52 53 60 61 62 70 72 75 80 86) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.1") + execute_process(COMMAND ${CMAKE_CUDA_COMPILER} --list-gpu-code RESULT_VARIABLE EXIT_CODE OUTPUT_VARIABLE OUTPUT_VAL) + if(EXIT_CODE EQUAL 0) + #Remove sm_ + string(REPLACE "sm_" "" OUTPUT_VAL ${OUTPUT_VAL}) + #Convert to list + string(REPLACE "\n" ";" CMAKE_CUDA_ARCHITECTURES ${OUTPUT_VAL}) + #Remove last empty entry + list(REMOVE_AT CMAKE_CUDA_ARCHITECTURES -1) + else() + message(FATAL_ERROR "Failed to run NVCC to get list of GPU codes: ${EXIT_CODE}") + endif() + elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.0") + set(CMAKE_CUDA_ARCHITECTURES 35 37 50 52 53 60 61 62 70 72 75 80) elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "10.0") set(CMAKE_CUDA_ARCHITECTURES 30 32 35 37 50 52 53 60 61 62 70 72 75) elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "9.0") @@ -107,4 +119,3 @@ else() message(STATUS "GPU mode: ${BoldRed}OFF${ColourReset}") endif() message(STATUS "----------------------") - diff --git a/src/gpu/arch.cu b/src/gpu/arch.cu index ad632c5..62f6802 100644 --- a/src/gpu/arch.cu +++ b/src/gpu/arch.cu @@ -16,6 +16,7 @@ enum { ARCH_VOLTA, ARCH_TURING, ARCH_AMPERE, + ARCH_ADA, ARCH_UNKNOWN }; @@ -27,6 +28,7 @@ static const char *uarch_str[] = { /*[ARCH_VOLTA] = */ "Volta", /*[ARCH_TURING] = */ "Turing", /*[ARCH_AMPERE] = */ "Ampere", + /*[ARCH_ADA] = */ "Ada", }; struct benchmark_gpu { @@ -143,8 +145,12 @@ struct gpu* get_gpu_info(int gpu_idx) { break; case 80: case 86: + case 87: gpu->uarch = ARCH_AMPERE; break; + case 89: + gpu->uarch = ARCH_ADA; + break; default: printf("GPU: %s\n", gpu->name); printErr("Invalid uarch: %d.%d\n", deviceProp.major, deviceProp.minor); @@ -162,6 +168,7 @@ struct gpu* get_gpu_info(int gpu_idx) { break; case ARCH_TURING: case ARCH_AMPERE: // UNTESTED + case ARCH_ADA: // UNTESTED gpu->latency = 4; break; default: @@ -185,7 +192,7 @@ struct benchmark_gpu* init_benchmark_gpu(struct gpu* gpu, int nbk, int tpb) { bench->nbk = (nbk == INVALID_CFG) ? (gpu->latency * gpu->sm_count) : nbk; bench->tpb = (tpb == INVALID_CFG) ? _ConvertSMVer2Cores(gpu->cc_major, gpu->cc_minor) : tpb; } - bench->n = bench->nbk * bench->tpb; + bench->n = 16 * bench->nbk * bench->tpb; bench->gflops = (double)(BENCHMARK_GPU_ITERS * 2 * (long)bench->n)/(long)1000000000; cudaError_t err = cudaSuccess; @@ -193,13 +200,15 @@ struct benchmark_gpu* init_benchmark_gpu(struct gpu* gpu, int nbk, int tpb) { float *h_B; int size = bench->n * sizeof(float); + cudaSetDevice(0); + if ((err = cudaMallocHost((void **)&h_A, size)) != cudaSuccess) { - printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err)); + printErr("XXX %s: %s", cudaGetErrorName(err), cudaGetErrorString(err)); return NULL; } if ((err = cudaMallocHost((void **)&h_B, size)) != cudaSuccess) { - printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err)); + printErr("XXX %s: %s", cudaGetErrorName(err), cudaGetErrorString(err)); return NULL; } @@ -208,6 +217,7 @@ struct benchmark_gpu* init_benchmark_gpu(struct gpu* gpu, int nbk, int tpb) { h_B[i] = rand()/(float)RAND_MAX; } + if ((err = cudaMalloc((void **) &(bench->d_A), size)) != cudaSuccess) { printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err)); return NULL; diff --git a/src/gpu/arch.hpp b/src/gpu/arch.hpp index 69d19fa..fe42c71 100644 --- a/src/gpu/arch.hpp +++ b/src/gpu/arch.hpp @@ -3,7 +3,7 @@ #include "../getarg.hpp" -#define BENCHMARK_GPU_ITERS 400000000 +#define BENCHMARK_GPU_ITERS 40000000 struct benchmark_gpu; diff --git a/src/gpu/kernel.cu b/src/gpu/kernel.cu index b435451..41b62da 100644 --- a/src/gpu/kernel.cu +++ b/src/gpu/kernel.cu @@ -1,15 +1,38 @@ #include "kernel.hpp" +#include +#include +#define N 16 +#define gid threadIdx.x + blockIdx.x * blockDim.x +#define off gid*N + __global__ void compute_kernel(float *vec_a, float *vec_b, float *vec_c, int n) { - float a = vec_a[0]; - float b = vec_b[0]; - float c = 0.0; + __shared__ float myblockA[N]; + __shared__ float myblockB[N]; + __shared__ float myblockC[N]; + + #pragma unroll + for(int i = 0; i < N; i++){ + myblockA[i] = vec_a[off+i]; + myblockB[i] = vec_b[off+i]; + myblockC[i] = vec_a[off+i]; + } + + __syncthreads(); - #pragma unroll 2000 + #pragma unroll 32 for(long i=0; i < BENCHMARK_GPU_ITERS; i++) { - c = (c * a) + b; + #pragma unroll + for(int j = 0; j < N; j++){ + myblockC[j] = (myblockC[j] * myblockA[j]) + myblockB[j]; + } + } + + #pragma unroll + for(int i = 0; i < N; i++){ + vec_c[off+i] = myblockC[i]; } - vec_c[0] = c; } +