Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,20 @@ if(ENABLE_GPU_DEVICE)

# https://en.wikipedia.org/w/index.php?title=CUDA&section=5#GPUs_supported
# https://raw.githubusercontent.com/PointCloudLibrary/pcl/master/cmake/pcl_find_cuda.cmake
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.0")
set(CMAKE_CUDA_ARCHITECTURES 35 37 50 52 53 60 61 62 70 72 75 80 86)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.1")
execute_process(COMMAND ${CMAKE_CUDA_COMPILER} --list-gpu-code RESULT_VARIABLE EXIT_CODE OUTPUT_VARIABLE OUTPUT_VAL)
if(EXIT_CODE EQUAL 0)
#Remove sm_
string(REPLACE "sm_" "" OUTPUT_VAL ${OUTPUT_VAL})
#Convert to list
string(REPLACE "\n" ";" CMAKE_CUDA_ARCHITECTURES ${OUTPUT_VAL})
#Remove last empty entry
list(REMOVE_AT CMAKE_CUDA_ARCHITECTURES -1)
else()
message(FATAL_ERROR "Failed to run NVCC to get list of GPU codes: ${EXIT_CODE}")
endif()
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.0")
set(CMAKE_CUDA_ARCHITECTURES 35 37 50 52 53 60 61 62 70 72 75 80)
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "10.0")
set(CMAKE_CUDA_ARCHITECTURES 30 32 35 37 50 52 53 60 61 62 70 72 75)
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "9.0")
Expand Down Expand Up @@ -107,4 +119,3 @@ else()
message(STATUS "GPU mode: ${BoldRed}OFF${ColourReset}")
endif()
message(STATUS "----------------------")

16 changes: 13 additions & 3 deletions src/gpu/arch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ enum {
ARCH_VOLTA,
ARCH_TURING,
ARCH_AMPERE,
ARCH_ADA,
ARCH_UNKNOWN
};

Expand All @@ -27,6 +28,7 @@ static const char *uarch_str[] = {
/*[ARCH_VOLTA] = */ "Volta",
/*[ARCH_TURING] = */ "Turing",
/*[ARCH_AMPERE] = */ "Ampere",
/*[ARCH_ADA] = */ "Ada",
};

struct benchmark_gpu {
Expand Down Expand Up @@ -143,8 +145,12 @@ struct gpu* get_gpu_info(int gpu_idx) {
break;
case 80:
case 86:
case 87:
gpu->uarch = ARCH_AMPERE;
break;
case 89:
gpu->uarch = ARCH_ADA;
break;
default:
printf("GPU: %s\n", gpu->name);
printErr("Invalid uarch: %d.%d\n", deviceProp.major, deviceProp.minor);
Expand All @@ -162,6 +168,7 @@ struct gpu* get_gpu_info(int gpu_idx) {
break;
case ARCH_TURING:
case ARCH_AMPERE: // UNTESTED
case ARCH_ADA: // UNTESTED
gpu->latency = 4;
break;
default:
Expand All @@ -185,21 +192,23 @@ struct benchmark_gpu* init_benchmark_gpu(struct gpu* gpu, int nbk, int tpb) {
bench->nbk = (nbk == INVALID_CFG) ? (gpu->latency * gpu->sm_count) : nbk;
bench->tpb = (tpb == INVALID_CFG) ? _ConvertSMVer2Cores(gpu->cc_major, gpu->cc_minor) : tpb;
}
bench->n = bench->nbk * bench->tpb;
bench->n = 16 * bench->nbk * bench->tpb;
bench->gflops = (double)(BENCHMARK_GPU_ITERS * 2 * (long)bench->n)/(long)1000000000;

cudaError_t err = cudaSuccess;
float *h_A;
float *h_B;
int size = bench->n * sizeof(float);

cudaSetDevice(0);

if ((err = cudaMallocHost((void **)&h_A, size)) != cudaSuccess) {
printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
printErr("XXX %s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
return NULL;
}

if ((err = cudaMallocHost((void **)&h_B, size)) != cudaSuccess) {
printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
printErr("XXX %s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
return NULL;
}

Expand All @@ -208,6 +217,7 @@ struct benchmark_gpu* init_benchmark_gpu(struct gpu* gpu, int nbk, int tpb) {
h_B[i] = rand()/(float)RAND_MAX;
}


if ((err = cudaMalloc((void **) &(bench->d_A), size)) != cudaSuccess) {
printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
return NULL;
Expand Down
2 changes: 1 addition & 1 deletion src/gpu/arch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

#include "../getarg.hpp"

#define BENCHMARK_GPU_ITERS 400000000
#define BENCHMARK_GPU_ITERS 40000000

struct benchmark_gpu;

Expand Down
35 changes: 29 additions & 6 deletions src/gpu/kernel.cu
Original file line number Diff line number Diff line change
@@ -1,15 +1,38 @@
#include "kernel.hpp"
#include <stdio.h>
#include <stdint.h>
#define N 16
#define gid threadIdx.x + blockIdx.x * blockDim.x
#define off gid*N


__global__
void compute_kernel(float *vec_a, float *vec_b, float *vec_c, int n) {
float a = vec_a[0];
float b = vec_b[0];
float c = 0.0;
__shared__ float myblockA[N];
__shared__ float myblockB[N];
__shared__ float myblockC[N];

#pragma unroll
for(int i = 0; i < N; i++){
myblockA[i] = vec_a[off+i];
myblockB[i] = vec_b[off+i];
myblockC[i] = vec_a[off+i];
}

__syncthreads();

#pragma unroll 2000
#pragma unroll 32
for(long i=0; i < BENCHMARK_GPU_ITERS; i++) {
c = (c * a) + b;
#pragma unroll
for(int j = 0; j < N; j++){
myblockC[j] = (myblockC[j] * myblockA[j]) + myblockB[j];
}
}

#pragma unroll
for(int i = 0; i < N; i++){
vec_c[off+i] = myblockC[i];
}

vec_c[0] = c;
}