Skip to content

Commit bfaed2b

Browse files
committed
Organize testing directory
1 parent 64fa52e commit bfaed2b

File tree

13 files changed

+145
-1
lines changed

13 files changed

+145
-1
lines changed

llvm/test/Transforms/NVPTXMemOpts/already_coalesced/driver.cpp

Whitespace-only changes.
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#include <stdio.h>
2+
3+
// Naive matrix multiply: one thread computes one element of C = A * B,
// where rows are accessed through per-row pointers (float**).
// Assumes the launch grid exactly covers the w-wide result (no bounds guard)
// — TODO confirm against the host launch code.
__global__ void naiveMM(const float **A, const float **B, float **C, int w) {
  int col = blockDim.x * blockIdx.x + threadIdx.x;
  int row = blockDim.y * blockIdx.y + threadIdx.y;

  float acc = 0;
  for (int k = 0; k < w; ++k)
    acc += A[row][k] * B[k][col];

  C[row][col] = acc;
}
14+
15+
// Shared-memory (tiled) matrix multiply: C = A * B.
// Each 32x32 thread block stages one tile of A and one tile of B in shared
// memory per step, then accumulates the partial dot product from the tiles.
//
// Fix over the previous version: the old inner loop read As/Bs entries that
// no thread had written yet (only column i of the tiles was loaded per outer
// iteration, but all w columns were read) — i.e. uninitialized shared memory.
// Each thread now loads exactly one element of each tile before the barrier.
//
// Preconditions: launched with a 32x32 thread block; the grid exactly covers
// the w-wide result (the final store is unguarded) — TODO confirm with the
// host launch code.
__global__ void sharedMM(const float **A, const float **B, float **C, int w) {
  __shared__ float As[32][32];
  __shared__ float Bs[32][32];
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  int idy = blockDim.y * blockIdx.y + threadIdx.y;
  float sum = 0;
  // Walk the shared dimension in 32-wide tiles.
  for (int t = 0; t < w; t += 32) {
    // One element per thread per tile; zero-fill past the edge so the inner
    // loop can always run the full tile width.
    As[threadIdx.y][threadIdx.x] =
        (t + threadIdx.x < w) ? A[idy][t + threadIdx.x] : 0.0f;
    Bs[threadIdx.y][threadIdx.x] =
        (t + threadIdx.y < w) ? B[t + threadIdx.y][idx] : 0.0f;
    __syncthreads();  // tile fully staged before anyone reads it
    for (int j = 0; j < 32; j++) {
      sum += As[threadIdx.y][j] * Bs[j][threadIdx.x];
    }
    __syncthreads();  // done reading before the next tile overwrites
  }
  C[idy][idx] = sum;
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// CUDA kernel for constant (uniform) access: every thread reads the same
// fixed elements of A, so the loads cannot be coalesced into per-thread
// contiguous accesses — this is the pattern the pass under test must leave
// alone.
//
// Fixes over the previous version: it referenced an undeclared `B`, and
// double-indexed `A` even though A is a plain `const float *` — neither
// compiled. The loads below keep the constant-index pattern while using only
// the declared parameters.
__global__ void naive_add(const float *A, float **C, int w) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  int idy = blockDim.y * blockIdx.y + threadIdx.y;
  float sum = 0;
  for (int i = 0; i < w; i++) {
    for (int j = 0; j < w; j++) {
      sum += A[10] * A[15];  // constant indexes: same addresses for all threads
    }
  }
  C[idy][idx] = sum;
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#include <stdio.h>
2+
3+
// The reads in this kernel use constant indexes, so every thread touches the
// same two addresses and the loads therefore cannot be coalesced.

// CUDA kernel for constant access
__global__ void naiveMM(const float **A, const float **B, float **C, int w) {
  int col = blockDim.x * blockIdx.x + threadIdx.x;
  int row = blockDim.y * blockIdx.y + threadIdx.y;

  float total = 0;
  for (int outer = 0; outer < w; outer++)
    for (int inner = 0; inner < w; inner++)
      total += A[10][15] * B[23][54];  // fixed addresses, uniform across the warp

  C[row][col] = total;
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#include <cuda_runtime.h>
2+
3+
// Element-wise matrix addition: C = A + B over a numRows x numCols
// row-major layout. One thread per element; threads outside the matrix
// exit early.
__global__ void matrixAdd(const int *A, const int *B, int *C, int numRows, int numCols) {
  int y = blockIdx.y * blockDim.y + threadIdx.y;
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  if (y >= numRows || x >= numCols)
    return;
  int flat = y * numCols + x;
  C[flat] = A[flat] + B[flat];
}
12+
13+
14+
// Element-wise matrix addition staged through shared memory: C = A + B.
// Expects a thread block of at most 16x16 (the tile dimension) — TODO
// confirm against the host launch code; elements beyond numRows/numCols
// are skipped.
//
// Fix over the previous version: the shared-memory staging loads were
// unguarded, so threads past the matrix edge read out of bounds from A and
// B. The loads are now bounds-checked, and the barrier stays outside the
// branch so every thread in the block reaches it.
__global__ void matrixAdd_coalesced(const int *A, const int *B, int *C, int numRows, int numCols) {
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  __shared__ int A_shared[16][16];
  __shared__ int B_shared[16][16];
  bool inBounds = (row < numRows && col < numCols);
  if (inBounds) {
    A_shared[threadIdx.y][threadIdx.x] = A[row * numCols + col];
    B_shared[threadIdx.y][threadIdx.x] = B[row * numCols + col];
  }
  __syncthreads();  // non-divergent: reached by all threads in the block
  if (inBounds) {
    int idx = row * numCols + col;
    C[idx] = A_shared[threadIdx.y][threadIdx.x] + B_shared[threadIdx.y][threadIdx.x];
  }
}
28+
29+

llvm/test/Transforms/NVPTXMemOpts/test_04_transpose.cu renamed to llvm/test/Transforms/NVPTXMemOpts/matrix_transpose/test_04_transpose.cu

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,22 @@ __global__ void transposeMatrix(const float *input, float *output, int width, in
1212
}
1313
}
1414

15+
// Matrix transpose staged through a 16x16 shared-memory tile.
// Expects a thread block of at most 16x16 — TODO confirm against the host
// launch code; elements past width/height are skipped.
//
// Fixes over the previous version: the tile load was unguarded (out-of-bounds
// read from `input` for edge blocks), and the staged value was never used —
// the store re-read global memory, leaving the shared tile dead. The output
// now comes from the shared tile.
__global__ void transposeMatrix_coalesced(const float *input, float *output, int width, int height) {
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;

  __shared__ float input_shared[16][16];
  bool inBounds = (x < width && y < height);
  if (inBounds)
    input_shared[threadIdx.y][threadIdx.x] = input[y * width + x];
  __syncthreads();  // non-divergent: every thread reaches the barrier

  if (inBounds) {
    int transPos = x * height + y;  // transposed flat index
    output[transPos] = input_shared[threadIdx.y][threadIdx.x];
  }
}
30+
1531
int main(void) {
1632
int width = 1024;
1733
int height = 1024;

llvm/test/Transforms/NVPTXMemOpts/test.cu renamed to llvm/test/Transforms/NVPTXMemOpts/vector_add/test.cu

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,19 @@ __global__ void vectorAdd(const float *A, const float *B, float *C, int numEleme
88
}
99
}
1010

11+
// Vector addition staged through shared memory: C = A + B.
// Expects blockDim.x <= 16 (the shared-array size) — TODO confirm against
// the host launch code; threads at or past numElements are skipped.
//
// Fix over the previous version: the staging loads were unguarded, so the
// tail block read A and B out of bounds. The loads are now bounds-checked,
// and the barrier stays outside the branch so every thread reaches it.
__global__ void vectorAdd_coalesced(const float *A, const float *B, float *C, int numElements) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  __shared__ float A_shared[16];
  __shared__ float B_shared[16];
  if (i < numElements) {
    A_shared[threadIdx.x] = A[i];
    B_shared[threadIdx.x] = B[i];
  }
  __syncthreads();  // non-divergent: every thread in the block reaches it
  if (i < numElements) {
    C[i] = A_shared[threadIdx.x] + B_shared[threadIdx.x];
  }
}
23+
1124
int main(void) {
1225
int numElements = 50000;
1326
size_t size = numElements * sizeof(float);

0 commit comments

Comments
 (0)