Skip to content

Commit 64fa52e

Browse files
authored
Merge pull request #2 from chamikasudusinghe/patch-1
sample_test_02 (initial pull rq)
2 parents bb44fed + f26de98 commit 64fa52e

File tree

6 files changed

+445
-0
lines changed

6 files changed

+445
-0
lines changed
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h> // for fabs
#include <cuda.h>
#include <cuda_runtime.h>

// Driver-API error check. Every call below returns CUresult, so the assert
// helper must take CUresult (the original took the runtime's cudaError_t,
// which cannot receive a CUresult, and consequently no call was checked).
#define CUDA_CHECK(result) { gpuAssert((result), __FILE__, __LINE__); }
inline void gpuAssert(CUresult code, const char *file, int line, bool abort=true) {
    if (code != CUDA_SUCCESS) {
        const char *errorString = "unknown CUDA driver error";
        cuGetErrorName(code, &errorString);
        fprintf(stderr,"GPUassert: %s %s %d\n", errorString, file, line);
        if (abort) exit(code);
    }
}

// Loads test_02_scalar.fatbin, multiplies a 50000-element vector by a scalar
// on the GPU via the driver API, and verifies the result on the host.
int main(void) {
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    float scalar = 3.0f;
    float *h_input = (float *)malloc(size);
    float *h_output = (float *)malloc(size);
    if (h_input == NULL || h_output == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the input data
    for (int i = 0; i < numElements; ++i) {
        h_input[i] = i;
    }

    // Initialize CUDA (driver API). Every call is checked so a missing
    // fatbin or a wrong mangled kernel name fails loudly here instead of
    // crashing later at launch with a garbage CUfunction.
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction vectorScalarMultiply;
    CUDA_CHECK(cuInit(0));
    CUDA_CHECK(cuDeviceGet(&cuDevice, 0));
    CUDA_CHECK(cuCtxCreate(&cuContext, 0, cuDevice));
    CUDA_CHECK(cuModuleLoad(&cuModule, "test_02_scalar.fatbin"));
    CUDA_CHECK(cuModuleGetFunction(&vectorScalarMultiply, cuModule, "_Z20vectorScalarMultiplyPKfPffi"));

    // Allocate vectors in device memory
    CUdeviceptr d_input, d_output;
    CUDA_CHECK(cuMemAlloc(&d_input, size));
    CUDA_CHECK(cuMemAlloc(&d_output, size));

    // Copy vector from host memory to device memory
    CUDA_CHECK(cuMemcpyHtoD(d_input, (void *)h_input, size));

    // Prepare kernel launch (1-D grid; ceil-div so the tail is covered)
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    void *args[] = { &d_input, &d_output, &scalar, &numElements };

    // Invoke kernel
    CUDA_CHECK(cuLaunchKernel(vectorScalarMultiply, blocksPerGrid, 1, 1,
                              threadsPerBlock, 1, 1, 0, 0, args, 0));

    // Copy result from device memory to host memory (synchronous, so it
    // also surfaces any asynchronous kernel-execution error)
    CUDA_CHECK(cuMemcpyDtoH((void *)h_output, d_output, size));

    // Verify result
    for (int i = 0; i < numElements; ++i) {
        if (fabs(h_input[i] * scalar - h_output[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    // Free device memory
    CUDA_CHECK(cuMemFree(d_input));
    CUDA_CHECK(cuMemFree(d_output));

    // Free host memory
    free(h_input);
    free(h_output);

    // Cleanup CUDA: unload the module before destroying its context
    // (the original leaked the module handle)
    CUDA_CHECK(cuModuleUnload(cuModule));
    CUDA_CHECK(cuCtxDestroy(cuContext));

    return 0;
}
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h> // for fabs
#include <cuda.h>
#include <cuda_runtime.h>

// Driver-API error check: prints the CUresult name and exits by default.
#define CUDA_CHECK(result) { gpuAssert((result), __FILE__, __LINE__); }
inline void gpuAssert(CUresult code, const char *file, int line, bool abort=true) {
    if (code != CUDA_SUCCESS) {
        const char *errorString = "unknown CUDA driver error";
        cuGetErrorName(code, &errorString);
        fprintf(stderr,"GPUassert: %s %s %d\n", errorString, file, line);
        if (abort) exit(code);
    }
}

// Loads test_03_multiply.fatbin, multiplies two 50000-element vectors
// element-wise on the GPU via the driver API, and verifies on the host.
int main(void) {
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    float *h_input1 = (float *)malloc(size);
    float *h_input2 = (float *)malloc(size);
    float *h_output = (float *)malloc(size);
    if (h_input1 == NULL || h_input2 == NULL || h_output == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        exit(EXIT_FAILURE);
    }

    // BUG FIX: the inputs were previously copied to the device without ever
    // being initialized (reading uninitialized malloc memory is undefined
    // behavior). Use the same values as the runtime-API version of this test.
    for (int i = 0; i < numElements; ++i) {
        h_input1[i] = i;
        h_input2[i] = 2.0f * i;
    }

    // Initialize CUDA (driver API); cuInit is now checked like the rest
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction vectorMultiply;
    CUDA_CHECK(cuInit(0));
    CUDA_CHECK(cuDeviceGet(&cuDevice, 0));
    CUDA_CHECK(cuCtxCreate(&cuContext, 0, cuDevice));
    CUDA_CHECK(cuModuleLoad(&cuModule, "test_03_multiply.fatbin"));
    CUDA_CHECK(cuModuleGetFunction(&vectorMultiply, cuModule, "_Z14vectorMultiplyPKfS0_Pfi"));

    // Allocate vectors in device memory
    CUdeviceptr d_input1, d_input2, d_output;
    CUDA_CHECK(cuMemAlloc(&d_input1, size));
    CUDA_CHECK(cuMemAlloc(&d_input2, size));
    CUDA_CHECK(cuMemAlloc(&d_output, size));

    // Copy vectors from host memory to device memory
    CUDA_CHECK(cuMemcpyHtoD(d_input1, (void *)h_input1, size));
    CUDA_CHECK(cuMemcpyHtoD(d_input2, (void *)h_input2, size));

    // Invoke kernel (1-D grid; ceil-div so the tail is covered)
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    void *args[] = { &d_input1, &d_input2, &d_output, &numElements };
    CUDA_CHECK(cuLaunchKernel(vectorMultiply, blocksPerGrid, 1, 1,
                              threadsPerBlock, 1, 1, 0, 0, args, 0));

    // Copy result from device memory to host memory (synchronous, so it
    // also surfaces any asynchronous kernel-execution error)
    CUDA_CHECK(cuMemcpyDtoH((void *)h_output, d_output, size));

    // Verify result: host and device both compute the same single-precision
    // product, so the difference should be exactly zero
    for (int i = 0; i < numElements; ++i) {
        if (fabs(h_input1[i] * h_input2[i] - h_output[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    // Free device memory
    CUDA_CHECK(cuMemFree(d_input1));
    CUDA_CHECK(cuMemFree(d_input2));
    CUDA_CHECK(cuMemFree(d_output));

    // Free host memory
    free(h_input1);
    free(h_input2);
    free(h_output);

    // Cleanup CUDA: unload the module before destroying its context
    CUDA_CHECK(cuModuleUnload(cuModule));
    CUDA_CHECK(cuCtxDestroy(cuContext));

    return 0;
}
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Driver-API error check. Every call below returns CUresult, so the assert
// helper must take CUresult (the original took the runtime's cudaError_t,
// which cannot receive a CUresult, and consequently no call was checked).
#define CUDA_CHECK(result) { gpuAssert((result), __FILE__, __LINE__); }
inline void gpuAssert(CUresult code, const char *file, int line, bool abort=true) {
    if (code != CUDA_SUCCESS) {
        const char *errorString = "unknown CUDA driver error";
        cuGetErrorName(code, &errorString);
        fprintf(stderr,"GPUassert: %s %s %d\n", errorString, file, line);
        if (abort) exit(code);
    }
}

// Loads test_04_transpose.fatbin, transposes a 1024x1024 matrix on the GPU
// via the driver API, and verifies output[j][i] == input[i][j] on the host.
int main(void) {
    int width = 1024;
    int height = 1024;
    size_t size = width * height * sizeof(float);
    float *h_input = (float *)malloc(size);
    float *h_output = (float *)malloc(size);
    if (h_input == NULL || h_output == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the input matrix (row-major; values stay below 2^24 so
    // they are exactly representable as float and comparable with ==)
    for (int i = 0; i < height; ++i) {
        for (int j = 0; j < width; ++j) {
            h_input[i * width + j] = i * width + j;
        }
    }

    // Initialize CUDA (driver API). Every call is checked so a missing
    // fatbin or a wrong mangled kernel name fails loudly here instead of
    // crashing later at launch with a garbage CUfunction.
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction transposeMatrix;
    CUDA_CHECK(cuInit(0));
    CUDA_CHECK(cuDeviceGet(&cuDevice, 0));
    CUDA_CHECK(cuCtxCreate(&cuContext, 0, cuDevice));
    CUDA_CHECK(cuModuleLoad(&cuModule, "test_04_transpose.fatbin"));
    CUDA_CHECK(cuModuleGetFunction(&transposeMatrix, cuModule, "_Z15transposeMatrixPKfPfii"));

    // Allocate matrices in device memory
    CUdeviceptr d_input, d_output;
    CUDA_CHECK(cuMemAlloc(&d_input, size));
    CUDA_CHECK(cuMemAlloc(&d_output, size));

    // Copy matrix from host memory to device memory
    CUDA_CHECK(cuMemcpyHtoD(d_input, (void *)h_input, size));

    // Set up the execution configuration: 16x16 tiles over the 2-D matrix,
    // ceil-div on each axis so the edges are covered
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((width + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (height + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Invoke kernel
    void *args[] = { &d_input, &d_output, &width, &height };
    CUDA_CHECK(cuLaunchKernel(transposeMatrix, blocksPerGrid.x, blocksPerGrid.y, 1,
                              threadsPerBlock.x, threadsPerBlock.y, 1, 0, 0, args, 0));

    // Copy result from device memory to host memory (synchronous, so it
    // also surfaces any asynchronous kernel-execution error)
    CUDA_CHECK(cuMemcpyDtoH((void *)h_output, d_output, size));

    // Verify result: output must be the transpose of the input
    bool success = true;
    for (int i = 0; i < height && success; ++i) {
        for (int j = 0; j < width; ++j) {
            if (h_output[j * height + i] != h_input[i * width + j]) {
                fprintf(stderr, "Result verification failed at element (%d, %d)!\n", i, j);
                success = false;
                break;
            }
        }
    }

    if (success) {
        printf("Test PASSED\n");
    } else {
        printf("Test FAILED\n");
    }

    // Free device and host memory
    CUDA_CHECK(cuMemFree(d_input));
    CUDA_CHECK(cuMemFree(d_output));
    free(h_input);
    free(h_output);

    // Cleanup CUDA: unload the module before destroying its context
    // (the original leaked the module handle)
    CUDA_CHECK(cuModuleUnload(cuModule));
    CUDA_CHECK(cuCtxDestroy(cuContext));

    return 0;
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
#include <stdio.h>
#include <stdlib.h>  // malloc/free, exit, EXIT_FAILURE (was missing)
#include <math.h>    // fabs (was missing)

// Runtime-API error check (mirrors the helper used by the driver-API tests)
#define CUDA_CHECK(result) { gpuAssert((result), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// CUDA kernel for vector scalar multiplication.
// Expects a 1-D launch; the bounds check guards the grid tail when
// numElements is not a multiple of the block size.
__global__ void vectorScalarMultiply(const float *input, float *output, float scalar, int numElements) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) {
        output[i] = input[i] * scalar;
    }
}

// Multiplies a 50000-element vector by 3.0f on the GPU and verifies the
// result on the host.
int main(void) {
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    float *h_input = (float *)malloc(size);
    float *h_output = (float *)malloc(size);
    if (h_input == NULL || h_output == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the input data
    for (int i = 0; i < numElements; ++i) {
        h_input[i] = i;
    }

    // Allocate vectors in device memory
    float *d_input;
    CUDA_CHECK(cudaMalloc(&d_input, size));
    float *d_output;
    CUDA_CHECK(cudaMalloc(&d_output, size));

    // Copy vector from host memory to device memory
    CUDA_CHECK(cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice));

    // Invoke kernel (ceil-div so the tail is covered)
    float scalar = 3.0f;
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorScalarMultiply<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, scalar, numElements);
    CUDA_CHECK(cudaGetLastError()); // catch launch-configuration errors

    // Copy result from device memory to host memory (blocking, so it also
    // surfaces any asynchronous kernel-execution error)
    CUDA_CHECK(cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost));

    // Verify result
    for (int i = 0; i < numElements; ++i) {
        if (fabs(h_input[i] * scalar - h_output[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    // Free device and host memory
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    free(h_input);
    free(h_output);

    return 0;
}
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Runtime-API error check (mirrors the helper used by the driver-API tests)
#define CUDA_CHECK(result) { gpuAssert((result), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// CUDA kernel for element-wise vector multiplication.
// Expects a 1-D launch; the bounds check guards the grid tail when
// numElements is not a multiple of the block size.
__global__ void vectorMultiply(const float *input1, const float *input2, float *output, int numElements) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) {
        output[i] = input1[i] * input2[i];
    }
}

// Multiplies two 50000-element vectors element-wise on the GPU and verifies
// the result on the host.
int main(void) {
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    float *h_input1 = (float *)malloc(size);
    float *h_input2 = (float *)malloc(size);
    float *h_output = (float *)malloc(size);
    if (h_input1 == NULL || h_input2 == NULL || h_output == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the input data
    for (int i = 0; i < numElements; ++i) {
        h_input1[i] = i;
        h_input2[i] = 2 * i; // Different values for the second vector
    }

    // Allocate vectors in device memory
    float *d_input1, *d_input2, *d_output;
    CUDA_CHECK(cudaMalloc(&d_input1, size));
    CUDA_CHECK(cudaMalloc(&d_input2, size));
    CUDA_CHECK(cudaMalloc(&d_output, size));

    // Copy vectors from host memory to device memory
    CUDA_CHECK(cudaMemcpy(d_input1, h_input1, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_input2, h_input2, size, cudaMemcpyHostToDevice));

    // Invoke kernel (ceil-div so the tail is covered)
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorMultiply<<<blocksPerGrid, threadsPerBlock>>>(d_input1, d_input2, d_output, numElements);
    CUDA_CHECK(cudaGetLastError()); // catch launch-configuration errors

    // Copy result from device memory to host memory (blocking, so it also
    // surfaces any asynchronous kernel-execution error)
    CUDA_CHECK(cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost));

    // Verify result: host and device both compute the same single-precision
    // product, so the difference should be exactly zero
    for (int i = 0; i < numElements; ++i) {
        if (fabs(h_input1[i] * h_input2[i] - h_output[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    // Free device and host memory
    CUDA_CHECK(cudaFree(d_input1));
    CUDA_CHECK(cudaFree(d_input2));
    CUDA_CHECK(cudaFree(d_output));
    free(h_input1);
    free(h_input2);
    free(h_output);

    return 0;
}

0 commit comments

Comments
 (0)