Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions SYCL/BFloat16/bfloat16_builtins.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
// REQUIRES: cuda
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80
// TODO: Currently the CI does not have a sm_80 capable machine. Enable the test
// execution once it does.
// RUNx: %t.out

#include <CL/sycl.hpp>

#include <cmath>
#include <vector>

using namespace cl::sycl;
using sycl::ext::oneapi::experimental::bfloat16;

constexpr int N = 16 * 3; // divisible by all vector sizes
constexpr float bf16_eps = 0.00390625;

union conv {
float f;
vec<uint16_t, 2> u;
uint32_t u2;
};

float from_bf16(uint16_t x) {
conv c;
c.u.y() = x;
c.u.x() = 0;
return c.f;
}

bool check(float a, float b) {
return fabs(2 * (a - b) / (a + b)) > bf16_eps * 2;
}

#define TEST_BUILTIN_1_SCAL_IMPL(NAME) \
{ \
buffer<float> a_buf(&a[0], N); \
buffer<int> err_buf(&err, 1); \
q.submit([&](handler &cgh) { \
auto A = a_buf.get_access<access::mode::read_write>(cgh); \
auto ERR = err_buf.get_access<access::mode::write>(cgh); \
cgh.parallel_for(N, [=](id<1> index) { \
if (check(from_bf16(NAME(bfloat16{A[index]}).raw()), \
NAME(A[index]))) { \
ERR[0] = 1; \
} \
}); \
}); \
} \
assert(err == 0);

#define TEST_BUILTIN_1(NAME) TEST_BUILTIN_1_SCAL_IMPL(NAME)

#define TEST_BUILTIN_2_SCAL_IMPL(NAME) \
{ \
buffer<float> a_buf(&a[0], N); \
buffer<float> b_buf(&b[0], N); \
buffer<int> err_buf(&err, 1); \
q.submit([&](handler &cgh) { \
auto A = a_buf.get_access<access::mode::read>(cgh); \
auto B = b_buf.get_access<access::mode::read>(cgh); \
auto ERR = err_buf.get_access<access::mode::write>(cgh); \
cgh.parallel_for(N, [=](id<1> index) { \
if (check( \
from_bf16(NAME(bfloat16{A[index]}, bfloat16{B[index]}).raw()), \
NAME(A[index], B[index]))) { \
ERR[0] = 1; \
} \
}); \
}); \
} \
assert(err == 0);

#define TEST_BUILTIN_2(NAME) TEST_BUILTIN_2_SCAL_IMPL(NAME)

#define TEST_BUILTIN_3_SCAL_IMPL(NAME) \
{ \
buffer<float> a_buf(&a[0], N); \
buffer<float> b_buf(&b[0], N); \
buffer<float> c_buf(&c[0], N); \
buffer<int> err_buf(&err, 1); \
q.submit([&](handler &cgh) { \
auto A = a_buf.get_access<access::mode::read>(cgh); \
auto B = b_buf.get_access<access::mode::read>(cgh); \
auto C = c_buf.get_access<access::mode::read>(cgh); \
auto ERR = err_buf.get_access<access::mode::write>(cgh); \
cgh.parallel_for(N, [=](id<1> index) { \
if (check(from_bf16(NAME(bfloat16{A[index]}, bfloat16{B[index]}, \
bfloat16{C[index]}) \
.raw()), \
NAME(A[index], B[index], C[index]))) { \
ERR[0] = 1; \
} \
}); \
}); \
} \
assert(err == 0);

#define TEST_BUILTIN_3(NAME) TEST_BUILTIN_3_SCAL_IMPL(NAME)

int main() {
queue q;
std::vector<float> a(N), b(N), c(N);
int err = 0;

for (int i = 0; i < N; i++) {
a[i] = (i - N / 2) / (float)N;
b[i] = (N / 2 - i) / (float)N;
c[i] = (float)(3 * i);
}

TEST_BUILTIN_1(fabs);
TEST_BUILTIN_2(fmin);
TEST_BUILTIN_2(fmax);
TEST_BUILTIN_3(fma);

return 0;
}
176 changes: 110 additions & 66 deletions SYCL/Matrix/joint_matrix_tensorcore.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
// REQUIRES: gpu, cuda
// REQUIRES: cuda

// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out
// TODO: Currently the CI does not have a sm_80 capable machine. Enable the test
// execution once it does.
// RUNx: %t.out
//
// Specifying the sm version via the --cuda-gpu-arch flag is necessary
// for the Nvidia case. DPC++ JIT compilation is not
Expand All @@ -11,6 +14,8 @@

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;
using sycl::ext::oneapi::experimental::bfloat16;
constexpr float bf16_eps = 0.00390625;

// Example usage of Nvidia matrix multiply.
// Optimizations such as memory paddings for avoiding bank conflicts are not
Expand Down Expand Up @@ -63,6 +68,10 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) {
if constexpr (std::is_same<T1, uint16_t>::value) {
for (int k = 0; k < Big_K; k++)
res += make_fp32(A[m * Big_K + k]) * make_fp32(B[k * Big_N + n]);
} else if constexpr (std::is_same<T1, bfloat16>::value) {
for (int k = 0; k < Big_K; k++)
res +=
make_fp32(A[m * Big_K + k].raw()) * make_fp32(B[k * Big_N + n].raw());
} else {
for (int k = 0; k < Big_K; k++)

Expand Down Expand Up @@ -105,7 +114,7 @@ void test() {
for (int i = 0; i < Big_K * Big_N; i++) {
B[i] = make_bf16(0.1f * (i % 10));
}
} else {
} else if constexpr (!std::is_same<T1, bfloat16>::value) {
for (int i = 0; i < Big_M * Big_K; i++) {
A[i] = i % 100;
}
Expand All @@ -114,72 +123,103 @@ void test() {
B[i] = i % 100;
}
}
{
buffer<T1, 1> bufA(A, range<1>(Big_M * Big_K));
buffer<T1, 1> bufB(B, range<1>(Big_K * Big_N));
buffer<T2, 1> bufC(C, range<1>(Big_M * Big_N));
buffer<T2, 1> bufD(D, range<1>(Big_M * Big_N));

queue q;
// currently bfloat16 has to be initialized on device
if constexpr (std::is_same<T1, bfloat16>::value) {
q.submit([&](handler &cgh) {
auto accA = bufA.template get_access<access::mode::write>(cgh);

cgh.parallel_for<KernelName<bfloat16, class copyA, M, K, N>>(
range<1>(Big_M * Big_K), [=](item<1> item) {
auto i = item.get_linear_id();
accA[i] = 0.1f * (i % 10);
});
});

q.submit([&](handler &cgh) {
auto accB = bufB.template get_access<access::mode::write>(cgh);

cgh.parallel_for<KernelName<bfloat16, class copyB, M, K, N>>(
range<1>(Big_K * Big_N), [=](item<1> item) {
auto i = item.get_linear_id();
accB[i] = 0.1f * (i % 10);
});
});
}

q.submit([&](handler &cgh) {
auto accC = bufC.template get_access<access::mode::read_write>(cgh);
auto accA = bufA.template get_access<access::mode::read_write>(cgh);
auto accB = bufB.template get_access<access::mode::read_write>(cgh);
auto accD = bufD.template get_access<access::mode::read_write>(cgh);

range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP};
range<2> GlobalRange = {Sub_Tiles_M,
Sub_Tiles_N * N_THREADS_PER_MATRIX_OP};

cgh.parallel_for<KernelName<T1, T2, M, K, N>>(
nd_range<2>(GlobalRange, LocalRange),
[=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
sycl::sub_group sg = item.get_sub_group();
const auto m =
item.get_group().get_group_id()[0]; // row id of current
// submatrix of BIG C matrix
const auto n =
item.get_group().get_group_id()[1]; // column id of current
// submatrix of BIG C matrix

joint_matrix<T1, matrix_use::a, M, K, matrix_layout::row_major>
sub_a;

joint_matrix<T1, matrix_use::b, K, N, matrix_layout::row_major>
sub_b;

joint_matrix<T2, matrix_use::accumulator, M, N,
matrix_layout::row_major>
sub_c;

joint_matrix_load(
sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, Big_N);

for (int k = 0; k < Sub_Tiles_K;
k++) // row/col id of current submatrix of BIG A/B matrices
{
joint_matrix_load(sg, sub_a,
accA.get_pointer() + (k * K) + (m * M * Big_K),
Big_K);

joint_matrix_load(sg, sub_b,
accB.get_pointer() + (k * K * Big_N) + (n * N),
Big_N);

sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
}
joint_matrix_store(
sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N);
});

q.wait();
}

buffer<T1, 1> bufA(A, range<1>(Big_M * Big_K));
buffer<T1, 1> bufB(B, range<1>(Big_K * Big_N));
buffer<T2, 1> bufC(C, range<1>(Big_M * Big_N));
buffer<T2, 1> bufD(D, range<1>(Big_M * Big_N));

queue q;
q.submit([&](handler &cgh) {
auto accC = bufC.template get_access<access::mode::read_write>(cgh);
auto accA = bufA.template get_access<access::mode::read_write>(cgh);
auto accB = bufB.template get_access<access::mode::read_write>(cgh);
auto accD = bufD.template get_access<access::mode::read_write>(cgh);

range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP};
range<2> GlobalRange = {Sub_Tiles_M, Sub_Tiles_N * N_THREADS_PER_MATRIX_OP};

cgh.parallel_for<KernelName<T1, T2, M, K, N>>(
nd_range<2>(GlobalRange, LocalRange), [=
](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
sycl::sub_group sg = item.get_sub_group();
const auto m =
item.get_group()
.get_id()[0]; // row id of current submatrix of BIG C matrix
const auto n =
item.get_group().get_id()[1]; // column id of current
// submatrix of BIG C matrix

joint_matrix<T1, matrix_use::a, M, K, matrix_layout::row_major> sub_a;

joint_matrix<T1, matrix_use::b, K, N, matrix_layout::row_major> sub_b;

joint_matrix<T2, matrix_use::accumulator, M, N,
matrix_layout::row_major>
sub_c;

joint_matrix_load(
sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, Big_N);

for (int k = 0; k < Sub_Tiles_K;
k++) // row/col id of current submatrix of BIG A/B matrices
{
joint_matrix_load(sg, sub_a,
accA.get_pointer() + (k * K) + (m * M * Big_K),
Big_K);

joint_matrix_load(sg, sub_b,
accB.get_pointer() + (k * K * Big_N) + (n * N),
Big_N);

sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
}
joint_matrix_store(
sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N);
});
});

q.wait();

const auto host_accessor = bufD.template get_access<access::mode::read>();
for (int m = 0; m < Big_M; m++)
for (int n = 0; n < Big_N; n++) {

assert((host_accessor[m * Big_N + n] ==
matrix_ref_mn<T1, T2, Big_N, Big_K>(m, n, A, B, C)));
if constexpr (std::is_same<T1, bfloat16>::value) {
auto res_device = matrix_ref_mn<T1, T2, Big_N, Big_K>(m, n, A, B, C);
assert(fabs(2 * (D[m * Big_N + n] - res_device)) /
(D[m * Big_N + n] + res_device) <
bf16_eps * 2);
} else {
assert((D[m * Big_N + n] ==
matrix_ref_mn<T1, T2, Big_N, Big_K>(m, n, A, B, C)));
}
}
};
};

int main() {

Expand All @@ -203,10 +243,14 @@ int main() {

test<double, double, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 4, 8>();

// A/B bf16
// A/B bf16 using storage type
test<uint16_t, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 16, 16>();
test<uint16_t, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16, 32>();
test<uint16_t, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32, 16, 8>();

test<bfloat16, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 16, 16>();
test<bfloat16, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16, 32>();
test<bfloat16, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32, 16, 8>();

return 0;
};