intel · steffenlarsen · Jun 30, 2022 · Apr 4, 2022 · Apr 5, 2022 · Apr 5, 2022
@@ -0,0 +1,118 @@
+// REQUIRES: cuda
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80
+// TODO: Currently the CI does not have a sm_80 capable machine. Enable the test
+// execution once it does.
+// RUNx: %t.out
+
+#include <CL/sycl.hpp>
+
+#include <cmath>
+#include <vector>
+
+using namespace cl::sycl;
+using sycl::ext::oneapi::experimental::bfloat16;
+
+constexpr int N = 16 * 3; // divisible by all vector sizes
+constexpr float bf16_eps = 0.00390625;
+
+union conv {
+  float f;
+  vec<uint16_t, 2> u;
+  uint32_t u2;
+};
+
+float from_bf16(uint16_t x) {
+  conv c;
+  c.u.y() = x;
+  c.u.x() = 0;
+  return c.f;
+}
+
+bool check(float a, float b) {
+  return fabs(2 * (a - b) / (a + b)) > bf16_eps * 2;
+}
+
+#define TEST_BUILTIN_1_SCAL_IMPL(NAME)                                         \
+  {                                                                            \
+    buffer<float> a_buf(&a[0], N);                                             \
+    buffer<int> err_buf(&err, 1);                                              \
+    q.submit([&](handler &cgh) {                                               \
+      auto A = a_buf.get_access<access::mode::read_write>(cgh);                \
+      auto ERR = err_buf.get_access<access::mode::write>(cgh);                 \
+      cgh.parallel_for(N, [=](id<1> index) {                                   \
+        if (check(from_bf16(NAME(bfloat16{A[index]}).raw()),                   \
+                  NAME(A[index]))) {                                           \
+          ERR[0] = 1;                                                          \
+        }                                                                      \
+      });                                                                      \
+    });                                                                        \
+  }                                                                            \
+  assert(err == 0);
+
+#define TEST_BUILTIN_1(NAME) TEST_BUILTIN_1_SCAL_IMPL(NAME)
+
+#define TEST_BUILTIN_2_SCAL_IMPL(NAME)                                         \
+  {                                                                            \
+    buffer<float> a_buf(&a[0], N);                                             \
+    buffer<float> b_buf(&b[0], N);                                             \
+    buffer<int> err_buf(&err, 1);                                              \
+    q.submit([&](handler &cgh) {                                               \
+      auto A = a_buf.get_access<access::mode::read>(cgh);                      \
+      auto B = b_buf.get_access<access::mode::read>(cgh);                      \
+      auto ERR = err_buf.get_access<access::mode::write>(cgh);                 \
+      cgh.parallel_for(N, [=](id<1> index) {                                   \
+        if (check(                                                             \
+                from_bf16(NAME(bfloat16{A[index]}, bfloat16{B[index]}).raw()), \
+                NAME(A[index], B[index]))) {                                   \
+          ERR[0] = 1;                                                          \
+        }                                                                      \
+      });                                                                      \
+    });                                                                        \
+  }                                                                            \
+  assert(err == 0);
+
+#define TEST_BUILTIN_2(NAME) TEST_BUILTIN_2_SCAL_IMPL(NAME)
+
+#define TEST_BUILTIN_3_SCAL_IMPL(NAME)                                         \
+  {                                                                            \
+    buffer<float> a_buf(&a[0], N);                                             \
+    buffer<float> b_buf(&b[0], N);                                             \
+    buffer<float> c_buf(&c[0], N);                                             \
+    buffer<int> err_buf(&err, 1);                                              \
+    q.submit([&](handler &cgh) {                                               \
+      auto A = a_buf.get_access<access::mode::read>(cgh);                      \
+      auto B = b_buf.get_access<access::mode::read>(cgh);                      \
+      auto C = c_buf.get_access<access::mode::read>(cgh);                      \
+      auto ERR = err_buf.get_access<access::mode::write>(cgh);                 \
+      cgh.parallel_for(N, [=](id<1> index) {                                   \
+        if (check(from_bf16(NAME(bfloat16{A[index]}, bfloat16{B[index]},       \
+                                 bfloat16{C[index]})                           \
+                                .raw()),                                       \
+                  NAME(A[index], B[index], C[index]))) {                       \
+          ERR[0] = 1;                                                          \
+        }                                                                      \
+      });                                                                      \
+    });                                                                        \
+  }                                                                            \
+  assert(err == 0);
+
+#define TEST_BUILTIN_3(NAME) TEST_BUILTIN_3_SCAL_IMPL(NAME)
+
+int main() {
+  queue q;
+  std::vector<float> a(N), b(N), c(N);
+  int err = 0;
+
+  for (int i = 0; i < N; i++) {
+    a[i] = (i - N / 2) / (float)N;
+    b[i] = (N / 2 - i) / (float)N;
+    c[i] = (float)(3 * i);
+  }
+
+  TEST_BUILTIN_1(fabs);
+  TEST_BUILTIN_2(fmin);
+  TEST_BUILTIN_2(fmax);
+  TEST_BUILTIN_3(fma);
+
+  return 0;
+}
@@ -1,6 +1,9 @@
-// REQUIRES: gpu, cuda
+// REQUIRES: cuda
 
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3  %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out
+// TODO: Currently the CI does not have a sm_80 capable machine. Enable the test
+// execution once it does.
+// RUNx: %t.out
 //
 // Specifying the sm version via the --cuda-gpu-arch flag is necessary
 // for the Nvidia case.  DPC++ JIT compilation is not
@@ -11,6 +14,8 @@
 
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
+using sycl::ext::oneapi::experimental::bfloat16;
+constexpr float bf16_eps = 0.00390625;
 
 // Example usage of Nvidia matrix multiply.
 // Optimizations such as memory paddings for avoiding bank conflicts are not
@@ -63,6 +68,10 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) {
   if constexpr (std::is_same<T1, uint16_t>::value) {
     for (int k = 0; k < Big_K; k++)
       res += make_fp32(A[m * Big_K + k]) * make_fp32(B[k * Big_N + n]);
+  } else if constexpr (std::is_same<T1, bfloat16>::value) {
+    for (int k = 0; k < Big_K; k++)
+      res +=
+          make_fp32(A[m * Big_K + k].raw()) * make_fp32(B[k * Big_N + n].raw());
   } else {
     for (int k = 0; k < Big_K; k++)
 
@@ -105,7 +114,7 @@ void test() {
     for (int i = 0; i < Big_K * Big_N; i++) {
       B[i] = make_bf16(0.1f * (i % 10));
     }
-  } else {
+  } else if constexpr (!std::is_same<T1, bfloat16>::value) {
     for (int i = 0; i < Big_M * Big_K; i++) {
       A[i] = i % 100;
     }
@@ -114,72 +123,103 @@ void test() {
       B[i] = i % 100;
     }
   }
+  {
+    buffer<T1, 1> bufA(A, range<1>(Big_M * Big_K));
+    buffer<T1, 1> bufB(B, range<1>(Big_K * Big_N));
+    buffer<T2, 1> bufC(C, range<1>(Big_M * Big_N));
+    buffer<T2, 1> bufD(D, range<1>(Big_M * Big_N));
+
+    queue q;
+    // currently bfloat16 has to be initialized on device
+    if constexpr (std::is_same<T1, bfloat16>::value) {
+      q.submit([&](handler &cgh) {
+        auto accA = bufA.template get_access<access::mode::write>(cgh);
+
+        cgh.parallel_for<KernelName<bfloat16, class copyA, M, K, N>>(
+            range<1>(Big_M * Big_K), [=](item<1> item) {
+              auto i = item.get_linear_id();
+              accA[i] = 0.1f * (i % 10);
+            });
+      });
+
+      q.submit([&](handler &cgh) {
+        auto accB = bufB.template get_access<access::mode::write>(cgh);
+
+        cgh.parallel_for<KernelName<bfloat16, class copyB, M, K, N>>(
+            range<1>(Big_K * Big_N), [=](item<1> item) {
+              auto i = item.get_linear_id();
+              accB[i] = 0.1f * (i % 10);
+            });
+      });
+    }
+
+    q.submit([&](handler &cgh) {
+      auto accC = bufC.template get_access<access::mode::read_write>(cgh);
+      auto accA = bufA.template get_access<access::mode::read_write>(cgh);
+      auto accB = bufB.template get_access<access::mode::read_write>(cgh);
+      auto accD = bufD.template get_access<access::mode::read_write>(cgh);
+
+      range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP};
+      range<2> GlobalRange = {Sub_Tiles_M,
+                              Sub_Tiles_N * N_THREADS_PER_MATRIX_OP};
+
+      cgh.parallel_for<KernelName<T1, T2, M, K, N>>(
+          nd_range<2>(GlobalRange, LocalRange),
+          [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
+            sycl::sub_group sg = item.get_sub_group();
+            const auto m =
+                item.get_group().get_group_id()[0]; // row id of current
+                                                    // submatrix of BIG C matrix
+            const auto n =
+                item.get_group().get_group_id()[1]; // column id of current
+                                                    // submatrix of BIG C matrix
+
+            joint_matrix<T1, matrix_use::a, M, K, matrix_layout::row_major>
+                sub_a;
+
+            joint_matrix<T1, matrix_use::b, K, N, matrix_layout::row_major>
+                sub_b;
+
+            joint_matrix<T2, matrix_use::accumulator, M, N,
+                         matrix_layout::row_major>
+                sub_c;
+
+            joint_matrix_load(
+                sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, Big_N);
+
+            for (int k = 0; k < Sub_Tiles_K;
+                 k++) // row/col id of current submatrix of BIG A/B matrices
+            {
+              joint_matrix_load(sg, sub_a,
+                                accA.get_pointer() + (k * K) + (m * M * Big_K),
+                                Big_K);
+
+              joint_matrix_load(sg, sub_b,
+                                accB.get_pointer() + (k * K * Big_N) + (n * N),
+                                Big_N);
+
+              sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+            }
+            joint_matrix_store(
+                sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N);
+          });
+
+      q.wait();
+  }
 
-  buffer<T1, 1> bufA(A, range<1>(Big_M * Big_K));
-  buffer<T1, 1> bufB(B, range<1>(Big_K * Big_N));
-  buffer<T2, 1> bufC(C, range<1>(Big_M * Big_N));
-  buffer<T2, 1> bufD(D, range<1>(Big_M * Big_N));
-
-  queue q;
-  q.submit([&](handler &cgh) {
-    auto accC = bufC.template get_access<access::mode::read_write>(cgh);
-    auto accA = bufA.template get_access<access::mode::read_write>(cgh);
-    auto accB = bufB.template get_access<access::mode::read_write>(cgh);
-    auto accD = bufD.template get_access<access::mode::read_write>(cgh);
-
-    range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP};
-    range<2> GlobalRange = {Sub_Tiles_M, Sub_Tiles_N * N_THREADS_PER_MATRIX_OP};
-
-    cgh.parallel_for<KernelName<T1, T2, M, K, N>>(
-        nd_range<2>(GlobalRange, LocalRange), [=
-    ](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-          const auto m =
-              item.get_group()
-                  .get_id()[0]; // row id of current submatrix of BIG C matrix
-          const auto n =
-              item.get_group().get_id()[1]; // column id of current
-                                            // submatrix of BIG C matrix
-
-          joint_matrix<T1, matrix_use::a, M, K, matrix_layout::row_major> sub_a;
-
-          joint_matrix<T1, matrix_use::b, K, N, matrix_layout::row_major> sub_b;
-
-          joint_matrix<T2, matrix_use::accumulator, M, N,
-                       matrix_layout::row_major>
-              sub_c;
-
-          joint_matrix_load(
-              sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, Big_N);
-
-          for (int k = 0; k < Sub_Tiles_K;
-               k++) // row/col id of current submatrix of BIG A/B matrices
-          {
-            joint_matrix_load(sg, sub_a,
-                              accA.get_pointer() + (k * K) + (m * M * Big_K),
-                              Big_K);
-
-            joint_matrix_load(sg, sub_b,
-                              accB.get_pointer() + (k * K * Big_N) + (n * N),
-                              Big_N);
-
-            sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
-          }
-          joint_matrix_store(
-              sg, sub_c, accD.get_pointer() + (m * M) * Big_N + n * N, Big_N);
-        });
-  });
-
-  q.wait();
-
-  const auto host_accessor = bufD.template get_access<access::mode::read>();
   for (int m = 0; m < Big_M; m++)
     for (int n = 0; n < Big_N; n++) {
-
-      assert((host_accessor[m * Big_N + n] ==
-              matrix_ref_mn<T1, T2, Big_N, Big_K>(m, n, A, B, C)));
+      if constexpr (std::is_same<T1, bfloat16>::value) {
+        auto res_device = matrix_ref_mn<T1, T2, Big_N, Big_K>(m, n, A, B, C);
+        assert(fabs(2 * (D[m * Big_N + n] - res_device)) /
+                   (D[m * Big_N + n] + res_device) <
+               bf16_eps * 2);
+      } else {
+        assert((D[m * Big_N + n] ==
+                matrix_ref_mn<T1, T2, Big_N, Big_K>(m, n, A, B, C)));
+      }
     }
-};
+  };
 
 int main() {
 
@@ -203,10 +243,14 @@ int main() {
 
   test<double, double, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 4, 8>();
 
-  // A/B bf16
+  // A/B bf16 using storage type
   test<uint16_t, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 16, 16>();
   test<uint16_t, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16, 32>();
   test<uint16_t, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32, 16, 8>();
 
+  test<bfloat16, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 16, 16, 16>();
+  test<bfloat16, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 8, 16, 32>();
+  test<bfloat16, float, SUB_TILES_M, SUB_TILES_K, SUB_TILES_N, 32, 16, 8>();
+
   return 0;
 };