78 changes: 51 additions & 27 deletions sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp
@@ -30,6 +30,14 @@ uint32_t to_uint32_t(sycl::marray<bfloat16, N> x, size_t start) {
}
} // namespace detail

// According to the bfloat16 format, a NaN value's exponent field is 0xFF and
// its significand has non-zero bits.
template <typename T>
std::enable_if_t<std::is_same<T, bfloat16>::value, T> isnan(T x) {
oneapi::detail::Bfloat16StorageT XBits = oneapi::detail::bfloat16ToBits(x);
return (((XBits & 0x7F80) == 0x7F80) && (XBits & 0x7F)) ? true : false;
}
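
A rough standalone sketch of the same bit test, assuming Bfloat16StorageT is a plain unsigned 16-bit integer (bit 15 = sign, bits 14..7 = exponent, bits 6..0 = significand); the helper name is made up for illustration:

#include <cstdint>

inline bool is_bf16_nan_bits(std::uint16_t bits) {
  // Exponent all ones (0x7F80 once the sign bit is masked off) plus a non-zero
  // significand means NaN; a zero significand would be +/-infinity instead.
  return ((bits & 0x7F80) == 0x7F80) && ((bits & 0x007F) != 0);
}

// is_bf16_nan_bits(0x7FC0) -> true  (canonical quiet NaN)
// is_bf16_nan_bits(0x7F80) -> false (+infinity)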

template <typename T>
std::enable_if_t<std::is_same<T, bfloat16>::value, T> fabs(T x) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
@@ -74,20 +82,31 @@ std::enable_if_t<std::is_same<T, bfloat16>::value, T> fmin(T x, T y) {
oneapi::detail::Bfloat16StorageT YBits = oneapi::detail::bfloat16ToBits(y);
return oneapi::detail::bitsToBfloat16(__clc_fmin(XBits, YBits));
#else
std::ignore = x;
std::ignore = y;
throw runtime_error(
"bfloat16 math functions are not currently supported on the host device.",
PI_ERROR_INVALID_DEVICE);
static const oneapi::detail::Bfloat16StorageT CanonicalNan = 0x7FC0;
oneapi::detail::Bfloat16StorageT XBits = oneapi::detail::bfloat16ToBits(x);
oneapi::detail::Bfloat16StorageT YBits = oneapi::detail::bfloat16ToBits(y);
if (isnan(x) && isnan(y))
return oneapi::detail::bitsToBfloat16(CanonicalNan);

if (isnan(x))
return y;
else if (isnan(y))
return x;
else if (((XBits | YBits) ==
static_cast<oneapi::detail::Bfloat16StorageT>(0x8000)) &&
!(XBits & YBits))
return oneapi::detail::bitsToBfloat16(
static_cast<oneapi::detail::Bfloat16StorageT>(0x8000));
else
return (x < y) ? x : y;
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
}
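
The host fallback above follows the usual fmin conventions for NaN and signed zero; a minimal float-based sketch of the same decision order, for illustration only:

#include <cmath>
#include <limits>

inline float fmin_like(float x, float y) {
  if (std::isnan(x) && std::isnan(y))
    return std::numeric_limits<float>::quiet_NaN(); // both NaN -> canonical NaN
  if (std::isnan(x))
    return y; // exactly one NaN -> the other operand
  if (std::isnan(y))
    return x;
  if (x == 0.0f && y == 0.0f && std::signbit(x) != std::signbit(y))
    return -0.0f; // the {+0, -0} pair -> -0 for fmin
  return (x < y) ? x : y;
}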

template <size_t N>
sycl::marray<bfloat16, N> fmin(sycl::marray<bfloat16, N> x,
sycl::marray<bfloat16, N> y) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
sycl::marray<bfloat16, N> res;

#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
for (size_t i = 0; i < N / 2; i++) {
auto partial_res = __clc_fmin(detail::to_uint32_t(x, i * 2),
detail::to_uint32_t(y, i * 2));
@@ -101,15 +120,12 @@ sycl::marray<bfloat16, N> fmin(sycl::marray<bfloat16, N> x,
oneapi::detail::bfloat16ToBits(y[N - 1]);
res[N - 1] = oneapi::detail::bitsToBfloat16(__clc_fmin(XBits, YBits));
}

return res;
#else
std::ignore = x;
std::ignore = y;
throw runtime_error(
"bfloat16 math functions are not currently supported on the host device.",
PI_ERROR_INVALID_DEVICE);
for (size_t i = 0; i < N; i++) {
res[i] = fmin(x[i], y[i]);
}
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
return res;
}
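
With the host branch in place the marray overload is callable outside a kernel as well; a usage sketch with hypothetical values, assuming <sycl/sycl.hpp> pulls in this header and that bfloat16 lives in sycl::ext::oneapi:

#include <sycl/sycl.hpp>

void host_fmin_demo() {
  using sycl::ext::oneapi::bfloat16;
  sycl::marray<bfloat16, 3> a{bfloat16{1.0f}, bfloat16{-0.0f}, bfloat16{2.5f}};
  sycl::marray<bfloat16, 3> b{bfloat16{0.5f}, bfloat16{0.0f}, bfloat16{3.0f}};
  // Element-wise minimum on the host path; the {-0, +0} pair yields -0.
  auto m = sycl::ext::oneapi::experimental::fmin(a, b); // {0.5, -0.0, 2.5}
  (void)m;
}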

template <typename T>
@@ -119,20 +135,30 @@ std::enable_if_t<std::is_same<T, bfloat16>::value, T> fmax(T x, T y) {
oneapi::detail::Bfloat16StorageT YBits = oneapi::detail::bfloat16ToBits(y);
return oneapi::detail::bitsToBfloat16(__clc_fmax(XBits, YBits));
#else
std::ignore = x;
std::ignore = y;
throw runtime_error(
"bfloat16 math functions are not currently supported on the host device.",
PI_ERROR_INVALID_DEVICE);
static const oneapi::detail::Bfloat16StorageT CanonicalNan = 0x7FC0;
oneapi::detail::Bfloat16StorageT XBits = oneapi::detail::bfloat16ToBits(x);
oneapi::detail::Bfloat16StorageT YBits = oneapi::detail::bfloat16ToBits(y);
if (isnan(x) && isnan(y))
return oneapi::detail::bitsToBfloat16(CanonicalNan);

if (isnan(x))
return y;
else if (isnan(y))
return x;
else if (((XBits | YBits) ==
static_cast<oneapi::detail::Bfloat16StorageT>(0x8000)) &&
!(XBits & YBits))
return oneapi::detail::bitsToBfloat16(0);
else
return (x > y) ? x : y;
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
}
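
The only behavioural difference from the fmin fallback is the result for the signed-zero pair; that bit-level test in isolation, again assuming a plain unsigned 16-bit storage type and a made-up helper name:

#include <cstdint>

// {+0.0, -0.0} is exactly the pair whose bit patterns OR to 0x8000 and AND to 0.
inline bool is_mixed_zero_pair(std::uint16_t a, std::uint16_t b) {
  return ((a | b) == 0x8000u) && ((a & b) == 0u);
}

// is_mixed_zero_pair(0x0000, 0x8000) -> true:  fmax returns +0, fmin returns -0.
// is_mixed_zero_pair(0x8000, 0x8000) -> false: both operands are -0 already.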

template <size_t N>
sycl::marray<bfloat16, N> fmax(sycl::marray<bfloat16, N> x,
sycl::marray<bfloat16, N> y) {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
sycl::marray<bfloat16, N> res;

#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
for (size_t i = 0; i < N / 2; i++) {
auto partial_res = __clc_fmax(detail::to_uint32_t(x, i * 2),
detail::to_uint32_t(y, i * 2));
@@ -146,14 +172,12 @@ sycl::marray<bfloat16, N> fmax(sycl::marray<bfloat16, N> x,
oneapi::detail::bfloat16ToBits(y[N - 1]);
res[N - 1] = oneapi::detail::bitsToBfloat16(__clc_fmax(XBits, YBits));
}
return res;
#else
std::ignore = x;
std::ignore = y;
throw runtime_error(
"bfloat16 math functions are not currently supported on the host device.",
PI_ERROR_INVALID_DEVICE);
for (size_t i = 0; i < N; i++) {
res[i] = fmax(x[i], y[i]);
}
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
return res;
}
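
A quick host-side check of the both-NaN case; the fully qualified spellings of the detail helpers are assumptions based on how they are used inside this header:

#include <cassert>
#include <sycl/sycl.hpp>

void check_canonical_nan() {
  namespace oneapi = sycl::ext::oneapi;
  oneapi::bfloat16 q = oneapi::detail::bitsToBfloat16(0x7FC0); // quiet NaN pattern
  // Both operands are NaN, so the host fallback should hand back 0x7FC0.
  oneapi::bfloat16 r = oneapi::experimental::fmax(q, q);
  assert(oneapi::detail::bfloat16ToBits(r) == 0x7FC0);
}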

template <typename T>