int4 with bf16 support

DDEle · DDEle · commit 4c65a4022637 · 2024-07-11T05:40:44.000Z
diff --git a/include/common/core/explicit_conv.hpp b/include/common/core/explicit_conv.hpp
@@ -62,6 +62,19 @@ xetla_cvt(xetla_vector<T_src, N> src) {
   return dst;
 }
 
+/// @brief xetla explicit data conversion, bf16->fp16 (via vector assignment).
+/// @tparam T_dst is the destination element type; overload requires fp16.
+/// @tparam T_src is the source element type; overload requires bf16.
+/// @tparam N is the element number in xetla_vector.
+template <typename T_dst, typename T_src, int N>
+__XETLA_API typename std::enable_if_t<
+    std::is_same<T_dst, fp16>::value && std::is_same<T_src, bf16>::value,
+    xetla_vector<T_dst, N>>
+xetla_cvt(xetla_vector<T_src, N> src) {
+  xetla_vector<T_dst, N> dst = src;
+  return dst;
+}
+
 /// @brief xetla explicit data conversion, bf16->fp32.
 /// @tparam T_dst is the bfloat16 data type.
 /// @tparam T_src is the float32 data type.
diff --git a/include/experimental/kernel/gemm/impl/int4_dequantize_kslicing_xe.hpp b/include/experimental/kernel/gemm/impl/int4_dequantize_kslicing_xe.hpp
@@ -526,6 +526,10 @@ class gemm_universal_t<
   template <quant_mode quant_mode>
   static bool can_implement(arguments_t<quant_mode>& args) {
     bool implementable = true;
+    if (arch_tag == gpu_arch::XeLpg) {
+      implementable &= !std::is_same_v<dtype_a, bf16>; // XeLpg arch doesn't
+                                                       // have bf16 related isa.
+    }
     if (gemm_t::msg_type_a != msg_type::unaligned_2d) {
       if (gemm_t::msg_type_a == msg_type::block_2d) {
         implementable &= kernel::block_2d<arch_tag, dtype_a>::check_tensor(
diff --git a/tests/integration/gemm/int4_dequantization/main.cpp b/tests/integration/gemm/int4_dequantization/main.cpp
@@ -229,8 +229,9 @@ void dequantize_gemm_run(uint32_t iter) {
       compute_attr_t<data_type_acc_in, data_type_acc_in, data_type_acc>;
   using perf_tuning_knob = xetla::group::
       perf_tuning_knob_t<sg_tile_k, prefetch_distance, periodic_sync_interval>;
-  
-  static constexpr quant_info quant_info{quant_mode::S4_ASYM, Test::dequant_s, layout_b};
+
+  static constexpr quant_info quant_info{
+      quant_mode::S4_ASYM, Test::dequant_s, layout_b};
 
   using compute_policy = xetla::group::compute_policy_int4_dequantize<
       compute_attr,
diff --git a/tests/integration/gemm/int4_dequantization_bias/main_client.cpp b/tests/integration/gemm/int4_dequantization_bias/main_client.cpp
@@ -1043,4 +1043,4 @@ REGISTER_TYPED_TEST_SUITE_P(dequantize_gemm_act_shuf_test, esimd);
 INSTANTIATE_TYPED_TEST_SUITE_P(
     dequantize_gemm_act_shuf_test_suite,
     dequantize_gemm_act_shuf_test,
-    tests);
+    tests);
diff --git a/tests/integration/gemv/int4/main.cpp b/tests/integration/gemv/int4/main.cpp
@@ -27,6 +27,7 @@ constexpr int ITER = 200;
 #endif
 constexpr size_t UNDEFINED_DATA_SIZE = 1024;
 
+template <typename scalar_t>
 class test_col_major_1 {
  public:
   // Extract the parameters required by different test cases
@@ -48,9 +49,9 @@ class test_col_major_1 {
   static constexpr mem_layout layout_b = mem_layout::col_major;
   static constexpr mma_engine mma_eng = mma_engine::fpu;
   static constexpr gpu_arch arch = gpu_arch::XeLpg;
-  using data_type_a = fp16;
+  using data_type_a = scalar_t;
   using data_type_b = int4x8;
-  using data_type_c = fp16;
+  using data_type_c = scalar_t;
 };
 class test_col_major_2 {
  public:
@@ -173,11 +174,11 @@ std::vector<data_type_acc_in> dequantize_weight(
           (j / step) * (matrix_n / pack_radio) + i / pack_radio;
       int start_out =
           layout_b == mem_layout::row_major ? 0 : i * matrix_k + j * pack_radio;
+      data_type_zero_pt zp_value = zero_pt[start_zero_pt_in];
+      zp_value = zp_value >> (4 * (i % pack_radio));
       for (uint32_t jj = 0; jj < step; jj++) {
         std::vector<fp16> dequant_fp16 = convert_int4<quant_mode>(
-            b[start_b_in + jj],
-            scale[start_scale_in],
-            zero_pt[start_zero_pt_in] >> (4 * (i % pack_radio)));
+            b[start_b_in + jj], scale[start_scale_in], zp_value);
         for (uint32_t jjj = 0; jjj < dequant_fp16.size(); jjj++) {
           b_out[start_out + pack_radio * jj + jjj] = dequant_fp16[jjj];
         }
@@ -551,9 +552,11 @@ void dequantize_gemv_run(int iter) {
   // performance
   prof.print_profiling_result(profiling_selector::GPU);
   // check result
-  std::vector<typename Test::data_type_a> dequantize_b =
-      dequantize_weight<dequant_s, layout_b, compute_policy::quant_mode>(
-          matrix_k, matrix_n, B_h, scale_h, zero_pt_h);
+  std::vector<typename Test::data_type_a> dequantize_b = dequantize_weight<
+      dequant_s,
+      layout_b,
+      compute_policy::quant_mode,
+      data_type_c>(matrix_k, matrix_n, B_h, scale_h, zero_pt_h);
 
   queue.memcpy((void*)C_h, (void*)C_d, size_c * sizeof(data_type_c)).wait();
   ASSERT_EQ(
@@ -585,6 +588,12 @@ void dequantize_gemv_run(int iter) {
   free(Cnt_d, context);
 }
 
+// Specialization for the `void` entry of the test type list: skip the test.
+template <>
+void dequantize_gemv_run<void>(int) {
+  GTEST_SKIP();
+}
+
 template <typename T>
 class dequantize_gemv_test : public ::testing::Test {};
 TYPED_TEST_SUITE_P(dequantize_gemv_test);
@@ -594,7 +603,11 @@ TYPED_TEST_P(dequantize_gemv_test, esimd) {
 }
 
 REGISTER_TYPED_TEST_SUITE_P(dequantize_gemv_test, esimd);
-using tests = ::testing::Types<test_col_major_1>;
+using tests = ::testing::Types< //
+    test_col_major_1<fp16>,
+    test_col_major_1<bf16>,
+    // test_col_major_2,
+    void>;
 
 INSTANTIATE_TYPED_TEST_SUITE_P(
     dequantize_gemv_test_suite,