[GPU]Xetla support MTL (#176)

sunjiweiswift · DDEle · airMeng · web-flow · commit c4e854099321 · 2024-04-23T10:36:17.000+08:00
* bugfix

* add AOT

* Update examples/CMakeLists.txt

Co-authored-by: Yi DING <yi1.ding@intel.com>

* XETLA_PRINTF replace cout

* SLMSIZE 128KB 64KB

* Update include/subgroup/tile/impl/load_xe.hpp

Co-authored-by: Meng, Hengyu <hengyu.meng@intel.com>

* use arch_attr_t

* add more shape for int4

* bugfix dump_mat

* save

---------

Co-authored-by: Yi DING <yi1.ding@intel.com>
Co-authored-by: Meng, Hengyu <hengyu.meng@intel.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -43,9 +43,16 @@ if (${LOG} STREQUAL "on")
     add_definitions(-DLOG_PRINT)
 endif ()
 
+# AOT device
+set(AOT_DEVICE "" CACHE STRING "Set device list for AOT build")
+
 add_compile_options(-fsycl)
 add_link_options(-fsycl)
 if(UNIX)
+    if (AOT_DEVICE)
+        add_compile_options(-fsycl-targets=spir64_gen)
+        add_link_options(-fsycl-targets=spir64_gen -Xs "-device ${AOT_DEVICE}") # MTL
+    endif()
     add_compile_options(-fp-model=precise -Wall -Wextra -Werror)
     add_link_options(-lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm)
     link_libraries(-lgtest -lgtest_main)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -2,7 +2,7 @@ include_directories(${CMAKE_SOURCE_DIR}/include)
 include_directories(${CMAKE_SOURCE_DIR})
 
 # Creates a separate device code module for each SYCL* kernel
-# so that kernel for Dg2 and Xe will be JIT separately
+# so that kernel for XeHpc, XeHpg, and XeLpg will be JIT separately
 add_compile_options(-fsycl-device-code-split=per_kernel)
 add_link_options(-fsycl-device-code-split=per_kernel)
 
diff --git a/include/common/core/arch_config.hpp b/include/common/core/arch_config.hpp
@@ -108,10 +108,10 @@ struct register_attr_t {};
 template <grf_mode grf_num_mode, gpu_arch arch_tag>
 struct client_register_attr_base_t {
   static constexpr uint32_t acc_reg_in_bytes =
-      (grf_num_mode == grf_mode::normal) ? 4 * 32 : 8 * 32;
+      (grf_num_mode == grf_mode::normal) ? 4 * 64 : 8 * 64;
   static constexpr uint32_t grf_in_bytes =
-      (grf_num_mode == grf_mode::normal) ? 128 * 32 : 256 * 32;
-  static constexpr uint32_t reg_in_bytes = 32;
+      (grf_num_mode == grf_mode::normal) ? 128 * 64 : 256 * 64;
+  static constexpr uint32_t reg_in_bytes = 64;
 };
 
 template <grf_mode grf_num_mode>
@@ -139,7 +139,7 @@ struct client_arch_attr_base_t {
   template <msg_type message_type = msg_type::block_2d>
   using load_store_attr = load_store_attr_t<message_type, gpu_arch::XeHpg>;
 
-  template <grf_mode grf_num_mode = grf_mode::normal>
+  template <grf_mode grf_num_mode = grf_mode::double_grf>
   using register_attr = register_attr_t<grf_num_mode, gpu_arch::XeHpg>;
 
   using mma_attr = mma_attr_t<gpu_arch::XeHpg>;
diff --git a/include/experimental/group/gemm/compute_policy.hpp b/include/experimental/group/gemm/compute_policy.hpp
@@ -82,8 +82,9 @@ struct compute_policy_int4_dequantize<
 
   static constexpr bool is_int4_matB_policy = true;
 
-  static constexpr uint32_t block_size_x_b =
-      arch_attr_t<arch_tag>::mma_attr::mma_n_in_elem;
+  static constexpr uint32_t block_size_x_b = (mma_engine == mma_engine::xmx)
+      ? arch_attr_t<arch_tag>::mma_attr::mma_n_in_elem
+      : 32;
   static constexpr uint32_t block_bytes_y_b = 32;
   static_assert(
       block_bytes_x_a == block_bytes_y_b,
diff --git a/include/experimental/group/gemm/impl/int4_dequantize_xe.hpp b/include/experimental/group/gemm/impl/int4_dequantize_xe.hpp
@@ -560,20 +560,6 @@ class gemm_t<
   }
 
  private:
-  template <typename T>
-  void dump_mat(T mat, size_t tile_x, size_t tile_y) {
-#pragma unroll
-    for (size_t row = 0; row < tile_x; row++) {
-#pragma unroll
-      for (size_t col = 0; col < tile_y; col++) {
-        sycl::ext::oneapi::experimental::printf(
-            "%0.1f ", (float)(sycl::half)mat.reg[row * tile_y + col]);
-      }
-      sycl::ext::oneapi::experimental::printf("\n ");
-    }
-    sycl::ext::oneapi::experimental::printf("\n ");
-  }
-
   inline void dequantize(
       matB_acc_t& matB_acc,
       matB_t& matB,
diff --git a/include/experimental/kernel/gemm/impl/int4_dequantize_kslicing_xe.hpp b/include/experimental/kernel/gemm/impl/int4_dequantize_kslicing_xe.hpp
@@ -451,9 +451,11 @@ class gemm_universal_t<
   static cl::sycl::range<3> get_local_range() {
     uint32_t local_range_m = (wg_tile_m + sg_tile_m - 1) / sg_tile_m;
     uint32_t local_range_n = (wg_tile_n + sg_tile_n - 1) / sg_tile_n;
-    // std::cout << "Local range: {" << num_local_kslicing << ", " <<
-    // local_range_m
-    //           << ", " << local_range_n << "} \n";
+    XETLA_PRINTF(
+        "Local range: {%d, %d, %d}",
+        num_local_kslicing,
+        local_range_m,
+        local_range_n);
     assert(local_range_m * local_range_n * num_local_kslicing <= 32);
     return cl::sycl::range<3>{num_local_kslicing, local_range_m, local_range_n};
   };
@@ -471,8 +473,11 @@ class gemm_universal_t<
     uint32_t group_range_m = (matrix_m + wg_tile_m - 1) / wg_tile_m;
     uint32_t group_range_n = (matrix_n + wg_tile_n - 1) / wg_tile_n;
     group_swizzle_t::update_group_range(group_range_m, group_range_n);
-    // std::cout << "Group range: {" << num_global_kslicing << ", "
-    //           << group_range_m << ", " << group_range_n << "} \n";
+    XETLA_PRINTF(
+        "Group range: {%d, %d, %d}",
+        num_global_kslicing,
+        group_range_m,
+        group_range_n);
     return cl::sycl::range<3>{
         num_global_kslicing, group_range_m, group_range_n};
   };
diff --git a/include/subgroup/tile/impl/load_xe.hpp b/include/subgroup/tile/impl/load_xe.hpp
@@ -28,6 +28,7 @@ namespace gpu::xetla::subgroup {
 namespace detail {
 template <typename tile_t, typename payload_t>
 struct check_load_type {
+  static constexpr bool is_lsc_gather = true;
   static constexpr bool is_global_block_2d =
       (payload_t::memory_space == mem_space::global &&
        (payload_t::message_type == msg_type::block_2d) &&
@@ -444,6 +445,7 @@ template <
     typename payload_t>
 __XETLA_API typename std::enable_if_t<
     detail::check_load_type<tile_t, payload_t>::is_global_block_2d &&
+    detail::check_load_type<tile_t, payload_t>::is_lsc_gather &&
     payload_t::arch_tag <= gpu_arch::XeHpg>
 tile_load(tile_t& tile, payload_t& payload) {
   using dtype = typename payload_t::dtype;
@@ -531,6 +533,77 @@ tile_load(tile_t& tile, payload_t& payload) {
   }
 }
 
+/// @brief This function loads data from unaligned-2D memory surface.
+/// Loads an array of rectangular regions (X,Y)..(X+W,Y+H) from memory into
+/// registers. Each block will be loaded serially by its corresponding payload.
+/// @tparam tile_t Is the tile_t struct contains registers.
+/// These registers will be the destination of load operation.
+/// @tparam payload_t Is the mem_payload_t struct describing the memory
+/// information. Payload indicates the source of load operation.
+/// @tparam L1 Is the cache hint for L1 cache.
+/// @tparam L3 Is the cache hint for L3 cache.
+/// @param tile Is the tile object with type tile_t, holds the return data of
+/// the loads.
+/// @param payload Is the payload object with type payload_t. Contains all the
+/// information for loads.
+/// @return No return, update in place.
+template <
+    cache_hint L1 = cache_hint::cached,
+    cache_hint L3 = cache_hint::cached,
+    typename tile_t,
+    typename payload_t>
+__XETLA_API typename std::enable_if_t<
+    detail::check_load_type<tile_t, payload_t>::is_global_block_2d &&
+    !detail::check_load_type<tile_t, payload_t>::is_lsc_gather &&
+    !arch_has_2d_load_store(payload_t::arch_tag)>
+tile_load(tile_t& tile, payload_t& payload) {
+  using dtype = typename payload_t::dtype;
+  using tile_desc = typename payload_t::tile_desc;
+  using load_dtype = typename payload_t::mem_dtype;
+  constexpr uint32_t load_elems = payload_t::simd_exec_size;
+  constexpr uint32_t pack_factor = payload_t::pack_factor;
+
+#pragma unroll
+  for (uint32_t i = 0; i < tile_desc::num_block_y; i++) {
+    uint32_t offset_y = i * tile_desc::block_size_y;
+#pragma unroll
+    for (uint32_t j = 0; j < tile_desc::num_block_x; j++) {
+      uint32_t offset_x = j * tile_desc::block_size_x;
+      auto reg_sub = tile.reg.xetla_select<tile_desc::block_elems, 1>(
+          (i * tile_desc::num_block_x + j) * tile_desc::block_elems);
+#pragma unroll
+      for (uint32_t sub_block_y = 0; sub_block_y < tile_desc::block_size_y;
+           sub_block_y += 1) {
+        xetla_vector<load_dtype, load_elems> reg_tmp = 0;
+        uint32_t address_offset = payload_t::trans
+            ? offset_x * payload.pitch_in_bytes +
+                (offset_y + sub_block_y) * sizeof(dtype)
+            : offset_x * sizeof(dtype) +
+                (offset_y + sub_block_y) * payload.pitch_in_bytes;
+        reg_tmp = xetla_load_global<
+            load_dtype,
+            payload_t::simd_exec_size,
+            data_size::default_size,
+            L1,
+            L3>(payload.base_ptr, payload.base_offset + address_offset);
+
+        reg_sub
+            .xetla_select<load_elems * pack_factor, 1>(
+                sub_block_y * tile_desc::block_size_x)
+            .xetla_format<load_dtype>() = reg_tmp;
+      }
+    }
+  }
+
+  if constexpr (payload_t::trans) {
+    SW_BARRIER();
+    tile_transpose(tile);
+  }
+  if constexpr (payload_t::mem_transform) {
+    SW_BARRIER();
+    vnni_convert(tile);
+  }
+}
 /// @brief This function loads data from unaligned-2D memory surface.
 /// Loads an array of rectangular regions (X,Y)..(X+W,Y+H) from memory into
 /// registers. Each block will be loaded serially by its corresponding payload.
diff --git a/include/subgroup/tile/impl/op_function.hpp b/include/subgroup/tile/impl/op_function.hpp
@@ -676,4 +676,35 @@ layout_convert(T_dst& dst, T_src& src) {
     }
   }
 }
+
+template <typename T>
+void dump_mat(
+    T mat,
+    size_t tile_x = T::tile_size_x,
+    size_t tile_y = T::tile_size_y) {
+#pragma unroll
+  for (size_t row = 0; row < tile_y; row++) {
+#pragma unroll
+    for (size_t col = 0; col < tile_x; col++) {
+      sycl::ext::oneapi::experimental::printf(
+          "%d ", (int)(sycl::half)mat.reg[row * tile_x + col]);
+    }
+    sycl::ext::oneapi::experimental::printf("\n ");
+  }
+  sycl::ext::oneapi::experimental::printf("\n ");
+}
+template <typename T>
+void dump_mat_reg(T mat, size_t tile_x, size_t tile_y) {
+#pragma unroll
+  for (size_t row = 0; row < tile_y; row++) {
+#pragma unroll
+    for (size_t col = 0; col < tile_x; col++) {
+      sycl::ext::oneapi::experimental::printf(
+          "%d ", (int)(sycl::half)mat[row * tile_x + col]);
+    }
+    sycl::ext::oneapi::experimental::printf("\n ");
+  }
+  sycl::ext::oneapi::experimental::printf("\n ");
+}
+
 } // namespace gpu::xetla::subgroup
diff --git a/include/subgroup/tile/impl/payload_xe.hpp b/include/subgroup/tile/impl/payload_xe.hpp
@@ -404,14 +404,15 @@ struct mem_payload_t<
   // for pvc, we can use simd16 or simd32
   static constexpr uint32_t min_store_bytes = 16 * sizeof(dtype);
   static constexpr uint32_t max_store_bytes = 32 * sizeof(dtype);
-  static constexpr uint32_t num_channel =
+  static constexpr uint32_t simd_channel =
       ((tile_bytes % max_store_bytes) == 0 &&
        (block_bytes % max_store_bytes) == 0)
       ? 32
       : 16;
-
-  static constexpr uint32_t num_channel_x = block_size_x;
-  static constexpr uint32_t num_channel_y = num_channel / num_channel_x;
+  static constexpr uint32_t num_channel =
+      (simd_channel >= block_size_x) ? block_size_x : simd_channel;
+  static constexpr uint32_t num_channel_x = block_size_x; // 16
+  static constexpr uint32_t num_channel_y = num_channel / num_channel_x; // 1
   static constexpr uint32_t store_elems = num_channel_y * block_size_x;
 
   xetla_vector<uint32_t, num_channel> channel_offset;
diff --git a/tests/integration/data_transformer/common.hpp b/tests/integration/data_transformer/common.hpp
@@ -122,6 +122,7 @@ class TestBase {
   using data_type_in = float;
   using data_type_out = bf16;
   using data_type_acc = float;
+  static constexpr gpu_arch gpu_arch = gpu_arch::XeHpc;
 };
 
 class Test_fp32tobf16_128_64 : public TestBase {
diff --git a/tests/integration/default_config/group_gemm/common.hpp b/tests/integration/default_config/group_gemm/common.hpp
@@ -46,6 +46,7 @@ class TestBase {
     return name;
   }
   static constexpr mma_engine engine = mma_engine::xmx;
+  static constexpr gpu_arch gpu_arch = gpu_arch::XeHpc;
 };
 
 class Test0 : public TestBase {
diff --git a/tests/integration/default_config/kernel_gemm/common.hpp b/tests/integration/default_config/kernel_gemm/common.hpp
@@ -46,6 +46,7 @@ class TestBase {
     return name;
   }
   static constexpr mma_engine engine = mma_engine::xmx;
+  static constexpr gpu_arch gpu_arch = gpu_arch::XeHpc;
 };
 
 class Test0 : public TestBase {
diff --git a/tests/integration/gemm/bf16/common.hpp b/tests/integration/gemm/bf16/common.hpp
@@ -46,6 +46,7 @@ class TestBase {
     return name;
   }
   static constexpr mma_engine engine = mma_engine::xmx;
+  static constexpr gpu_arch gpu_arch = gpu_arch::XeHpc;
 };
 
 class Test0 : public TestBase {
diff --git a/tests/integration/gemm/fp16/common.hpp b/tests/integration/gemm/fp16/common.hpp
@@ -46,17 +46,18 @@ class TestBase {
     return name;
   }
   static constexpr mma_engine engine = mma_engine::xmx;
+  static constexpr gpu_arch gpu_arch = gpu_arch::XeHpg;
 };
 
 class Test0 : public TestBase {
  public:
   static constexpr size_t mat_m = 256;
   static constexpr size_t mat_n = 256;
   static constexpr size_t mat_k = 256;
-  static constexpr size_t wg_m = 256;
+  static constexpr size_t wg_m = 1;
   static constexpr size_t wg_n = 256;
-  static constexpr size_t sg_m = 32;
-  static constexpr size_t sg_n = 64;
+  static constexpr size_t sg_m = 1;
+  static constexpr size_t sg_n = 32;
   static constexpr size_t sg_k = 32;
   static constexpr uint32_t global_kslicing = 1;
   static constexpr uint32_t local_kslicing = 1;
@@ -66,6 +67,7 @@ class Test0 : public TestBase {
   using data_type_b = fp16;
   using data_type_c = fp16;
   using data_type_acc = float;
+  static constexpr mma_engine engine = mma_engine::fpu;
 };
 
 class Test1 : public TestBase {
@@ -483,4 +485,5 @@ using fp16_gemm_func = fp16_gemm_test_func<
     Test::layout_b,
     Test::global_kslicing,
     Test::local_kslicing,
-    Test::engine>;
+    Test::engine,
+    Test::gpu_arch>;
diff --git a/tests/integration/gemm/fp16/kernel_func.hpp b/tests/integration/gemm/fp16/kernel_func.hpp
@@ -37,7 +37,8 @@ template <
     mem_layout layout_b,
     uint32_t global_kslicing,
     uint32_t local_kslicing,
-    mma_engine engine>
+    mma_engine engine,
+    gpu_arch gpu_arch>
 struct fp16_gemm_test_func {
   using tile_shape = tile_shape_t<wg_n, wg_m, sg_n, sg_m>;
   static constexpr uint32_t periodic_sync_interval = 8;
@@ -51,14 +52,9 @@ struct fp16_gemm_test_func {
       perf_tuning_knob_t<sg_k, prefetch_distance, periodic_sync_interval>;
   using compute_policy = typename std::conditional<
       (engine == mma_engine::fpu),
-      compute_policy_default_fpu<
-          compute_attr,
-          perf_tuning_knob,
-          gpu_arch::XeHpc>,
-      compute_policy_default_xmx<
-          compute_attr,
-          perf_tuning_knob,
-          gpu_arch::XeHpc>>::type;
+      compute_policy_default_fpu<compute_attr, perf_tuning_knob, gpu_arch>,
+      compute_policy_default_xmx<compute_attr, perf_tuning_knob, gpu_arch>>::
+      type;
 
   using mem_desc_input_a = mem_desc_t<dtype_a, layout_a, mem_space::global>;
   using mem_desc_input_b = mem_desc_t<dtype_b, layout_b, mem_space::global>;
@@ -69,12 +65,11 @@ struct fp16_gemm_test_func {
       gemm_t<compute_policy, tile_shape, mem_desc_input_a, mem_desc_input_b>;
 
   using epilogue_t = epilogue_t<
-      epilogue_policy_default<gpu_arch::XeHpc>,
+      epilogue_policy_default<gpu_arch>,
       tile_shape,
       mem_desc_output_c>;
 
-  using group_swizzle =
-      gpu::xetla::kernel::group_swizzle_default<gpu_arch::XeHpc>;
+  using group_swizzle = gpu::xetla::kernel::group_swizzle_default<gpu_arch>;
 
   using dispatch_policy =
       dispatch_policy_kslicing<group_swizzle, global_kslicing, local_kslicing>;
diff --git a/tests/integration/gemm/fp32/CMakeLists.txt b/tests/integration/gemm/fp32/CMakeLists.txt
@@ -4,3 +4,4 @@ string(PREPEND ProjectId "gemm_")
 
 FILE(GLOB src main.cpp)
 add_integration_test(${ProjectId} ${src})
+add_integration_test(${ProjectId}_dg2 main_dg2.cpp)
diff --git a/tests/integration/gemm/fp32/common.hpp b/tests/integration/gemm/fp32/common.hpp
@@ -49,6 +49,7 @@ class TestBase {
 
   static constexpr size_t batch_size = 1;
   static constexpr mma_engine engine = mma_engine::fpu;
+  static constexpr gpu_arch gpu_arch = gpu_arch::XeHpc;
 };
 
 class Test1 : public TestBase {
diff --git a/tests/integration/gemm/fp32/main_dg2.cpp b/tests/integration/gemm/fp32/main_dg2.cpp
diff --git a/tests/integration/gemm/int4_dequantization_bias/CMakeLists.txt b/tests/integration/gemm/int4_dequantization_bias/CMakeLists.txt
diff --git a/tests/integration/gemm/int4_dequantization_bias/main_client.cpp b/tests/integration/gemm/int4_dequantization_bias/main_client.cpp
diff --git a/tests/integration/gemm/int8/common.hpp b/tests/integration/gemm/int8/common.hpp
diff --git a/tests/integration/gemm/tf32/test.hpp b/tests/integration/gemm/tf32/test.hpp
diff --git a/tests/integration/gemm/unaligned_bf16/common.hpp b/tests/integration/gemm/unaligned_bf16/common.hpp
diff --git a/tests/utils/buff_compare.hpp b/tests/utils/buff_compare.hpp
diff --git a/tests/utils/execution.hpp b/tests/utils/execution.hpp