This repository was archived by the owner on Aug 30, 2024. It is now read-only.

Commit 3043b5a

sunjiweiswift, airMeng, and DDEle authored
Xetla int4 mat b col major (#261)
* save
* save(some error with kslicing)
* fix kslicing bug
* save(g128 MTL 270Gflops bug on g32); add UT for gemv
* add Specialized for FPU
* support int scale col_major (with opt 10% perf when g = 32)
* support int4x8 for int32 weight
* save(perf bug with int4x8 load)
* save
* add first token UT
* opt mma code
* opt perf for int4x8
* support load one fp16 data
* support zero_pt
* support ASYM and SYM
* save
* ut improve
* support sg_n > 1
* add #pragma unroll
* support HF zero pt layout K x N, compress int4 along N dimensions
* save
* sg_m = 4 for first token
* Extract dequant func
* update row_major for origin PVC/ARC template
* save(fix HPC 2D load)
* fix XEHPC 2D load
* fix compile for all UT
* sync ipex 20240618
* opt PVC arch
* fix group_qkv
* fix group_qkv
* fix sdp bug
* channel num -> 1 8 16 32
* remove comments of unused code
* add -ftemplate-backtrace-limit=0 only UNIX

---------

Co-authored-by: Meng, Hengyu <[email protected]>
Co-authored-by: Ding, Yi1 <[email protected]>
1 parent e5510c6 commit 3043b5a
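
For context, the int4 weight-only GEMM path touched by this commit rebuilds floating-point values from packed 4-bit weights using per-group scales and, in the ASYM mode, per-group zero points (group size dequant_s, e.g. 32 or 128 along K). Below is a minimal scalar C++ sketch of that dequantization with illustrative names only; it is not the XeTLA device code, which does this on register tiles inside the GEMM inner loop.

#include <cstdint>
#include <vector>

// Hypothetical scalar reference for grouped int4 dequantization.
// packed: 4-bit weights, two per byte; scale/zero_pt: one entry per group of
// dequant_s consecutive weights. Names are illustrative, not XeTLA APIs.
std::vector<float> dequant_int4(const std::vector<uint8_t>& packed,
                                const std::vector<float>& scale,
                                const std::vector<uint8_t>& zero_pt, // packed 4-bit, ASYM only
                                uint32_t dequant_s,
                                bool asym) {
  std::vector<float> out(packed.size() * 2);
  for (size_t i = 0; i < out.size(); ++i) {
    // Unpack one 4-bit weight: low nibble for even i, high nibble for odd i.
    uint8_t nib = (packed[i / 2] >> ((i % 2) * 4)) & 0xF;
    size_t g = i / dequant_s; // group index
    if (asym) {
      // S4_ASYM: subtract the group's (also 4-bit packed) zero point.
      uint8_t zp = (zero_pt[g / 2] >> ((g % 2) * 4)) & 0xF;
      out[i] = (static_cast<int>(nib) - static_cast<int>(zp)) * scale[g];
    } else {
      // S4_FULLRANGE_NO_ZP: interpret the nibble as a signed value in [-8, 7].
      int s = static_cast<int>(nib) - ((nib & 0x8) ? 16 : 0);
      out[i] = s * scale[g];
    }
  }
  return out;
}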


65 files changed: 2008 additions, 925 deletions


CMakeLists.txt

Lines changed: 2 additions & 2 deletions

@@ -46,7 +46,7 @@ if (${LOG} STREQUAL "on")
 endif ()

 # For large registers mode, enable 256 registers for kernels
-set(XETLA_OFFLINE_OPTIONS "-doubleGRF")
+# set(XETLA_OFFLINE_OPTIONS "-doubleGRF")
 set(XETLA_OFFLINE_OPTIONS "${XETLA_OFFLINE_OPTIONS} -vc-disable-indvars-opt")
 set(XETLA_OFFLINE_OPTIONS "${XETLA_OFFLINE_OPTIONS} -vc-codegen")
 # Enable bank conflict reduction.
@@ -75,7 +75,7 @@ add_link_options(-fsycl -fsycl-device-code-split=per_kernel -fsycl-max-parallel-
 add_link_options(${XETLA_KERNEL_FLAGS})

 if(UNIX)
-  add_compile_options(-fp-model=precise)
+  add_compile_options(-fp-model=precise -ftemplate-backtrace-limit=0)
   add_link_options(-lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm)
   link_libraries(-lgtest -lgtest_main)
 else() # Windows

examples/05_batch_gemm/batch_gemm.hpp

Lines changed: 14 additions & 14 deletions

@@ -276,20 +276,20 @@ class batch_gemm_t {
           args.matB_base.base, args.matB_ld);
     }
   }
-  if (epilogue_t::msg_type_c != msg_type::unaligned_2d) {
-    if (epilogue_t::msg_type_c == msg_type::block_2d) {
-      implementable &=
-          kernel::block_2d<gpu_arch::XeHpc, dtype_c>::check_tensor(
-              (uint64_t)(args.matC_base.base),
-              args.matrix_n,
-              args.matrix_m * args.batch_size,
-              args.matC_ld);
-    } else {
-      implementable &=
-          kernel::general_1d<gpu_arch::XeHpc, dtype_c>::check_alignment(
-              args.matC_base.base, args.matC_ld);
-    }
-  }
+  // if (epilogue_t::msg_type_c != msg_type::unaligned_2d) {
+  //   if (epilogue_t::msg_type_c == msg_type::block_2d) {
+  //     implementable &=
+  //         kernel::block_2d<gpu_arch::XeHpc, dtype_c>::check_tensor(
+  //             (uint64_t)(args.matC_base.base),
+  //             args.matrix_n,
+  //             args.matrix_m * args.batch_size,
+  //             args.matC_ld);
+  //   } else {
+  //     implementable &=
+  //         kernel::general_1d<gpu_arch::XeHpc, dtype_c>::check_alignment(
+  //             args.matC_base.base, args.matC_ld);
+  //   }
+  // }

   return implementable;
 }

examples/07_multi_layer_perceptron/multi_layer_perceptron.hpp

Lines changed: 28 additions & 28 deletions

@@ -409,20 +409,20 @@ class multi_layer_perceptron_t {
           args.matW_base.base, args.matW_ld);
     }
   }
-  if (epilogue_layer1_t::msg_type_c != msg_type::unaligned_2d) {
-    if (epilogue_layer1_t::msg_type_c == msg_type::block_2d) {
-      implementable &=
-          kernel::block_2d<gpu_arch::XeHpc, dtype_b>::check_tensor(
-              (uint64_t)(args.matB_base.base),
-              args.matrix_n_layer1,
-              args.matrix_m_layer1,
-              args.matB_ld);
-    } else {
-      implementable &=
-          kernel::general_1d<gpu_arch::XeHpc, dtype_b>::check_alignment(
-              args.matB_base.base, args.matB_ld);
-    }
-  }
+  // if (epilogue_layer1_t::msg_type_c != msg_type::unaligned_2d) {
+  //   if (epilogue_layer1_t::msg_type_c == msg_type::block_2d) {
+  //     implementable &=
+  //         kernel::block_2d<gpu_arch::XeHpc, dtype_b>::check_tensor(
+  //             (uint64_t)(args.matB_base.base),
+  //             args.matrix_n_layer1,
+  //             args.matrix_m_layer1,
+  //             args.matB_ld);
+  //   } else {
+  //     implementable &=
+  //         kernel::general_1d<gpu_arch::XeHpc, dtype_b>::check_alignment(
+  //             args.matB_base.base, args.matB_ld);
+  //   }
+  // }
   if (gemm_layer2_t::msg_type_a != msg_type::unaligned_2d) {
     if (gemm_layer2_t::msg_type_a == msg_type::block_2d) {
       implementable &=
@@ -451,20 +451,20 @@ class multi_layer_perceptron_t {
           args.matV_base.base, args.matV_ld);
     }
   }
-  if (epilogue_layer2_t::msg_type_c != msg_type::unaligned_2d) {
-    if (epilogue_layer2_t::msg_type_c == msg_type::block_2d) {
-      implementable &=
-          kernel::block_2d<gpu_arch::XeHpc, dtype_c>::check_tensor(
-              (uint64_t)(args.matC_base.base),
-              args.matrix_n_layer2,
-              args.matrix_m_layer2,
-              args.matC_ld);
-    } else {
-      implementable &=
-          kernel::general_1d<gpu_arch::XeHpc, dtype_c>::check_alignment(
-              args.matC_base.base, args.matC_ld);
-    }
-  }
+  // if (epilogue_layer2_t::msg_type_c != msg_type::unaligned_2d) {
+  //   if (epilogue_layer2_t::msg_type_c == msg_type::block_2d) {
+  //     implementable &=
+  //         kernel::block_2d<gpu_arch::XeHpc, dtype_c>::check_tensor(
+  //             (uint64_t)(args.matC_base.base),
+  //             args.matrix_n_layer2,
+  //             args.matrix_m_layer2,
+  //             args.matC_ld);
+  //   } else {
+  //     implementable &=
+  //         kernel::general_1d<gpu_arch::XeHpc, dtype_c>::check_alignment(
+  //             args.matC_base.base, args.matC_ld);
+  //   }
+  // }

   return implementable;
 }

examples/08_scaled_dot_product_attention/softmax.hpp

Lines changed: 7 additions & 4 deletions

@@ -60,18 +60,21 @@ struct xetla_softmax_fwd_t {
   using softmax_tile_desc_t = subgroup::
       tile_desc_t<SIMD, block_height, SIMD, block_height, reg_layout::tiled>;
   using softmax_load_t = subgroup::tile_t<dtype_in, softmax_tile_desc_t>;
+  using mem_desc_in_t = mem_desc_t<dtype_in, mem_layout::row_major, mem_space_in>;
   using softmax_load_payload_t = subgroup::mem_payload_t<
-      mem_desc_t<dtype_in, mem_layout::row_major, mem_space_in>,
+      mem_desc_in_t,
       softmax_tile_desc_t,
-      subgroup::msg_type_v<softmax_tile_desc_t, mem_space_in>,
+      subgroup::msg_type_v<softmax_tile_desc_t, mem_desc_in_t>,
       arch_tag>;

   // this tile will store the softmax result to global memory
   using softmax_store_t = subgroup::tile_t<dtype_out, softmax_tile_desc_t>;
+  using mem_desc_out_t =
+      mem_desc_t<dtype_out, mem_layout::row_major, mem_space_out>;
   using softmax_store_payload_t = subgroup::mem_payload_t<
-      mem_desc_t<dtype_out, mem_layout::row_major, mem_space_out>,
+      mem_desc_out_t,
       softmax_tile_desc_t,
-      subgroup::msg_type_v<softmax_tile_desc_t, mem_space_out>,
+      subgroup::msg_type_v<softmax_tile_desc_t, mem_desc_out_t>,
       arch_tag>;

   struct arguments_t {

examples/09_gate_recurrent_unit/kernel_func.hpp

Lines changed: 1 addition & 1 deletion

@@ -156,7 +156,7 @@ struct gru_layer {
   using mat_hidden_payload_t = mem_payload_t<
       mem_desc_a_t,
       matC_tile_desc_t,
-      msg_type_v<matC_tile_desc_t, mem_loc_input>,
+      msg_type_v<matC_tile_desc_t, mem_desc_a_t>,
       gpu_arch::XeHpc>;
   using matC_payload_t = mem_payload_t<
       mem_desc_c_t,

include/common/core/arch_config.hpp

Lines changed: 4 additions & 4 deletions

@@ -93,15 +93,15 @@ inline constexpr bool arch_has_2d_load_store =

 template <gpu_arch arch_tag>
 struct load_store_attr_t<msg_type::block_1d, arch_tag> {
-  static constexpr uint32_t max_load_vec_len = 32;
-  static constexpr uint32_t max_store_vec_len = 32;
+  static constexpr uint32_t max_load_vec_len = 256;
+  static constexpr uint32_t max_store_vec_len = 256;
   static constexpr uint32_t max_prefetch_vec_len = 32;
 };

 template <>
 struct load_store_attr_t<msg_type::block_1d, gpu_arch::XeHpc> {
-  static constexpr uint32_t max_load_vec_len = 64;
-  static constexpr uint32_t max_store_vec_len = 64;
+  static constexpr uint32_t max_load_vec_len = 512;
+  static constexpr uint32_t max_store_vec_len = 512;
   static constexpr uint32_t max_prefetch_vec_len = 64;
 };

include/common/core/base_consts.hpp

Lines changed: 2 additions & 3 deletions

@@ -23,9 +23,8 @@

 namespace gpu::xetla {

-/// @addtogroup xetla_core_base_types
+/// @addtogroup xetla_core_base_consts
 /// @{
-
-/// @} xetla_core_base_types
+/// @} xetla_core_base_consts

 } // namespace gpu::xetla

include/common/core/base_types.hpp

Lines changed: 40 additions & 0 deletions

@@ -55,6 +55,32 @@ using fp16 = sycl::half;
 ///
 using tf32 = sycl::ext::intel::experimental::esimd::tfloat32;

+/// @brief xetla 4bits data packed as 8bits data type.
+/// 2 4bit data pack to one byte
+struct int4x2 {
+  uint8_t data;
+
+  operator uint8_t() const {
+    return data;
+  }
+  int4x2(uint8_t val) {
+    data = val;
+  }
+};
+
+/// @brief xetla 4bits data packed as 32bits data type.
+/// 8 4bit data pack to 4 bytes
+struct int4x8 {
+  uint32_t data;
+
+  operator uint32_t() const {
+    return data;
+  }
+  int4x8(uint32_t val) {
+    data = val;
+  }
+};
+
 /// @brief mx_fp4(E2M1) data packed as 8bits data type.
 struct mx_fp4 {
   uint8_t data;
@@ -89,6 +115,8 @@ template <typename T>
 struct is_internal_type {
   static constexpr bool value = std::is_same<remove_const_t<T>, bf16>::value ||
       std::is_same<remove_const_t<T>, tf32>::value ||
+      std::is_same<remove_const_t<T>, int4x2>::value ||
+      std::is_same<remove_const_t<T>, int4x8>::value ||
       std::is_same<remove_const_t<T>, mx_fp4>::value;
 };
 template <typename T>
@@ -137,6 +165,18 @@ struct native_type<mx_fp4> {
   using type = uint8_t;
 };

+/// @brief Set uint8_t as the native data type of int4x2.
+template <>
+struct native_type<int4x2> {
+  using type = uint8_t;
+};
+
+/// @brief Set uint32_t as the native data type of int4x8.
+template <>
+struct native_type<int4x8> {
+  using type = uint32_t;
+};
+
 /// @brief Return the native data type of T
 template <typename T>
 using native_type_t = typename native_type<T>::type;

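For reference, a small standalone sketch of how 4-bit values could be packed into and unpacked from the int4x2 / int4x8 carriers added above. The nibble ordering (element 0 in the low nibble) is an assumption for illustration and is not taken from the XeTLA kernels.

#include <cstdint>

// Pack two 4-bit values into one byte (low nibble = first value), mirroring
// the int4x2 layout described in the diff; ordering is an assumption.
inline uint8_t pack_int4x2(uint8_t lo, uint8_t hi) {
  return static_cast<uint8_t>((lo & 0xF) | ((hi & 0xF) << 4));
}

// Pack eight 4-bit values into one 32-bit word, element 0 in the lowest nibble,
// matching the "8 4bit data pack to 4 bytes" description of int4x8.
inline uint32_t pack_int4x8(const uint8_t (&v)[8]) {
  uint32_t out = 0;
  for (int i = 0; i < 8; ++i)
    out |= static_cast<uint32_t>(v[i] & 0xF) << (4 * i);
  return out;
}

// Unpack the i-th 4-bit element from a 32-bit int4x8 payload.
inline uint8_t unpack_int4x8(uint32_t packed, int i) {
  return static_cast<uint8_t>((packed >> (4 * i)) & 0xF);
}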
include/common/core/common_types.hpp

Lines changed: 9 additions & 0 deletions

@@ -26,4 +26,13 @@ enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2 };
 enum class grf_mode : uint8_t { normal = 0, double_grf = 1 };

 enum class mem_layout : uint8_t { row_major = 0, col_major = 1 };
+
+enum class quant_mode : uint8_t { S4_ASYM = 0, S4_FULLRANGE_NO_ZP = 1 };
+
+struct quant_info {
+  quant_mode quant_mode;
+  uint32_t dequant_s;
+  mem_layout weight_mem_layout;
+};
+
 } // namespace gpu::xetla

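As an illustration of the configuration types added above, here is a hedged example of how a caller might describe the int4 weight layouts this commit targets (asymmetric quantization with group size 32 and a column-major mat B, or symmetric with group size 128). The variable names are illustrative only; the quant_mode, quant_info, and mem_layout definitions from the diff are assumed to be in scope.

#include <cstdint>
// Assumes the gpu::xetla declarations added in common_types.hpp are visible.
using namespace gpu::xetla;

// Asymmetric int4 weights (with zero points), dequant group size 32, col-major B.
constexpr quant_info q_info_asym{
    quant_mode::S4_ASYM, /*dequant_s=*/32, mem_layout::col_major};

// Symmetric int4 weights (no zero point), dequant group size 128, row-major B.
constexpr quant_info q_info_sym{
    quant_mode::S4_FULLRANGE_NO_ZP, /*dequant_s=*/128, mem_layout::row_major};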