Skip to content
This repository was archived by the owner on Aug 30, 2024. It is now read-only.

Commit b3d8f65

Browse files
authored
Support int4 linear and SDP on MTL iGPU (#222)
* copy include folder; * rename arch; * fix compile errors; * add fmha test
1 parent de1cd8f commit b3d8f65

File tree

117 files changed

+2853
-817
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

117 files changed

+2853
-817
lines changed

.editorconfig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,4 @@ trim_trailing_whitespace = true
1212
# C/C++ follows clang-format
1313
[*.{c,cpp,h,hpp}]
1414
indent_style = space
15-
indent_size = 4
15+
indent_size = 2

examples/03_gemm_relu_bias/gemm_relu_bias.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ void gemm_relu_bias_run(uint32_t iter) {
148148
xetla::mem_desc_t<float, mem_layout::row_major, mem_space::global>;
149149

150150
using bias_op_t =
151-
xetla::subgroup::bias_add_op_t<mem_desc_bias_t, gpu_arch::Xe>;
151+
xetla::subgroup::bias_add_op_t<mem_desc_bias_t, gpu_arch::XeHpc>;
152152
using tile_op_t = xetla::subgroup::chained_tile_op_t<
153153
xetla::subgroup::relu_op_t, // apply elementwise ReLU
154154
bias_op_t // apply elementwise BiasAdd
@@ -160,7 +160,7 @@ void gemm_relu_bias_run(uint32_t iter) {
160160
// Mathematically epilogue_t is a map that applies to each element:
161161
// epilogue_t: [m, n] -> [m, n], C_acc |-> tile_op_t(C_acc)
162162
using epilogue_policy =
163-
xetla::group::epilogue_policy_tile_op<tile_op_t, gpu_arch::Xe>;
163+
xetla::group::epilogue_policy_tile_op<tile_op_t, gpu_arch::XeHpc>;
164164

165165
// Micro-kernel configuration
166166
using tune_option = dict_t<
@@ -180,7 +180,7 @@ void gemm_relu_bias_run(uint32_t iter) {
180180
mem_layout::row_major, // memory layout for C
181181
8, // leading dimension alignment for C, in unit of element
182182
data_type_acc, // accumulator data type for intermediate results
183-
gpu_arch::Xe, // GPU arch
183+
gpu_arch::XeHpc, // GPU arch
184184
tune_option>;
185185
using gemm_op_t = typename default_config_t::type;
186186

examples/04_gemm_polynomial/gemm_polynomial.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ void gemm_polynomial_run(int iter) {
153153
// Mathematically epilogue_t is a map that applies to each element:
154154
// epilogue_t: [m, n] -> [m, n], C_acc |-> tile_op_t(C_acc)
155155
using epilogue_policy =
156-
xetla::group::epilogue_policy_tile_op<tile_op_t, gpu_arch::Xe>;
156+
xetla::group::epilogue_policy_tile_op<tile_op_t, gpu_arch::XeHpc>;
157157

158158
// Micro-kernel configuration
159159
using tune_option = dict_t<
@@ -174,7 +174,7 @@ void gemm_polynomial_run(int iter) {
174174
mem_layout::row_major, // memory layout for C
175175
8, // leading dimension alignment for C, in unit of element
176176
data_type_acc, // accumulator data type for intermediate results
177-
gpu_arch::Xe, // GPU arch
177+
gpu_arch::XeHpc, // GPU arch
178178
tune_option>;
179179

180180
using gemm_op_t = typename default_config_t::type;

examples/05_batch_gemm/batch_gemm.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ void batch_gemm_run(uint32_t iter) {
118118
data_type_acc, // accumulator data type for intermediate results
119119
wg_shape, // computation tile shape
120120
wg_tile_k, // elements in each iteration
121-
gpu_arch::Xe, // GPU arch
121+
gpu_arch::XeHpc, // GPU arch
122122
tune_option>;
123123

124124
using epilogue_t = xetla::group::default_epilogue_selector_t<
@@ -128,11 +128,11 @@ void batch_gemm_run(uint32_t iter) {
128128
mem_space::global, // memory writing to global mem for C
129129
wg_shape, // computation tile shape
130130
wg_tile_k, // elements in each iteration
131-
gpu_arch::Xe, // GPU arch
131+
gpu_arch::XeHpc, // GPU arch
132132
tune_option>;
133133

134134
using batch_gemm_op_t =
135-
xetla::kernel::batch_gemm_t<gemm_t, epilogue_t, gpu_arch::Xe>;
135+
xetla::kernel::batch_gemm_t<gemm_t, epilogue_t, gpu_arch::XeHpc>;
136136

137137
// set up gemm_universal arguments
138138
typename batch_gemm_op_t::arguments_t gemm_arg(

examples/05_batch_gemm/batch_gemm.hpp

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -250,40 +250,43 @@ class batch_gemm_t {
250250
bool implementable = true;
251251
if (gemm_t::msg_type_a != msg_type::unaligned_2d) {
252252
if (gemm_t::msg_type_a == msg_type::block_2d) {
253-
implementable &= kernel::block_2d<gpu_arch::Xe, dtype_a>::check_tensor(
254-
(uint64_t)(args.matA_base.base),
255-
args.matrix_k,
256-
args.matrix_m * args.batch_size,
257-
args.matA_ld);
253+
implementable &=
254+
kernel::block_2d<gpu_arch::XeHpc, dtype_a>::check_tensor(
255+
(uint64_t)(args.matA_base.base),
256+
args.matrix_k,
257+
args.matrix_m * args.batch_size,
258+
args.matA_ld);
258259
} else {
259260
implementable &=
260-
kernel::general_1d<gpu_arch::Xe, dtype_a>::check_alignment(
261+
kernel::general_1d<gpu_arch::XeHpc, dtype_a>::check_alignment(
261262
args.matA_base.base, args.matA_ld);
262263
}
263264
}
264265
if (gemm_t::msg_type_b != msg_type::unaligned_2d) {
265266
if (gemm_t::msg_type_b == msg_type::block_2d) {
266-
implementable &= kernel::block_2d<gpu_arch::Xe, dtype_b>::check_tensor(
267-
(uint64_t)(args.matB_base.base),
268-
args.matrix_n,
269-
args.matrix_k * args.batch_size,
270-
args.matB_ld);
267+
implementable &=
268+
kernel::block_2d<gpu_arch::XeHpc, dtype_b>::check_tensor(
269+
(uint64_t)(args.matB_base.base),
270+
args.matrix_n,
271+
args.matrix_k * args.batch_size,
272+
args.matB_ld);
271273
} else {
272274
implementable &=
273-
kernel::general_1d<gpu_arch::Xe, dtype_b>::check_alignment(
275+
kernel::general_1d<gpu_arch::XeHpc, dtype_b>::check_alignment(
274276
args.matB_base.base, args.matB_ld);
275277
}
276278
}
277279
if (epilogue_t::msg_type_c != msg_type::unaligned_2d) {
278280
if (epilogue_t::msg_type_c == msg_type::block_2d) {
279-
implementable &= kernel::block_2d<gpu_arch::Xe, dtype_c>::check_tensor(
280-
(uint64_t)(args.matC_base.base),
281-
args.matrix_n,
282-
args.matrix_m * args.batch_size,
283-
args.matC_ld);
281+
implementable &=
282+
kernel::block_2d<gpu_arch::XeHpc, dtype_c>::check_tensor(
283+
(uint64_t)(args.matC_base.base),
284+
args.matrix_n,
285+
args.matrix_m * args.batch_size,
286+
args.matC_ld);
284287
} else {
285288
implementable &=
286-
kernel::general_1d<gpu_arch::Xe, dtype_c>::check_alignment(
289+
kernel::general_1d<gpu_arch::XeHpc, dtype_c>::check_alignment(
287290
args.matC_base.base, args.matC_ld);
288291
}
289292
}

examples/06_gemm_softmax/gemm_softmax.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ void gemm_softmax_run(uint32_t iter) {
227227
data_type_sfx, // accumulator data type for intermediate results
228228
wg_shape, // computation tile shape
229229
k_iter_num, // elements in each iteration
230-
gpu_arch::Xe, // GPU arch
230+
gpu_arch::XeHpc, // GPU arch
231231
tune_option>;
232232

233233
using gemm_args_t = gemm_op_t::arguments_t;
@@ -239,14 +239,14 @@ void gemm_softmax_run(uint32_t iter) {
239239
mem_space::global, // memory writing to global mem for C
240240
wg_shape, // computation tile shape
241241
k_iter_num, // elements in each iteration
242-
gpu_arch::Xe, // GPU arch
242+
gpu_arch::XeHpc, // GPU arch
243243
tune_option>;
244244

245245
// using experimental::group::softmax
246246
// define softmax forward op
247247
using tile_shape = typename gemm_op_t::tile_shape;
248248
using softmax_fwd_t = softmax_t<
249-
softmax_policy_fwd<data_type_sfx, gpu_arch::Xe>,
249+
softmax_policy_fwd<data_type_sfx, gpu_arch::XeHpc>,
250250
tile_shape>;
251251
using softmax_fwd_args_t = typename softmax_fwd_t::arguments_t;
252252

examples/07_multi_layer_perceptron/multi_layer_perceptron.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ void mlp_run(uint32_t iter) {
192192
// Micro-kernel configuration
193193
using epilogue_policy_layer1 = xetla::group::epilogue_policy_tile_op<
194194
xetla::subgroup::chained_tile_op_t<gpu::xetla::subgroup::relu_op_t>,
195-
gpu_arch::Xe>;
195+
gpu_arch::XeHpc>;
196196
using layer1_tune_option = dict_t<
197197
elem_v_t<
198198
tune_key::param_optimizer_type,
@@ -213,7 +213,7 @@ void mlp_run(uint32_t iter) {
213213
data_type_acc, // accumulator data type for intermediate results
214214
wg_shape_layer1, // computation tile shape
215215
wg_tile_k, // elements in each iteration
216-
gpu_arch::Xe, // GPU arch
216+
gpu_arch::XeHpc, // GPU arch
217217
layer1_tune_option>;
218218

219219
using epilogue_layer1_t = xetla::group::default_epilogue_selector_t<
@@ -223,7 +223,7 @@ void mlp_run(uint32_t iter) {
223223
mem_space::global, // memory writing to global mem for B
224224
wg_shape_layer1, // computation tile shape
225225
wg_tile_k, // elements in each iteration
226-
gpu_arch::Xe, // GPU arch
226+
gpu_arch::XeHpc, // GPU arch
227227
layer1_tune_option>;
228228

229229
using wg_shape_layer2 = shape<wg_tile_n_layer2, wg_tile_m_layer2>;
@@ -249,7 +249,7 @@ void mlp_run(uint32_t iter) {
249249
data_type_acc, // accumulator data type for intermediate results
250250
wg_shape_layer2, // computation tile shape
251251
wg_tile_k, // elements in each iteration
252-
gpu_arch::Xe, // GPU arch
252+
gpu_arch::XeHpc, // GPU arch
253253
layer2_tune_option>;
254254

255255
using epilogue_layer2_t = xetla::group::default_epilogue_selector_t<
@@ -259,15 +259,15 @@ void mlp_run(uint32_t iter) {
259259
mem_space::global, // memory writing to global mem for C
260260
wg_shape_layer2, // computation tile shape
261261
wg_tile_k, // elements in each iteration
262-
gpu_arch::Xe, // GPU arch
262+
gpu_arch::XeHpc, // GPU arch
263263
layer2_tune_option>;
264264

265265
using mlp_op_t = xetla::kernel::multi_layer_perceptron_t<
266266
gemm_layer1_t,
267267
epilogue_layer1_t,
268268
gemm_layer2_t,
269269
epilogue_layer2_t,
270-
gpu_arch::Xe>;
270+
gpu_arch::XeHpc>;
271271

272272
// set up mlp arguments
273273
// for relu we don't need to set arguments

examples/07_multi_layer_perceptron/multi_layer_perceptron.hpp

Lines changed: 43 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -383,79 +383,85 @@ class multi_layer_perceptron_t {
383383
bool implementable = true;
384384
if (gemm_layer1_t::msg_type_a != msg_type::unaligned_2d) {
385385
if (gemm_layer1_t::msg_type_a == msg_type::block_2d) {
386-
implementable &= kernel::block_2d<gpu_arch::Xe, dtype_a>::check_tensor(
387-
(uint64_t)(args.matA_base.base),
388-
args.matrix_k_layer1,
389-
args.matrix_m_layer1,
390-
args.matA_ld);
386+
implementable &=
387+
kernel::block_2d<gpu_arch::XeHpc, dtype_a>::check_tensor(
388+
(uint64_t)(args.matA_base.base),
389+
args.matrix_k_layer1,
390+
args.matrix_m_layer1,
391+
args.matA_ld);
391392
} else {
392393
implementable &=
393-
kernel::general_1d<gpu_arch::Xe, dtype_a>::check_alignment(
394+
kernel::general_1d<gpu_arch::XeHpc, dtype_a>::check_alignment(
394395
args.matA_base.base, args.matA_ld);
395396
}
396397
}
397398
if (gemm_layer1_t::msg_type_b != msg_type::unaligned_2d) {
398399
if (gemm_layer1_t::msg_type_b == msg_type::block_2d) {
399-
implementable &= kernel::block_2d<gpu_arch::Xe, dtype_w>::check_tensor(
400-
(uint64_t)(args.matW_base.base),
401-
args.matrix_n_layer1,
402-
args.matrix_k_layer1,
403-
args.matW_ld);
400+
implementable &=
401+
kernel::block_2d<gpu_arch::XeHpc, dtype_w>::check_tensor(
402+
(uint64_t)(args.matW_base.base),
403+
args.matrix_n_layer1,
404+
args.matrix_k_layer1,
405+
args.matW_ld);
404406
} else {
405407
implementable &=
406-
kernel::general_1d<gpu_arch::Xe, dtype_w>::check_alignment(
408+
kernel::general_1d<gpu_arch::XeHpc, dtype_w>::check_alignment(
407409
args.matW_base.base, args.matW_ld);
408410
}
409411
}
410412
if (epilogue_layer1_t::msg_type_c != msg_type::unaligned_2d) {
411413
if (epilogue_layer1_t::msg_type_c == msg_type::block_2d) {
412-
implementable &= kernel::block_2d<gpu_arch::Xe, dtype_b>::check_tensor(
413-
(uint64_t)(args.matB_base.base),
414-
args.matrix_n_layer1,
415-
args.matrix_m_layer1,
416-
args.matB_ld);
414+
implementable &=
415+
kernel::block_2d<gpu_arch::XeHpc, dtype_b>::check_tensor(
416+
(uint64_t)(args.matB_base.base),
417+
args.matrix_n_layer1,
418+
args.matrix_m_layer1,
419+
args.matB_ld);
417420
} else {
418421
implementable &=
419-
kernel::general_1d<gpu_arch::Xe, dtype_b>::check_alignment(
422+
kernel::general_1d<gpu_arch::XeHpc, dtype_b>::check_alignment(
420423
args.matB_base.base, args.matB_ld);
421424
}
422425
}
423426
if (gemm_layer2_t::msg_type_a != msg_type::unaligned_2d) {
424427
if (gemm_layer2_t::msg_type_a == msg_type::block_2d) {
425-
implementable &= kernel::block_2d<gpu_arch::Xe, dtype_b>::check_tensor(
426-
(uint64_t)(args.matB_base.base),
427-
args.matrix_k_layer2,
428-
args.matrix_m_layer2,
429-
args.matB_ld);
428+
implementable &=
429+
kernel::block_2d<gpu_arch::XeHpc, dtype_b>::check_tensor(
430+
(uint64_t)(args.matB_base.base),
431+
args.matrix_k_layer2,
432+
args.matrix_m_layer2,
433+
args.matB_ld);
430434
} else {
431435
implementable &=
432-
kernel::general_1d<gpu_arch::Xe, dtype_a>::check_alignment(
436+
kernel::general_1d<gpu_arch::XeHpc, dtype_a>::check_alignment(
433437
args.matB_base.base, args.matB_ld);
434438
}
435439
}
436440
if (gemm_layer2_t::msg_type_b != msg_type::unaligned_2d) {
437441
if (gemm_layer2_t::msg_type_b == msg_type::block_2d) {
438-
implementable &= kernel::block_2d<gpu_arch::Xe, dtype_v>::check_tensor(
439-
(uint64_t)(args.matV_base.base),
440-
args.matrix_n_layer2,
441-
args.matrix_k_layer2,
442-
args.matV_ld);
442+
implementable &=
443+
kernel::block_2d<gpu_arch::XeHpc, dtype_v>::check_tensor(
444+
(uint64_t)(args.matV_base.base),
445+
args.matrix_n_layer2,
446+
args.matrix_k_layer2,
447+
args.matV_ld);
443448
} else {
444449
implementable &=
445-
kernel::general_1d<gpu_arch::Xe, dtype_v>::check_alignment(
450+
kernel::general_1d<gpu_arch::XeHpc, dtype_v>::check_alignment(
446451
args.matV_base.base, args.matV_ld);
447452
}
448453
}
449454
if (epilogue_layer2_t::msg_type_c != msg_type::unaligned_2d) {
450455
if (epilogue_layer2_t::msg_type_c == msg_type::block_2d) {
451-
implementable &= kernel::block_2d<gpu_arch::Xe, dtype_c>::check_tensor(
452-
(uint64_t)(args.matC_base.base),
453-
args.matrix_n_layer2,
454-
args.matrix_m_layer2,
455-
args.matC_ld);
456+
implementable &=
457+
kernel::block_2d<gpu_arch::XeHpc, dtype_c>::check_tensor(
458+
(uint64_t)(args.matC_base.base),
459+
args.matrix_n_layer2,
460+
args.matrix_m_layer2,
461+
args.matC_ld);
456462
} else {
457463
implementable &=
458-
kernel::general_1d<gpu_arch::Xe, dtype_c>::check_alignment(
464+
kernel::general_1d<gpu_arch::XeHpc, dtype_c>::check_alignment(
459465
args.matC_base.base, args.matC_ld);
460466
}
461467
}
@@ -557,7 +563,7 @@ class multi_layer_perceptron_t {
557563
xetla_nbarrier_t<
558564
work_group_layer2_t::size,
559565
work_group_layer2_t::size,
560-
gpu_arch::Xe>
566+
gpu_arch::XeHpc>
561567
nbarrier_global;
562568
nbarrier_global.init_nbarrier(
563569
global_nbarr_base, nbarrier_role::producer_consumer);

examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ void sdp_fwd_run(uint32_t iter, uint32_t warmup = 10) {
170170

171171
constexpr double slm_ratio_to_pvc =
172172
static_cast<double>(arch_attr_t<arch_tag>::local_mem_size) /
173-
arch_attr_t<gpu_arch::Xe>::local_mem_size;
173+
arch_attr_t<gpu_arch::XeHpc>::local_mem_size;
174174

175175
constexpr uint32_t wg_tile_m_qksv = 64 * slm_ratio_to_pvc;
176176

examples/09_gate_recurrent_unit/kernel_func.hpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -113,9 +113,11 @@ struct gru_layer {
113113
using perf_tuning_knob =
114114
perf_tuning_knob_t<sg_tile_k, prefetch_distance, periodic_sync_interval>;
115115

116-
using compute_attr = group::compute_attr_t<T, T, Act_T>;
117-
using compute_policy =
118-
compute_policy_default_xmx<compute_attr, perf_tuning_knob, gpu_arch::Xe>;
116+
using compute_attr = xetla::group::compute_attr_t<T, T, Act_T>;
117+
using compute_policy = compute_policy_default_xmx<
118+
compute_attr,
119+
perf_tuning_knob,
120+
gpu_arch::XeHpc>;
119121
using mem_desc_a_t = mem_desc_t<T, layout_input, mem_loc_input>;
120122
using mem_desc_b_t = mem_desc_t<T, layout_weight, mem_loc_weight>;
121123
// Org the compute shape for sub-matrix
@@ -138,7 +140,7 @@ struct gru_layer {
138140
// define arguments for each epilogue_tile_op in chained_tile_op_t<>
139141

140142
using epilogue_t = epilogue_t<
141-
epilogue_policy_default<gpu_arch::Xe>,
143+
epilogue_policy_default<gpu_arch::XeHpc>,
142144
tile_shape,
143145
mem_desc_c_t>;
144146
using epilogue_args_t = typename epilogue_t::arguments_t;
@@ -155,12 +157,12 @@ struct gru_layer {
155157
mem_desc_a_t,
156158
matC_tile_desc_t,
157159
msg_type_v<matC_tile_desc_t, mem_loc_input>,
158-
gpu_arch::Xe>;
160+
gpu_arch::XeHpc>;
159161
using matC_payload_t = mem_payload_t<
160162
mem_desc_c_t,
161163
matC_tile_desc_t,
162164
msg_type::block_2d,
163-
gpu_arch::Xe>;
165+
gpu_arch::XeHpc>;
164166
using sigmoid_t = typename subgroup::sigmoid_op_t;
165167
using tanh_t = typename subgroup::tanh_op_t;
166168
static void inline call(sycl::nd_item<3>& item, fused_config_t<T>* args) {

0 commit comments

Comments
 (0)