Skip to content

Commit 836d35c

Browse files
committed
cleanup & optimize intermediate memory
1 parent 39601a4 commit 836d35c

File tree

4 files changed

+33
-47
lines changed

4 files changed

+33
-47
lines changed

src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_opt.cpp

Lines changed: 13 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
#include "moe_opt.hpp"
66

7-
// #define ENABLE_ONEDNN_FOR_GPU
87
#ifdef ENABLE_ONEDNN_FOR_GPU
98
# include <initializer_list>
109
# include <oneapi/dnnl/dnnl.hpp>
@@ -302,11 +301,9 @@ struct onednn_linear {
302301
if (scale) {
303302
// https://uxlfoundation.github.io/oneDNN/page_weights_decompression_matmul_cpp.html
304303
// Quantization Group size for scales. Must be divisible by 32.
305-
auto wei_scale_md = dnnl::memory::desc(dnnl::memory::dims({mm->m_K_groups, mm->m_N}), dnnl::memory::data_type::f16, dnnl::memory::format_tag::ab);
306-
linear.scale = scale; // dnnl::ocl_interop::make_memory(wei_scale_md, linear.m_engine, dnnl::ocl_interop::memory_kind::usm, scale);
304+
linear.scale = scale;
307305
if (zp) {
308-
auto wei_zp_md = dnnl::memory::desc(dnnl::memory::dims({mm->m_K_groups, mm->m_N}), mm->m_w_type, dnnl::memory::format_tag::ab);
309-
linear.zp = zp; // dnnl::ocl_interop::make_memory(wei_zp_md, linear.m_engine, dnnl::ocl_interop::memory_kind::usm, zp);
306+
linear.zp = zp;
310307
}
311308
}
312309
return linear;
@@ -315,18 +312,8 @@ struct onednn_linear {
315312
void forward(dnnl::stream& stream, int m, dnnl::memory src_mem, dnnl::memory dst_mem, dnnl::memory bin_mem) {
316313
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("onednn_linear::forward()"));
317314
dnnl::memory::dim M = m;
318-
319315
OPENVINO_ASSERT(m_batch == 0 || m_batch == M, "m_batch=", m_batch, " M=", M);
320316

321-
dnnl::memory::desc rt_src_md = dnnl::memory::desc(dnnl::memory::dims({M, m_K}), m_a_type, dnnl::memory::format_tag::ab);
322-
dnnl::memory::desc rt_dst_md = dnnl::memory::desc(dnnl::memory::dims({M, m_N}), m_a_type, dnnl::memory::format_tag::ab);
323-
dnnl::memory::desc rt_bin_md;
324-
if (mm->bin_per_row) {
325-
rt_bin_md = dnnl::memory::desc(dnnl::memory::dims({M, 1}), m_a_type, dnnl::memory::format_tag::ab);
326-
} else {
327-
rt_bin_md = dnnl::memory::desc(dnnl::memory::dims({M, m_N}), m_a_type, dnnl::memory::format_tag::ab);
328-
}
329-
330317
std::unordered_map<int, dnnl::memory> args;
331318
args.insert({DNNL_ARG_SRC, src_mem});
332319
args.insert({DNNL_ARG_WEIGHTS, weight});
@@ -340,7 +327,6 @@ struct onednn_linear {
340327
args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS, zp});
341328
}
342329
if (bin_mem) {
343-
// auto bin_mem = dnnl::ocl_interop::make_memory(rt_bin_md, m_engine, dnnl::ocl_interop::memory_kind::usm, (void *)(bin_input));
344330
args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(bin_post_id) | DNNL_ARG_SRC_1, bin_mem});
345331
}
346332
m_prim.execute(stream, args);
@@ -425,6 +411,7 @@ class MOEOptScatter : public KernelGenerator {
425411
}
426412
};
427413

414+
// Performance tuning parameters
428415
# define N_BLOCK 4
429416
# define SUBGROUP_NUM 8
430417

@@ -692,31 +679,31 @@ class MOEOptImpl : public PrimitiveImplOCL {
692679
internal_buffers.emplace_back(routing_layout, true); // 5: routing_weights
693680
internal_buffers.emplace_back(layout_gateup_out, true); // 6: gate, scratch.gate has same layout with up
694681
// expert masks for gpu
695-
layout index_layout(ov::PartialShape{batch}, ov::element::i32, cldnn::format::bfyx);
696-
for (int i = 0; i < expert_num; i++) {
697-
internal_buffers.emplace_back(index_layout, true); // 7: batch
698-
internal_buffers.emplace_back(index_layout, true); // 8: topk
699-
}
682+
layout index_layout(ov::PartialShape{expert_num, batch}, ov::element::i32, cldnn::format::bfyx);
683+
internal_buffers.emplace_back(index_layout, true); // 7: batch
684+
internal_buffers.emplace_back(index_layout, true); // 8: topk
700685

701686
return internal_buffers;
702687
}
703688

704-
void prepare_internal_buffers(typed_primitive_inst<moe_fused_compressed>& instance, scratch_buffers& scratch, bool is_single_batch) {
689+
void prepare_internal_buffers(typed_primitive_inst<moe_fused_compressed>& instance, scratch_buffers& scratch, size_t batch) {
705690
const auto& intermediates_memories = instance.get_intermediates_memories();
691+
auto& engine = instance.get_network().get_engine();
706692
scratch.topk_id = intermediates_memories[0];
707693
scratch.topk_weights = intermediates_memories[1];
708694
scratch.up = intermediates_memories[2];
709695
scratch.y = intermediates_memories[3];
710-
if (!is_single_batch) {
696+
if (batch > 1) {
711697
scratch.x = intermediates_memories[4];
712698
scratch.routing_weights = intermediates_memories[5];
713699
scratch.gate = intermediates_memories[6];
714700
const auto& config = instance.get_typed_desc<moe_fused_compressed>()->_config;
715701
int expert_num = static_cast<int>(config.num_expert);
716702
scratch.expert_masks.resize(expert_num);
717703
for (int i = 0; i < expert_num; i++) {
718-
scratch.expert_masks[i].batch = intermediates_memories[7 + 2 * i + 0];
719-
scratch.expert_masks[i].topk = intermediates_memories[7 + 2 * i + 1];
704+
auto mask_layout = cldnn::layout({static_cast<int>(batch)}, cldnn::data_types::i32, cldnn::format::get_default_format(1));
705+
scratch.expert_masks[i].batch = engine.create_subbuffer(*intermediates_memories[7], mask_layout, i * batch * sizeof(int32_t));
706+
scratch.expert_masks[i].topk = engine.create_subbuffer(*intermediates_memories[8], mask_layout, i * batch * sizeof(int32_t));
720707
}
721708
}
722709

@@ -1001,7 +988,7 @@ class MOEOptImpl : public PrimitiveImplOCL {
1001988
auto batch = static_cast<int>(hidden_states_layout.get_shape()[0]);
1002989

1003990
scratch_buffers scratch;
1004-
prepare_internal_buffers(instance, scratch, batch == 1);
991+
prepare_internal_buffers(instance, scratch, batch);
1005992

1006993
// softmax+topk
1007994
auto lws_size = cur_moe->_config.num_expert;
@@ -1050,7 +1037,6 @@ class MOEOptImpl : public PrimitiveImplOCL {
10501037
OPENVINO_ASSERT(false, "hidden_size=", hidden_size, " is not divisible by any of ", sizeof(candidate) / sizeof(size_t), " candidates");
10511038
};
10521039
lws_size = get_best_lws(_hidden_size);
1053-
// std::cout << "routing_mem_ptr layout: " << routing_mem_ptr->get_layout().to_short_string() << std::endl;
10541040

10551041
OPENVINO_ASSERT(batch != 1, "batch size shouldn't be 1 for this path!");
10561042
for (size_t expert_no = 0; expert_no < config.num_expert; expert_no++) {

src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_opt.hpp

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
using namespace cldnn; // TODO: Remove once namespaces are aligned
1616
namespace ov::intel_gpu::ocl {
1717

18-
// TODO: need confirm, gate is 1st matmul or up is 1st matmul?
1918
// mlp_gate: 0
2019
// mlp_up: 1
2120
// mlp_down: 2
@@ -57,19 +56,34 @@ struct MOEOpt : public ImplementationManager {
5756
return false;
5857
}
5958

60-
// Only support u4 weights for now
59+
// Only support weight: u4
6160
static constexpr std::array supported_wei_type = {
6261
ov::element::u4,
6362
};
6463
const auto& wei_layout = node.get_input_layout(static_cast<size_t>(MOEInputIndex::WEIGHT_0));
6564
if (!one_of(wei_layout.data_type, supported_wei_type)) {
6665
return false;
6766
}
68-
static bool first_time = true;
69-
if (first_time) {
70-
first_time = false;
71-
std::cout << "[ ocl::moe::opt ] validation passed!" << std::endl;
67+
68+
// Only support scale: f16
69+
static constexpr std::array supported_scale_type = {
70+
ov::element::f16,
71+
};
72+
const auto& scale_layout = node.get_input_layout(static_cast<size_t>(MOEInputIndex::SCALE_0));
73+
if (!one_of(scale_layout.data_type, supported_scale_type)) {
74+
return false;
75+
}
76+
77+
// Only support zp: u4
78+
static constexpr std::array supported_zp_type = {
79+
ov::element::u4,
80+
};
81+
const auto& zp_layout = node.get_input_layout(static_cast<size_t>(MOEInputIndex::ZP_0));
82+
if (!one_of(zp_layout.data_type, supported_zp_type)) {
83+
std::cout << "MOEOpt validate_impl: unsupported zp type " << zp_layout.to_string() << std::endl;
84+
return false;
7285
}
86+
7387
return true;
7488
}
7589
};

src/plugins/intel_gpu/src/plugin/transformations/convert_moe_to_compressed.cpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -217,13 +217,6 @@ ConvertMOEToMOECompressed::ConvertMOEToMOECompressed() {
217217
ov::copy_runtime_info(moe, moe_compressed);
218218
ov::replace_node(moe, moe_compressed);
219219

220-
static bool first_time = true;
221-
if (first_time) {
222-
first_time = false;
223-
std::cout << "[ ConvertMOEToMOECompressed ]: num_expert = " << config.num_expert << ", top_k = " << config.top_k
224-
<< ", hidden_size = " << config.hidden_size << ", inter_size = " << config.inter_size << ", group_size = " << config.group_size
225-
<< std::endl;
226-
}
227220
return true;
228221
};
229222

src/plugins/intel_gpu/src/plugin/transformations/fuse_moe_compressed.cpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -123,13 +123,6 @@ FuseMOECompressed::FuseMOECompressed() {
123123
ov::copy_runtime_info(moe_compressed, moe_fused_compressed);
124124
ov::replace_node(moe_compressed, moe_fused_compressed);
125125

126-
static bool first_time = true;
127-
if (first_time) {
128-
first_time = false;
129-
std::cout << "[ FuseMOECompressed ]: num_expert = " << config.num_expert << ", top_k = " << config.top_k << ", hidden_size = " << config.hidden_size
130-
<< ", inter_size = " << config.inter_size << ", group_size = " << config.group_size << std::endl;
131-
}
132-
133126
return true;
134127
};
135128

0 commit comments

Comments
 (0)