44
55#include " moe_opt.hpp"
66
7- // #define ENABLE_ONEDNN_FOR_GPU
87#ifdef ENABLE_ONEDNN_FOR_GPU
98# include < initializer_list>
109# include < oneapi/dnnl/dnnl.hpp>
@@ -302,11 +301,9 @@ struct onednn_linear {
302301 if (scale) {
303302 // https://uxlfoundation.github.io/oneDNN/page_weights_decompression_matmul_cpp.html
304303 // Quantization Group size for scales. Must be divisible by 32.
305- auto wei_scale_md = dnnl::memory::desc (dnnl::memory::dims ({mm->m_K_groups , mm->m_N }), dnnl::memory::data_type::f16 , dnnl::memory::format_tag::ab);
306- linear.scale = scale; // dnnl::ocl_interop::make_memory(wei_scale_md, linear.m_engine, dnnl::ocl_interop::memory_kind::usm, scale);
304+ linear.scale = scale;
307305 if (zp) {
308- auto wei_zp_md = dnnl::memory::desc (dnnl::memory::dims ({mm->m_K_groups , mm->m_N }), mm->m_w_type , dnnl::memory::format_tag::ab);
309- linear.zp = zp; // dnnl::ocl_interop::make_memory(wei_zp_md, linear.m_engine, dnnl::ocl_interop::memory_kind::usm, zp);
306+ linear.zp = zp;
310307 }
311308 }
312309 return linear;
@@ -315,18 +312,8 @@ struct onednn_linear {
315312 void forward (dnnl::stream& stream, int m, dnnl::memory src_mem, dnnl::memory dst_mem, dnnl::memory bin_mem) {
316313 OV_ITT_SCOPED_TASK (ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle (" onednn_linear::forward()" ));
317314 dnnl::memory::dim M = m;
318-
319315 OPENVINO_ASSERT (m_batch == 0 || m_batch == M, " m_batch=" , m_batch, " M=" , M);
320316
321- dnnl::memory::desc rt_src_md = dnnl::memory::desc (dnnl::memory::dims ({M, m_K}), m_a_type, dnnl::memory::format_tag::ab);
322- dnnl::memory::desc rt_dst_md = dnnl::memory::desc (dnnl::memory::dims ({M, m_N}), m_a_type, dnnl::memory::format_tag::ab);
323- dnnl::memory::desc rt_bin_md;
324- if (mm->bin_per_row ) {
325- rt_bin_md = dnnl::memory::desc (dnnl::memory::dims ({M, 1 }), m_a_type, dnnl::memory::format_tag::ab);
326- } else {
327- rt_bin_md = dnnl::memory::desc (dnnl::memory::dims ({M, m_N}), m_a_type, dnnl::memory::format_tag::ab);
328- }
329-
330317 std::unordered_map<int , dnnl::memory> args;
331318 args.insert ({DNNL_ARG_SRC, src_mem});
332319 args.insert ({DNNL_ARG_WEIGHTS, weight});
@@ -340,7 +327,6 @@ struct onednn_linear {
340327 args.insert ({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS, zp});
341328 }
342329 if (bin_mem) {
343- // auto bin_mem = dnnl::ocl_interop::make_memory(rt_bin_md, m_engine, dnnl::ocl_interop::memory_kind::usm, (void *)(bin_input));
344330 args.insert ({DNNL_ARG_ATTR_MULTIPLE_POST_OP (bin_post_id) | DNNL_ARG_SRC_1, bin_mem});
345331 }
346332 m_prim.execute (stream, args);
@@ -425,6 +411,7 @@ class MOEOptScatter : public KernelGenerator {
425411 }
426412};
427413
414+ // Performance tuning parameters
428415# define N_BLOCK 4
429416# define SUBGROUP_NUM 8
430417
@@ -692,31 +679,31 @@ class MOEOptImpl : public PrimitiveImplOCL {
692679 internal_buffers.emplace_back (routing_layout, true ); // 5: routing_weights
693680 internal_buffers.emplace_back (layout_gateup_out, true ); // 6: gate, scratch.gate has same layout with up
694681 // expert masks for gpu
695- layout index_layout (ov::PartialShape{batch}, ov::element::i32 , cldnn::format::bfyx);
696- for (int i = 0 ; i < expert_num; i++) {
697- internal_buffers.emplace_back (index_layout, true ); // 7: batch
698- internal_buffers.emplace_back (index_layout, true ); // 8: topk
699- }
682+ layout index_layout (ov::PartialShape{expert_num, batch}, ov::element::i32 , cldnn::format::bfyx);
683+ internal_buffers.emplace_back (index_layout, true ); // 7: batch
684+ internal_buffers.emplace_back (index_layout, true ); // 8: topk
700685
701686 return internal_buffers;
702687 }
703688
704- void prepare_internal_buffers (typed_primitive_inst<moe_fused_compressed>& instance, scratch_buffers& scratch, bool is_single_batch ) {
689+ void prepare_internal_buffers (typed_primitive_inst<moe_fused_compressed>& instance, scratch_buffers& scratch, size_t batch ) {
705690 const auto & intermediates_memories = instance.get_intermediates_memories ();
691+ auto & engine = instance.get_network ().get_engine ();
706692 scratch.topk_id = intermediates_memories[0 ];
707693 scratch.topk_weights = intermediates_memories[1 ];
708694 scratch.up = intermediates_memories[2 ];
709695 scratch.y = intermediates_memories[3 ];
710- if (!is_single_batch ) {
696+ if (batch > 1 ) {
711697 scratch.x = intermediates_memories[4 ];
712698 scratch.routing_weights = intermediates_memories[5 ];
713699 scratch.gate = intermediates_memories[6 ];
714700 const auto & config = instance.get_typed_desc <moe_fused_compressed>()->_config ;
715701 int expert_num = static_cast <int >(config.num_expert );
716702 scratch.expert_masks .resize (expert_num);
717703 for (int i = 0 ; i < expert_num; i++) {
718- scratch.expert_masks [i].batch = intermediates_memories[7 + 2 * i + 0 ];
719- scratch.expert_masks [i].topk = intermediates_memories[7 + 2 * i + 1 ];
704+ auto mask_layout = cldnn::layout ({static_cast <int >(batch)}, cldnn::data_types::i32 , cldnn::format::get_default_format (1 ));
705+ scratch.expert_masks [i].batch = engine.create_subbuffer (*intermediates_memories[7 ], mask_layout, i * batch * sizeof (int32_t ));
706+ scratch.expert_masks [i].topk = engine.create_subbuffer (*intermediates_memories[8 ], mask_layout, i * batch * sizeof (int32_t ));
720707 }
721708 }
722709
@@ -1001,7 +988,7 @@ class MOEOptImpl : public PrimitiveImplOCL {
1001988 auto batch = static_cast <int >(hidden_states_layout.get_shape ()[0 ]);
1002989
1003990 scratch_buffers scratch;
1004- prepare_internal_buffers (instance, scratch, batch == 1 );
991+ prepare_internal_buffers (instance, scratch, batch);
1005992
1006993 // softmax+topk
1007994 auto lws_size = cur_moe->_config .num_expert ;
@@ -1050,7 +1037,6 @@ class MOEOptImpl : public PrimitiveImplOCL {
10501037 OPENVINO_ASSERT (false , " hidden_size=" , hidden_size, " is not divisible by any of " , sizeof (candidate) / sizeof (size_t ), " candidates" );
10511038 };
10521039 lws_size = get_best_lws (_hidden_size);
1053- // std::cout << "routing_mem_ptr layout: " << routing_mem_ptr->get_layout().to_short_string() << std::endl;
10541040
10551041 OPENVINO_ASSERT (batch != 1 , " batch size shouldn't be 1 for this path!" );
10561042 for (size_t expert_no = 0 ; expert_no < config.num_expert ; expert_no++) {
0 commit comments