Skip to content

Commit 024511a

Browse files
committed
Fuse softmax_topk_oneshot with moe_compressed
1 parent 8db00f6 commit 024511a

File tree

13 files changed

+138
-149
lines changed

13 files changed

+138
-149
lines changed

src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,4 +311,4 @@ REGISTER_FACTORY(internal, PagedAttentionExtension);
311311
REGISTER_FACTORY(internal, LoraSubgraph);
312312
REGISTER_FACTORY(internal, LoraSubgraphFused);
313313
REGISTER_FACTORY(internal, VLSDPA);
314-
REGISTER_FACTORY(internal, MOECompressed);
314+
REGISTER_FACTORY(internal, MOEFusedCompressed);

src/plugins/intel_gpu/include/intel_gpu/primitives/moe_compressed.hpp renamed to src/plugins/intel_gpu/include/intel_gpu/primitives/moe_fused_compressed.hpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,45 +6,45 @@
66
#include <vector>
77

88
#include "intel_gpu/runtime/engine.hpp"
9-
#include "intel_gpu/op/moe_compressed.hpp"
9+
#include "intel_gpu/op/moe_fused_compressed.hpp"
1010
#include "primitive.hpp"
1111

1212
namespace cldnn {
13-
using MOECompressed = ov::intel_gpu::op::MOECompressed;
13+
using MOEFusedCompressed = ov::intel_gpu::op::MOEFusedCompressed;
1414

1515
/// @brief Fused MoE compressed primitive (renamed from moe_compressed by this change).
1616
/// @details Carries a MOEFusedCompressed::Config alongside the common primitive data.
/// NOTE(review): presumably the top-k routing step is fused into this primitive — confirm.
17-
struct moe_compressed : public primitive_base<moe_compressed> {
18-
CLDNN_DECLARE_PRIMITIVE(moe_compressed)
17+
struct moe_fused_compressed : public primitive_base<moe_fused_compressed> {
18+
CLDNN_DECLARE_PRIMITIVE(moe_fused_compressed)
1919

20-
moe_compressed() : primitive_base("", {}) {}
20+
/// @brief Default constructor: empty id and an empty input list; _config is not set here.
moe_fused_compressed() : primitive_base("", {}) {}
2121

2222
/// @brief Constructs the moe_fused_compressed primitive.
2323
///
2424
/// @param id An identifier of new primitive.
2525
/// @param inputs A list of Input primitive ids (inputs).
/// @param config Operation configuration; copied into the _config member.
26-
moe_compressed(const primitive_id& id, const std::vector<input_info>& inputs, const MOECompressed::Config& config)
26+
moe_fused_compressed(const primitive_id& id, const std::vector<input_info>& inputs, const MOEFusedCompressed::Config& config)
2727
// NOTE(review): the literal 1 and {optional_data_type()} are presumably the output count
// and the per-output data types expected by primitive_base — confirm.
: primitive_base(id, inputs, 1, {optional_data_type()}),
2828
_config(config) {}
2929

30-
MOECompressed::Config _config;
30+
/// Copy of the configuration passed to the constructor.
MOEFusedCompressed::Config _config;
3131

3232
/// @brief Equality check: common primitive parameters first, then the raw bytes of _config.
bool operator==(const primitive& rhs) const override {
3333
if (!compare_common_params(rhs))
3434
return false;
3535

36-
auto rhs_casted = downcast<const moe_compressed>(rhs);
36+
auto rhs_casted = downcast<const moe_fused_compressed>(rhs);
3737

3838
// NOTE(review): a byte-wise compare also compares any padding bytes inside Config, so two
// logically equal configs could compare unequal; presumably Config is a padding-free,
// trivially copyable struct — confirm, or compare field by field.
return std::memcmp(&_config, &rhs_casted._config, sizeof(_config)) == 0;
3939
}
4040

4141
/// @brief Serializes the base primitive, then _config as a raw block of sizeof(_config) bytes.
void save(BinaryOutputBuffer& ob) const override {
42-
primitive_base<moe_compressed>::save(ob);
42+
primitive_base<moe_fused_compressed>::save(ob);
4343
// NOTE(review): raw-byte serialization assumes Config holds no pointers or non-trivial
// members and has a stable layout between save and load — confirm.
ob << make_data(&_config, sizeof(_config));
4444
}
4545

4646
/// @brief Deserializes the base primitive, then reads sizeof(_config) raw bytes back into _config
/// (same order as save()).
void load(BinaryInputBuffer& ib) override {
47-
primitive_base<moe_compressed>::load(ib);
47+
primitive_base<moe_fused_compressed>::load(ib);
4848
ib >> make_data(&_config, sizeof(_config));
4949
}
5050
};

src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_opt.cl

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,9 +92,8 @@ KERNEL (gather_2d_ref)(
9292
#endif
9393

9494
if (off == 0) {
95-
// int top_idx = top_index[k];
96-
// dst_rweight[k] = src_rweight[top_idx];
97-
dst_rweight[k] = src_rweight[tok_idx];
95+
int top_idx = top_index[k];
96+
dst_rweight[k] = src_rweight[top_idx];
9897
}
9998
}
10099

src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_opt.cpp

Lines changed: 88 additions & 104 deletions
Large diffs are not rendered by default.

src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe_opt.hpp

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,15 @@ namespace ov::intel_gpu::ocl {
2222
// Named indices for the inputs of the MOE primitive.
// This change removes ROUTER_TOPK_OUTPUT_INDICES (formerly 2) and shifts every following
// entry down by one, so the enum now covers 11 inputs (0..10) instead of 12 (0..11).
// NOTE(review): WEIGHT_n / SCALE_n / ZP_n presumably denote the compressed weight, its scale
// and its zero-point for three separate weight tensors — confirm against the op definition.
enum class MOEInputIndex : uint8_t {
2323
HIDDEN_STATES = 0,
2424
ROUTING_WEIGHTS = 1,
25-
ROUTER_TOPK_OUTPUT_INDICES = 2,
26-
WEIGHT_0 = 3,
27-
SCALE_0 = 4,
28-
ZP_0 = 5,
29-
WEIGHT_1 = 6,
30-
SCALE_1 = 7,
31-
ZP_1 = 8,
32-
WEIGHT_2 = 9,
33-
SCALE_2 = 10,
34-
ZP_2 = 11
25+
WEIGHT_0 = 2,
26+
SCALE_0 = 3,
27+
ZP_0 = 4,
28+
WEIGHT_1 = 5,
29+
SCALE_1 = 6,
30+
ZP_1 = 7,
31+
WEIGHT_2 = 8,
32+
SCALE_2 = 9,
33+
ZP_2 = 10
3534
};
3635

3736
struct MOEOpt : public ImplementationManager {

src/plugins/intel_gpu/src/graph/include/moe_inst.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,21 @@
88
#include <string>
99
#include <vector>
1010

11-
#include "intel_gpu/primitives/moe_compressed.hpp"
11+
#include "intel_gpu/primitives/moe_fused_compressed.hpp"
1212
#include "primitive_inst.h"
1313

1414
namespace cldnn {
1515
namespace details {}
1616

1717
template <>
18-
struct typed_program_node<moe_compressed> : public typed_program_node_base<moe_compressed> {
18+
struct typed_program_node<moe_fused_compressed> : public typed_program_node_base<moe_fused_compressed> {
1919
private:
20-
using parent = typed_program_node_base<moe_compressed>;
20+
using parent = typed_program_node_base<moe_fused_compressed>;
2121

2222
public:
2323
using parent::parent;
2424

25-
typed_program_node(std::shared_ptr<moe_compressed> prim, program& prog) : parent(prim, prog) {}
25+
typed_program_node(std::shared_ptr<moe_fused_compressed> prim, program& prog) : parent(prim, prog) {}
2626

2727
using parent::get_kernel_impl_params;
2828
std::unique_ptr<kernel_impl_params> get_kernel_impl_params(const std::vector<layout>& in_layouts, const std::vector<layout>& out_layouts) const override {
@@ -32,11 +32,11 @@ struct typed_program_node<moe_compressed> : public typed_program_node_base<moe_c
3232
}
3333
};
3434

35-
using moe_node = typed_program_node<moe_compressed>;
35+
using moe_node = typed_program_node<moe_fused_compressed>;
3636

3737
template <>
38-
class typed_primitive_inst<moe_compressed> : public typed_primitive_inst_base<moe_compressed> {
39-
using parent = typed_primitive_inst_base<moe_compressed>;
38+
class typed_primitive_inst<moe_fused_compressed> : public typed_primitive_inst_base<moe_fused_compressed> {
39+
using parent = typed_primitive_inst_base<moe_fused_compressed>;
4040
using parent::parent;
4141
using primitive_inst::update_output_memory;
4242

@@ -48,5 +48,5 @@ class typed_primitive_inst<moe_compressed> : public typed_primitive_inst_base<mo
4848
typed_primitive_inst(network& network, const moe_node& node);
4949
};
5050

51-
using moe_inst = typed_primitive_inst<moe_compressed>;
51+
using moe_inst = typed_primitive_inst<moe_fused_compressed>;
5252
} // namespace cldnn

src/plugins/intel_gpu/src/graph/moe.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
#include "program_node.h"
1414

1515
namespace cldnn {
16-
GPU_DEFINE_PRIMITIVE_TYPE_ID(moe_compressed)
16+
GPU_DEFINE_PRIMITIVE_TYPE_ID(moe_fused_compressed)
1717

1818
/*
1919
Calc_output_layout method is called only when output layout is invalidated.

src/plugins/intel_gpu/src/graph/registry/moe_impls.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// SPDX-License-Identifier: Apache-2.0
33
//
44

5-
#include "intel_gpu/primitives/moe_compressed.hpp"
5+
#include "intel_gpu/primitives/moe_fused_compressed.hpp"
66
#include "primitive_inst.h"
77
#include "registry.hpp"
88

@@ -14,7 +14,7 @@ namespace ov::intel_gpu {
1414

1515
using namespace cldnn;
1616

17-
const std::vector<std::shared_ptr<cldnn::ImplementationManager>>& Registry<moe_compressed>::get_implementations() {
17+
const std::vector<std::shared_ptr<cldnn::ImplementationManager>>& Registry<moe_fused_compressed>::get_implementations() {
1818
static const std::vector<std::shared_ptr<ImplementationManager>> impls = {OV_GPU_CREATE_INSTANCE_OCL(ocl::MOEOpt, shape_types::any)};
1919

2020
return impls;

src/plugins/intel_gpu/src/graph/registry/registry.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ REGISTER_IMPLS(strided_slice);
166166
REGISTER_IMPLS(tile);
167167
REGISTER_IMPLS(col2im);
168168
REGISTER_IMPLS(vl_sdpa);
169-
REGISTER_IMPLS(moe_compressed);
169+
REGISTER_IMPLS(moe_fused_compressed);
170170

171171
REGISTER_DEFAULT_IMPLS(assign, CPU_S, CPU_D);
172172
REGISTER_DEFAULT_IMPLS(read_value, CPU_S, CPU_D);

src/plugins/intel_gpu/src/plugin/ops/moe.cpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,35 +2,36 @@
22
// SPDX-License-Identifier: Apache-2.0
33
//
44

5-
#include "intel_gpu/op/moe_compressed.hpp"
5+
#include "intel_gpu/op/moe_fused_compressed.hpp"
66
#include "intel_gpu/plugin/common_utils.hpp"
77
#include "intel_gpu/plugin/program_builder.hpp"
8-
#include "intel_gpu/primitives/moe_compressed.hpp"
8+
#include "intel_gpu/primitives/moe_fused_compressed.hpp"
9+
#include "intel_gpu/primitives/moe_fused_compressed.hpp"
910

1011

1112
namespace ov {
1213
namespace op {
1314
namespace internal {
14-
using MOECompressed = ov::intel_gpu::op::MOECompressed;
15+
using MOEFusedCompressed = ov::intel_gpu::op::MOEFusedCompressed ;
1516
} // namespace internal
1617
} // namespace op
1718
} // namespace ov
1819

1920
namespace ov::intel_gpu {
2021

21-
// Builds a cldnn::moe_fused_compressed primitive from a MOEFusedCompressed op and adds it to
// the program being built.
//   p  - program builder; used to resolve the op's inputs and to register the new primitive.
//   op - source op; supplies the configuration and the layer name.
// The op must have exactly 11 inputs (the previous MOECompressed op required 12).
static void CreateMOECompressedOp(ProgramBuilder& p, const std::shared_ptr<ov::intel_gpu::op::MOECompressed>& op) {
22+
static void CreateMOEFusedCompressedOp(ProgramBuilder& p, const std::shared_ptr<ov::intel_gpu::op::MOEFusedCompressed>& op) {
2223
auto inputs = p.GetInputInfo(op);
2324
const auto& config = op->get_config();
24-
OPENVINO_ASSERT(inputs.size() == 12, "Inputs count of MOECompressed should be 12");
25+
// Fail early if the graph transformation produced an op with an unexpected input count.
OPENVINO_ASSERT(inputs.size() == 11, "Inputs count of MOEFusedCompressed should be 11");
2526

2627
const std::string layerName = layer_type_name_ID(op);
2728
// auto& engine = p.get_engine();
2829

29-
const cldnn::moe_compressed moe(layerName, inputs, config);
30+
// The primitive stores its own copy of config (see the moe_fused_compressed constructor).
const cldnn::moe_fused_compressed moe(layerName, inputs, config);
3031

3132
p.add_primitive(*op, moe);
3233
}
3334

34-
REGISTER_FACTORY_IMPL(internal, MOECompressed);
35+
REGISTER_FACTORY_IMPL(internal, MOEFusedCompressed);
3536

3637
} // namespace ov::intel_gpu

0 commit comments

Comments
 (0)