
Commit e61e47a

Authored by riverlijunjie, chenhu-wang, zhaixuejun1993, zaixing-wang, and peterchen-intel
[GPU]qwen3 moe fused compressed (#32536)
### Details:
- Qwen3 MoE model support for weight fusion compression.
- MoE transformation pipeline: FuseVectorizedMOE3GEMM -> ConvertMOEToMOECompressed -> FuseMOECompressed.
- ov::intel_gpu::op::MOEFusedCompressed fuses softmax_topk/onehot into the MoE computation for performance optimization.
- The prefill stage uses a GEMM kernel to compute each expert's output one by one.
- The decode stage uses OCL kernels to compute the expert outputs in parallel.
- MoE exec graph: <img width="194" height="432" alt="image" src="https://github.com/user-attachments/assets/fb5fd9b9-3b56-43cc-a71c-27cd4b9cd0d2" />

### Tickets:
- CVS-169299

---------

Co-authored-by: chenhu-wang <[email protected]>
Co-authored-by: Xuejun,Zhai <[email protected]>
Co-authored-by: Zaixing,Wang <[email protected]>
Co-authored-by: Chen Peter <[email protected]>
1 parent cad9e8e commit e61e47a

27 files changed, +5572 −106 lines
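Since the PR description above only sketches the computation being fused, here is a plain-C++ reference of a single token passing through a top-k-routed MoE layer with 3-GEMM SWiGLU experts. This is purely illustrative and is not the plugin's code: the function names, shapes, and the exact routing normalization are simplifying assumptions.

```cpp
// Illustrative only: a scalar reference of one token going through a top-k
// routed MoE layer with 3-GEMM SWiGLU experts. This is NOT the plugin code;
// names, shapes, and the routing normalization are simplifying assumptions.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <numeric>
#include <vector>

using Matrix = std::vector<std::vector<float>>;  // [rows][cols]

// y = W * x for a [rows x cols] weight matrix and a [cols] input vector.
static std::vector<float> matvec(const Matrix& w, const std::vector<float>& x) {
    std::vector<float> y(w.size(), 0.0f);
    for (size_t r = 0; r < w.size(); ++r)
        for (size_t c = 0; c < x.size(); ++c)
            y[r] += w[r][c] * x[c];
    return y;
}

// expert(x) = w2 * (SiLU(w0 * x) * (w1 * x)); output = sum_k prob_k * expert_k(x)
std::vector<float> moe_token(const std::vector<float>& x,              // [hidden_size]
                             const std::vector<float>& router_logits,  // [num_experts]
                             const std::vector<Matrix>& w0,            // gate proj per expert
                             const std::vector<Matrix>& w1,            // up proj per expert
                             const std::vector<Matrix>& w2,            // down proj per expert
                             size_t top_k) {                           // assumes top_k <= num_experts
    // Softmax over the router logits (the softmax_topk routing mentioned above).
    std::vector<float> probs(router_logits.size());
    const float max_logit = *std::max_element(router_logits.begin(), router_logits.end());
    float denom = 0.0f;
    for (size_t e = 0; e < probs.size(); ++e) {
        probs[e] = std::exp(router_logits[e] - max_logit);
        denom += probs[e];
    }
    for (float& p : probs)
        p /= denom;

    // Pick the top_k experts by routing probability.
    std::vector<size_t> order(probs.size());
    std::iota(order.begin(), order.end(), 0);
    std::partial_sort(order.begin(), order.begin() + top_k, order.end(),
                      [&](size_t a, size_t b) { return probs[a] > probs[b]; });

    // Weighted sum of the selected experts' SWiGLU MLP outputs.
    std::vector<float> out(x.size(), 0.0f);
    for (size_t k = 0; k < top_k; ++k) {
        const size_t e = order[k];
        auto gate = matvec(w0[e], x);  // first projection  -> [inter_size]
        auto up = matvec(w1[e], x);    // second projection -> [inter_size]
        for (size_t i = 0; i < gate.size(); ++i)
            gate[i] = gate[i] / (1.0f + std::exp(-gate[i])) * up[i];  // SiLU(gate) * up
        auto down = matvec(w2[e], gate);  // final projection -> [hidden_size]
        for (size_t i = 0; i < out.size(); ++i)
            out[i] += probs[e] * down[i];
    }
    return out;
}
```

The fused GPU op introduced by this commit performs the same computation on compressed expert weights, computing experts one by one with a GEMM kernel at prefill and in parallel with OCL kernels at decode, as described in the details above.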

src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp

Lines changed: 2 additions & 3 deletions
@@ -291,10 +291,9 @@ bool ov::pass::MOCTransformations::run_on_model(const std::shared_ptr<ov::Model>
     REGISTER_PASS(manager, ConstantFolding)
     REGISTER_PASS(manager, SymbolicOptimizations)
     REGISTER_PASS(manager, ResolveNameCollisions, true);
-    // todo: enable after plugin support for MoE
-    // Remove pytestmark to enable e2e test:
+    // TODO: Remove pytestmark to enable e2e test:
     // tests/model_hub_tests/transformation_tests/test_moe_transformation.py
-    // REGISTER_PASS(manager, FuseMOE)
+    REGISTER_PASS(manager, FuseMOE)
     REGISTER_PASS(manager, VectorizedMOE2GEMMTransposeWeights)
 
     manager.run_passes(f);
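With the pass re-enabled above, MoE fusion now runs as part of MOCTransformations. For reference, a minimal sketch of running it standalone through the standard ov::pass::Manager API is shown below; the include path for the FuseMOE header is an assumption, since it is not visible in this diff.

```cpp
#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/pass/manager.hpp"
// Assumed location of the FuseMOE pass header; not shown in this diff.
#include "transformations/common_optimizations/fuse_moe.hpp"

// Run only the MoE fusion pass on an already loaded ov::Model.
void fuse_moe_standalone(const std::shared_ptr<ov::Model>& model) {
    ov::pass::Manager manager;
    manager.register_pass<ov::pass::FuseMOE>();
    manager.run_passes(model);
}
```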
src/plugins/intel_gpu/include/intel_gpu/op/moe_3gemm_fused_compressed.hpp

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "intel_gpu/op/moe_compressed.hpp"

namespace ov::intel_gpu::op {

/// \brief MOE3GemmFusedCompressed that supports compressed and fused MOE for GEMM3_SWIGLU.
class MOE3GemmFusedCompressed : public MOECompressed {
public:
    OPENVINO_OP("MOE3GemmFusedCompressed", "gpu_opset", MOECompressed);

    MOE3GemmFusedCompressed() = default;

    /// \brief Constructs a MOE3GemmFusedCompressed operation with config only
    /// \param args The input tensors, in the following order:
    ///        0:  hidden_states - input tensor with hidden representations
    ///        1:  routing_weights - [num_seq, num_experts] routing weights for all experts
    ///        2:  w0_weight - expert weights for first projection,
    ///            shape [num_experts, inter_size, group_num, group_size]
    ///        3:  w0_scale - expert scale for first projection for compressed experts,
    ///            shape [num_experts, inter_size, group_num, 1]
    ///        4:  w0_zp - expert zp for first projection for compressed experts,
    ///            shape [num_experts, inter_size, group_num, 1]
    ///        5:  w1_weight - expert weights for second projection,
    ///            shape [num_experts, inter_size, group_num, group_size]
    ///        6:  w1_scale - expert scale for second projection for compressed experts,
    ///            shape [num_experts, inter_size, group_num, 1]
    ///        7:  w1_zp - expert zp for second projection for compressed experts,
    ///            shape [num_experts, inter_size, group_num, 1]
    ///        8:  w2_weight - expert weights for final projection,
    ///            shape [num_experts, hidden_size, group_num, group_size]
    ///        9:  w2_scale - expert scale for final projection for compressed experts,
    ///            shape [num_experts, hidden_size, group_num, 1]
    ///        10: w2_zp - expert zp for final projection for compressed experts,
    ///            shape [num_experts, hidden_size, group_num, 1]
    /// \param config Configuration for the MOE 3GEMM SWIGLU fused operation
    MOE3GemmFusedCompressed(const OutputVector& args, const MOECompressed::Config config);

    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
};

}  // namespace ov::intel_gpu::op
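The scale/zp operands documented above describe group-wise asymmetric quantization of the expert weights: each group of group_size values shares one scale and one zero point, which is why the scale/zp shapes end in [..., group_num, 1]. As a rough illustration (not part of this header), below is a minimal sketch of expanding one quantized [group_num, group_size] row back to full precision; the dequantize_row helper and the uint8_t storage type are assumptions for the example.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// w_fp[g*group_size + i] = (w_q[g*group_size + i] - zp[g]) * scale[g]
// One scale/zero-point per group, matching the trailing "1" in the
// [..., group_num, 1] scale/zp shapes documented above.
std::vector<float> dequantize_row(const std::vector<uint8_t>& w_q,  // [group_num * group_size]
                                  const std::vector<float>& scale,  // [group_num]
                                  const std::vector<float>& zp,     // [group_num]
                                  size_t group_num,
                                  size_t group_size) {
    std::vector<float> w_fp(group_num * group_size);
    for (size_t g = 0; g < group_num; ++g) {
        for (size_t i = 0; i < group_size; ++i) {
            const size_t idx = g * group_size + i;
            w_fp[idx] = (static_cast<float>(w_q[idx]) - zp[g]) * scale[g];
        }
    }
    return w_fp;
}
```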

src/plugins/intel_gpu/include/intel_gpu/op/moe_compressed.hpp

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ class MOECompressed : public ov::op::internal::MOE {
     OPENVINO_OP("MOECompressed", "gpu_opset", ov::op::internal::MOE);
 
     MOECompressed() = default;
+    MOECompressed(const OutputVector& args) : MOE(args) {}
 
     struct Config : public MOE::Config {
         size_t hidden_size = 0;

src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp

Lines changed: 1 addition & 0 deletions
@@ -312,4 +312,5 @@ REGISTER_FACTORY(internal, PagedAttentionExtension);
 REGISTER_FACTORY(internal, LoraSubgraph);
 REGISTER_FACTORY(internal, LoraSubgraphFused);
 REGISTER_FACTORY(internal, VLSDPA);
+REGISTER_FACTORY(internal, MOE3GemmFusedCompressed);
 REGISTER_FACTORY(internal, MOECompressed);
src/plugins/intel_gpu/include/intel_gpu/primitives/moe_3gemm_fused_compressed.hpp

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#include <vector>

#include "intel_gpu/op/moe_3gemm_fused_compressed.hpp"
#include "intel_gpu/runtime/engine.hpp"
#include "primitive.hpp"

namespace cldnn {
using MOE3GemmFusedCompressed = ov::intel_gpu::op::MOE3GemmFusedCompressed;

/// @brief moe compressed primitive
/// @details Performs moe compressed
struct moe_3gemm_fused_compressed : public primitive_base<moe_3gemm_fused_compressed> {
    CLDNN_DECLARE_PRIMITIVE(moe_3gemm_fused_compressed)

    moe_3gemm_fused_compressed() : primitive_base("", {}) {}

    // @brief Constructs moe primitive / layer.
    //
    // @param id An identifier of new primitive.
    // @param inputs A list of input primitive ids (inputs):
    //        0:  hidden_states - input tensor with hidden representations
    //        1:  routing_weights - [num_seq, num_experts] routing weights for all experts
    //        2:  w0_weight - expert weights for first projection,
    //            shape [num_experts, inter_size, group_num, group_size]
    //        3:  w0_scale - expert scale for first projection for compressed experts,
    //            shape [num_experts, inter_size, group_num, 1]
    //        4:  w0_zp - expert zp for first projection for compressed experts,
    //            shape [num_experts, inter_size, group_num, 1]
    //        5:  w1_weight - expert weights for second projection,
    //            shape [num_experts, inter_size, group_num, group_size]
    //        6:  w1_scale - expert scale for second projection for compressed experts,
    //            shape [num_experts, inter_size, group_num, 1]
    //        7:  w1_zp - expert zp for second projection for compressed experts,
    //            shape [num_experts, inter_size, group_num, 1]
    //        8:  w2_weight - expert weights for final projection,
    //            shape [num_experts, hidden_size, group_num, group_size]
    //        9:  w2_scale - expert scale for final projection for compressed experts,
    //            shape [num_experts, hidden_size, group_num, 1]
    //        10: w2_zp - expert zp for final projection for compressed experts,
    //            shape [num_experts, hidden_size, group_num, 1]
    moe_3gemm_fused_compressed(const primitive_id& id, const std::vector<input_info>& inputs, const MOE3GemmFusedCompressed::Config& config)
        : primitive_base(id, inputs, 1, {optional_data_type()}),
          _config(config) {}

    MOE3GemmFusedCompressed::Config _config;

    bool operator==(const primitive& rhs) const override {
        if (!compare_common_params(rhs))
            return false;

        auto rhs_casted = downcast<const moe_3gemm_fused_compressed>(rhs);

        return std::memcmp(&_config, &rhs_casted._config, sizeof(_config)) == 0;
    }

    void save(BinaryOutputBuffer& ob) const override {
        primitive_base<moe_3gemm_fused_compressed>::save(ob);
        ob << make_data(&_config, sizeof(_config));
    }

    void load(BinaryInputBuffer& ib) override {
        primitive_base<moe_3gemm_fused_compressed>::load(ib);
        ib >> make_data(&_config, sizeof(_config));
    }
};

}  // namespace cldnn
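For completeness, a rough sketch of how this primitive could be instantiated and added to a cldnn::topology, with the eleven inputs in the documented order. This is not code from the commit: the input primitive ids are placeholders, the Config is left default-constructed, the header paths are assumptions, and the surrounding network setup is omitted.

```cpp
#include <vector>

#include "intel_gpu/graph/topology.hpp"
// Assumed location of the primitive header defined above.
#include "intel_gpu/primitives/moe_3gemm_fused_compressed.hpp"

// Adds a moe_3gemm_fused_compressed primitive whose inputs follow the
// documented order: hidden_states, routing_weights, then weight/scale/zp
// triplets for the three projections.
void add_moe_3gemm_sketch(cldnn::topology& topology) {
    std::vector<cldnn::input_info> inputs = {
        cldnn::input_info("hidden_states"),
        cldnn::input_info("routing_weights"),
        cldnn::input_info("w0_weight"), cldnn::input_info("w0_scale"), cldnn::input_info("w0_zp"),
        cldnn::input_info("w1_weight"), cldnn::input_info("w1_scale"), cldnn::input_info("w1_zp"),
        cldnn::input_info("w2_weight"), cldnn::input_info("w2_scale"), cldnn::input_info("w2_zp"),
    };

    // Placeholder config: real values (hidden_size, expert counts, ...) come
    // from the fused ov::intel_gpu::op::MOE3GemmFusedCompressed node.
    cldnn::MOE3GemmFusedCompressed::Config config{};
    topology.add(cldnn::moe_3gemm_fused_compressed("moe", inputs, config));
}
```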
