Skip to content

Commit 79d6a13

Browse files
committed
add transformations test
1 parent b277578 commit 79d6a13

File tree

2 files changed

+279
-0
lines changed

2 files changed

+279
-0
lines changed
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
// Copyright (C) 2018-2025 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include <gtest/gtest.h>
6+
7+
#include "common_test_utils/ov_test_utils.hpp"
8+
#include "intel_gpu/op/moe_compressed.hpp"
9+
#include "intel_gpu/op/placeholder.hpp"
10+
#include "openvino/op/add.hpp"
11+
#include "openvino/op/concat.hpp"
12+
#include "openvino/op/convert.hpp"
13+
#include "openvino/op/matmul.hpp"
14+
#include "openvino/op/moe.hpp"
15+
#include "openvino/op/multiply.hpp"
16+
#include "openvino/op/reshape.hpp"
17+
#include "openvino/op/shape_of.hpp"
18+
#include "openvino/op/split.hpp"
19+
#include "openvino/op/subtract.hpp"
20+
#include "openvino/op/transpose.hpp"
21+
#include "openvino/op/variadic_split.hpp"
22+
#include "plugin/transformations/convert_moe_to_compressed.hpp"
23+
24+
using namespace testing;
25+
using namespace ov::intel_gpu;
26+
27+
namespace ov {
28+
namespace test {
29+
namespace intel_gpu {
30+
TEST_F(TransformationTestsF, ConvertMOEToMOECompressedTest) {
    // Checks ConvertMOEToMOECompressed: an internal MOE node whose three expert weight
    // inputs come from u4 decompression chains (Convert -> Subtract -> Multiply -> Reshape
    // -> Convert) is expected to become an intel_gpu MOECompressed node that consumes the
    // raw u4 weights plus reshaped and transposed scales / zero-points.
    //
    // Fixture helper; by its name it switches off runtime-info comparison for this test.
    disable_rt_info_check();

    using Parameter = ov::op::v0::Parameter;
    using Constant = ov::op::v0::Constant;
    using Convert = ov::op::v0::Convert;
    using Subtract = ov::op::v1::Subtract;
    using Multiply = ov::op::v1::Multiply;
    using Reshape = ov::op::v1::Reshape;
    using Transpose = ov::op::v1::Transpose;

    // tokens:32, hidden_size:2048, inter_size:768, experts:128, topk:8
    const Shape hidden_shape{32, 2048};
    const Shape routing_weights_shape{128, 1, 32, 1};
    const Shape routing_idx_shape{32, 8};
    const Shape gate_up_weights_shape{128, 768, 16, 128};
    const Shape gate_up_group_shape{128, 768, 16, 1};
    const Shape down_weights_shape{128, 2048, 6, 128};
    const Shape down_group_shape{128, 2048, 6, 1};

    {
        // ---- Model handed to the pass ----
        auto hidden = std::make_shared<Parameter>(element::f16, hidden_shape);
        auto router_weights = std::make_shared<Parameter>(element::f16, routing_weights_shape);
        auto router_indices = std::make_shared<Parameter>(element::i32, routing_idx_shape);

        // Gate projection: (u4 weights - zero-point) * scale, flattened to [128, 768, 2048], then f32.
        auto gate_w = Constant::create(element::u4, gate_up_weights_shape, {1});
        auto gate_zp = Constant::create(element::u4, gate_up_group_shape, {0});
        auto gate_scale = Constant::create(element::f16, gate_up_group_shape, {0.01f});
        auto gate_target_shape = Constant::create(element::i32, Shape{3}, {128, 768, 2048});

        auto gate_w_f16 = std::make_shared<Convert>(gate_w, element::f16);
        auto gate_zp_f16 = std::make_shared<Convert>(gate_zp, element::f16);
        auto gate_shifted = std::make_shared<Subtract>(gate_w_f16, gate_zp_f16);
        auto gate_scaled = std::make_shared<Multiply>(gate_shifted, gate_scale);
        auto gate_flat = std::make_shared<Reshape>(gate_scaled, gate_target_shape, false);
        auto gate_f32 = std::make_shared<Convert>(gate_flat, element::f32);

        // Up projection: same chain and shapes as the gate projection.
        auto up_w = Constant::create(element::u4, gate_up_weights_shape, {1});
        auto up_zp = Constant::create(element::u4, gate_up_group_shape, {0});
        auto up_scale = Constant::create(element::f16, gate_up_group_shape, {0.01f});
        auto up_target_shape = Constant::create(element::i32, Shape{3}, {128, 768, 2048});

        auto up_w_f16 = std::make_shared<Convert>(up_w, element::f16);
        auto up_zp_f16 = std::make_shared<Convert>(up_zp, element::f16);
        auto up_shifted = std::make_shared<Subtract>(up_w_f16, up_zp_f16);
        auto up_scaled = std::make_shared<Multiply>(up_shifted, up_scale);
        auto up_flat = std::make_shared<Reshape>(up_scaled, up_target_shape, false);
        auto up_f32 = std::make_shared<Convert>(up_flat, element::f32);

        // Down projection: same chain, flattened to [128, 2048, 768].
        auto down_w = Constant::create(element::u4, down_weights_shape, {1});
        auto down_zp = Constant::create(element::u4, down_group_shape, {0});
        auto down_scale = Constant::create(element::f16, down_group_shape, {0.01f});
        auto down_target_shape = Constant::create(element::i32, Shape{3}, {128, 2048, 768});

        auto down_w_f16 = std::make_shared<Convert>(down_w, element::f16);
        auto down_zp_f16 = std::make_shared<Convert>(down_zp, element::f16);
        auto down_shifted = std::make_shared<Subtract>(down_w_f16, down_zp_f16);
        auto down_scaled = std::make_shared<Multiply>(down_shifted, down_scale);
        auto down_flat = std::make_shared<Reshape>(down_scaled, down_target_shape, false);
        auto down_f32 = std::make_shared<Convert>(down_flat, element::f32);

        // Internal MOE node fed by the three decompressed weight tensors.
        ov::op::internal::MOE::Config moe_config;
        moe_config.expert_type = ov::op::internal::MOE::Expert_type::GEMM3_SWIGLU;
        const ov::OutputVector moe_inputs{hidden, router_weights, router_indices, gate_f32, up_f32, down_f32};
        auto moe_node = std::make_shared<ov::op::internal::MOE>(moe_inputs, moe_config);

        model = std::make_shared<ov::Model>(moe_node, ov::ParameterVector{hidden, router_weights, router_indices});
        manager.register_pass<ConvertMOEToMOECompressed>();
    }

    {
        // ---- Reference model ----
        auto hidden = std::make_shared<Parameter>(element::f16, hidden_shape);
        auto router_weights = std::make_shared<Parameter>(element::f16, routing_weights_shape);
        auto router_indices = std::make_shared<Parameter>(element::i32, routing_idx_shape);

        // Gate and up projections share the target-shape and permutation constants:
        // zero-points / scales are reshaped to {128, 768, 16} and permuted with {0, 2, 1}.
        auto gate_up_target_shape = Constant::create(element::i32, Shape{3}, {128, 768, 16});
        auto gate_up_order = Constant::create(element::i32, Shape{3}, {0, 2, 1});

        auto gate_w = Constant::create(element::u4, gate_up_weights_shape, {1});
        auto gate_zp = Constant::create(element::u4, gate_up_group_shape, {0});
        auto gate_scale = Constant::create(element::f16, gate_up_group_shape, {0.01f});
        auto gate_zp_3d = std::make_shared<Reshape>(gate_zp, gate_up_target_shape, false);
        auto gate_zp_t = std::make_shared<Transpose>(gate_zp_3d, gate_up_order);
        auto gate_scale_3d = std::make_shared<Reshape>(gate_scale, gate_up_target_shape, false);
        auto gate_scale_t = std::make_shared<Transpose>(gate_scale_3d, gate_up_order);

        auto up_w = Constant::create(element::u4, gate_up_weights_shape, {1});
        auto up_zp = Constant::create(element::u4, gate_up_group_shape, {0});
        auto up_scale = Constant::create(element::f16, gate_up_group_shape, {0.01f});
        auto up_zp_3d = std::make_shared<Reshape>(up_zp, gate_up_target_shape, false);
        auto up_zp_t = std::make_shared<Transpose>(up_zp_3d, gate_up_order);
        auto up_scale_3d = std::make_shared<Reshape>(up_scale, gate_up_target_shape, false);
        auto up_scale_t = std::make_shared<Transpose>(up_scale_3d, gate_up_order);

        // Down projection: zero-points / scales reshaped to {128, 2048, 6}, permuted with {0, 2, 1}.
        auto down_w = Constant::create(element::u4, down_weights_shape, {1});
        auto down_zp = Constant::create(element::u4, down_group_shape, {0});
        auto down_scale = Constant::create(element::f16, down_group_shape, {0.01f});
        auto down_target_shape = Constant::create(element::i32, Shape{3}, {128, 2048, 6});
        auto down_order = Constant::create(element::i32, Shape{3}, {0, 2, 1});
        auto down_zp_3d = std::make_shared<Reshape>(down_zp, down_target_shape, false);
        auto down_zp_t = std::make_shared<Transpose>(down_zp_3d, down_order);
        auto down_scale_3d = std::make_shared<Reshape>(down_scale, down_target_shape, false);
        auto down_scale_t = std::make_shared<Transpose>(down_scale_3d, down_order);

        ov::intel_gpu::op::MOECompressed::Config compressed_config;
        compressed_config.hidden_size = 2048;
        compressed_config.inter_size = 768;
        compressed_config.num_expert = 128;
        compressed_config.top_k = 8;
        compressed_config.group_size = 128;
        compressed_config.out_type = ov::element::f16;

        // Input order: activations and routing first, then (weights, scale, zero-point)
        // for gate, up and down.
        const ov::OutputVector compressed_inputs{hidden, router_weights, router_indices,
                                                 gate_w, gate_scale_t, gate_zp_t,
                                                 up_w, up_scale_t, up_zp_t,
                                                 down_w, down_scale_t, down_zp_t};
        auto compressed_node =
            std::make_shared<ov::intel_gpu::op::MOECompressed>(compressed_inputs, compressed_config);

        model_ref = std::make_shared<ov::Model>(compressed_node,
                                                ov::ParameterVector{hidden, router_weights, router_indices});
    }
}
137+
} // namespace intel_gpu
138+
} // namespace test
139+
} // namespace ov
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
// Copyright (C) 2018-2025 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include <gtest/gtest.h>
6+
7+
#include "common_test_utils/ov_test_utils.hpp"
8+
#include "intel_gpu/op/moe_compressed.hpp"
9+
#include "intel_gpu/op/moe_fused_compressed.hpp"
10+
#include "openvino/op/broadcast.hpp"
11+
#include "openvino/op/concat.hpp"
12+
#include "openvino/op/constant.hpp"
13+
#include "openvino/op/convert.hpp"
14+
#include "openvino/op/divide.hpp"
15+
#include "openvino/op/gather.hpp"
16+
#include "openvino/op/matmul.hpp"
17+
#include "openvino/op/multiply.hpp"
18+
#include "openvino/op/reduce_sum.hpp"
19+
#include "openvino/op/reshape.hpp"
20+
#include "openvino/op/scatter_elements_update.hpp"
21+
#include "openvino/op/shape_of.hpp"
22+
#include "openvino/op/softmax.hpp"
23+
#include "openvino/op/subtract.hpp"
24+
#include "openvino/op/topk.hpp"
25+
#include "openvino/op/transpose.hpp"
26+
#include "openvino/op/unsqueeze.hpp"
27+
#include "plugin/transformations/fuse_moe_compressed.hpp"
28+
29+
using namespace testing;
30+
using namespace ov::intel_gpu;
31+
32+
namespace ov {
33+
namespace test {
34+
namespace intel_gpu {
35+
TEST_F(TransformationTestsF, FuseMOECompressedTest) {
    // Checks FuseMOECompressed: the input graph computes routing explicitly
    // (MatMul -> Softmax -> TopK -> normalisation -> ScatterElementsUpdate -> Transpose ->
    // Reshape -> Unsqueeze) and feeds it into MOECompressed. The reference graph expects a
    // single MOEFusedCompressed node that takes the hidden states and the router MatMul
    // output directly, followed by the same nine compressed-weight inputs.
    //
    // tokens:32, hidden_size:2048, inter_size:768, experts:128, topk:8
    //
    // The weight / scale / zero-point shapes are named once and shared by both graphs so the
    // two sides cannot drift apart. The up-projection zero-point uses the same shape as its
    // scale, matching the gate and down projections.
    const Shape gate_up_weights_shape{128, 768, 16, 128};
    const Shape gate_up_group_shape{128, 16, 768};
    const Shape down_weights_shape{128, 2048, 6, 128};
    const Shape down_group_shape{128, 6, 2048};

    {
        // ---- Model handed to the pass ----
        auto hidden_states = std::make_shared<ov::op::v0::Parameter>(element::f16, Shape{32, 2048});
        auto routers = op::v0::Constant::create(element::f16, Shape{2048, 128}, {0.2});
        auto routing_weights = std::make_shared<ov::op::v0::MatMul>(hidden_states, routers);

        // Softmax over the expert axis, then pick the top-k (k = 8) experts per token.
        auto softmax = std::make_shared<ov::op::v8::Softmax>(routing_weights, 1);
        auto k = op::v0::Constant::create(element::i32, Shape{}, {8});
        auto topk = std::make_shared<ov::op::v11::TopK>(softmax, k, 1,
            ov::op::v11::TopK::Mode::MAX, ov::op::v11::TopK::SortType::SORT_VALUES);

        // weight output: top-k values divided by their per-token sum
        auto reduce_axis = op::v0::Constant::create(element::i64, Shape{1}, {1});
        auto reduce_sum = std::make_shared<ov::op::v1::ReduceSum>(topk->output(0), reduce_axis->output(0), true);
        auto norm = std::make_shared<ov::op::v1::Divide>(topk->output(0), reduce_sum->output(0));

        // 32: token count taken from the shape of the top-k index output
        auto shape_of = std::make_shared<ov::op::v3::ShapeOf>(topk->output(1));  // [2]{32, 8}
        auto gather_idx = op::v0::Constant::create(element::i64, Shape{}, {0});
        auto gather_axis = op::v0::Constant::create(element::i64, Shape{}, {0});
        auto gather = std::make_shared<ov::op::v8::Gather>(shape_of, gather_idx, gather_axis);  // scalar: 32
        auto const_unsqueeze = op::v0::Constant::create(element::i64, Shape{1}, {0});
        auto unsqueeze = std::make_shared<ov::op::v0::Unsqueeze>(gather, const_unsqueeze);  // [1]{32}

        // 128: expert count as a constant
        auto const0 = op::v0::Constant::create(element::i64, Shape{}, {128});
        auto const1 = op::v0::Constant::create(element::i64, Shape{1}, {0});
        auto unsqueeze1 = std::make_shared<ov::op::v0::Unsqueeze>(const0, const1);  // [1]{128}
        auto concat = std::make_shared<ov::op::v0::Concat>(OutputVector{unsqueeze, unsqueeze1}, 0);  // [2]{32,128}
        auto const3 = op::v0::Constant::create(element::i64, Shape{1}, {1});
        auto concat1 = std::make_shared<ov::op::v0::Concat>(OutputVector{unsqueeze1, unsqueeze, const3}, 0);

        // [32, 128]: zero-filled tensor that receives the normalised top-k weights
        auto zero = op::v0::Constant::create(element::f16, Shape{1}, {0});
        auto bc = std::make_shared<ov::op::v3::Broadcast>(zero, concat);
        auto scatter_axis = op::v0::Constant::create(element::i64, Shape{1}, {1});
        auto scatter = std::make_shared<ov::op::v12::ScatterElementsUpdate>(bc,               // [32, 128]
                                                                            topk->output(1),  // [32, 8]
                                                                            norm,             // [32, 8]
                                                                            scatter_axis,     // [1]
                                                                            ov::op::v12::ScatterElementsUpdate::Reduction::SUM);
        auto transpose_shape = op::v0::Constant::create(element::i64, Shape{2}, {1, 0});
        auto transpose = std::make_shared<ov::op::v1::Transpose>(scatter, transpose_shape);  // [128, 32]
        auto reshape = std::make_shared<ov::op::v1::Reshape>(transpose, concat1, false);
        auto unsqueeze_const = op::v0::Constant::create(element::i64, Shape{1}, {3});
        // NOTE(review): this line was originally annotated [128, 1, 32, 1], but concat1 is
        // built as {128, 32, 1}, which would give [128, 32, 1, 1] after this Unsqueeze.
        // Confirm which layout the pass expects before touching the concat order.
        auto unsqueeze_moe = std::make_shared<ov::op::v0::Unsqueeze>(reshape, unsqueeze_const);

        // Compressed expert weights: u4 weights with f16 scales and u4 zero-points.
        auto wei_gate = op::v0::Constant::create(element::u4, gate_up_weights_shape, {1});
        auto scale_gate = op::v0::Constant::create(element::f16, gate_up_group_shape, {0.01f});
        auto zp_gate = op::v0::Constant::create(element::u4, gate_up_group_shape, {0});
        auto wei_up = op::v0::Constant::create(element::u4, gate_up_weights_shape, {1});
        auto scale_up = op::v0::Constant::create(element::f16, gate_up_group_shape, {0.01f});
        auto zp_up = op::v0::Constant::create(element::u4, gate_up_group_shape, {0});
        auto wei_down = op::v0::Constant::create(element::u4, down_weights_shape, {1});
        auto scale_down = op::v0::Constant::create(element::f16, down_group_shape, {0.01f});
        auto zp_down = op::v0::Constant::create(element::u4, down_group_shape, {0});

        ov::intel_gpu::op::MOECompressed::Config config;
        config.hidden_size = 2048;
        config.inter_size = 768;
        config.num_expert = 128;
        config.group_size = 128;
        config.top_k = 8;
        config.out_type = ov::element::f16;
        auto moe_compressed = std::make_shared<ov::intel_gpu::op::MOECompressed>(
            ov::OutputVector{hidden_states, unsqueeze_moe, topk->output(1),
                wei_gate, scale_gate, zp_gate, wei_up, scale_up, zp_up, wei_down, scale_down, zp_down}, config);
        model = std::make_shared<ov::Model>(moe_compressed, ov::ParameterVector{hidden_states});
        manager.register_pass<FuseMOECompressed>();
    }
    {
        // ---- Reference model ----
        // Only the router MatMul remains in front of the fused node.
        auto hidden_states = std::make_shared<ov::op::v0::Parameter>(element::f16, Shape{32, 2048});
        auto routers = op::v0::Constant::create(element::f16, Shape{2048, 128}, {0.2});
        auto routing_weights = std::make_shared<ov::op::v0::MatMul>(hidden_states, routers);

        // Compressed expert weights: same shapes and values as in the input model.
        auto wei_gate = op::v0::Constant::create(element::u4, gate_up_weights_shape, {1});
        auto scale_gate = op::v0::Constant::create(element::f16, gate_up_group_shape, {0.01f});
        auto zp_gate = op::v0::Constant::create(element::u4, gate_up_group_shape, {0});
        auto wei_up = op::v0::Constant::create(element::u4, gate_up_weights_shape, {1});
        auto scale_up = op::v0::Constant::create(element::f16, gate_up_group_shape, {0.01f});
        auto zp_up = op::v0::Constant::create(element::u4, gate_up_group_shape, {0});
        auto wei_down = op::v0::Constant::create(element::u4, down_weights_shape, {1});
        auto scale_down = op::v0::Constant::create(element::f16, down_group_shape, {0.01f});
        auto zp_down = op::v0::Constant::create(element::u4, down_group_shape, {0});

        ov::intel_gpu::op::MOECompressed::Config config;
        config.hidden_size = 2048;
        config.inter_size = 768;
        config.num_expert = 128;
        config.group_size = 128;
        config.top_k = 8;
        config.out_type = ov::element::f16;
        auto moe_fused_compressed = std::make_shared<ov::intel_gpu::op::MOEFusedCompressed>(
            ov::OutputVector{hidden_states, routing_weights,
                wei_gate, scale_gate, zp_gate, wei_up, scale_up, zp_up, wei_down, scale_down, zp_down}, config);

        model_ref = std::make_shared<ov::Model>(moe_fused_compressed, ov::ParameterVector{hidden_states});
    }
}
138+
} // namespace intel_gpu
139+
} // namespace test
140+
} // namespace ov

0 commit comments

Comments
 (0)