Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
0cc7848
First working subgraph test
maxnick Sep 26, 2025
f15e27e
Add din shapes
maxnick Sep 26, 2025
53a406c
Add batch gather matmul internal op
maxnick Oct 10, 2025
abbc740
Use the new transformation name
maxnick Oct 10, 2025
875e60c
Make shapes dynamic and the pattern closer to gpt-oss
maxnick Oct 13, 2025
3ead725
Fix the batch gather matmul shapes check
maxnick Oct 14, 2025
9a3028a
Yet another shape inference fix
maxnick Oct 14, 2025
9628560
Fixed the matmul shape inference reuse
maxnick Oct 14, 2025
ad97d62
Introduce GatherMatmul node
maxnick Oct 14, 2025
86e3126
[CPU] Introduced ConvertMoEMatMuls transformation
v-Golubev Oct 14, 2025
34c5c50
ConvertBatchGatherMatmulToBatchGatherMatmulCompressed
v-Golubev Oct 15, 2025
1a9fd5b
Gather Matmul initial impl
maxnick Oct 16, 2025
a88baea
1. Skip Slice in case of GPT-oss case
v-Golubev Oct 16, 2025
a7148cc
Floating point calculation working state
maxnick Oct 17, 2025
ebf6605
Make GPT-OSS test working
maxnick Oct 17, 2025
55b5e98
Use keep_dims in the top_k normalization
maxnick Oct 17, 2025
db4eb6d
Make fp32 tests working
maxnick Oct 17, 2025
f7d654e
[TESTS] MoECompressedWeightsSubgraphTest class
v-Golubev Oct 16, 2025
461df5a
Apply transformations ToDo
maxnick Oct 17, 2025
a6d5711
Adjust bf16 test threshold
maxnick Oct 20, 2025
e0916c8
Fix decompression support check
maxnick Oct 20, 2025
4f5b60e
MoECompressedWeightsSubgraphTest::check_results fix
v-Golubev Oct 20, 2025
0206e94
GatherMatmulShapeInfer: fixed warning
v-Golubev Oct 20, 2025
50b378a
Move BatchGatherMatmulCompressed conversion to cpu specific transform…
v-Golubev Oct 20, 2025
1bc3b14
BatchGatherMatmulCompressed fixes
v-Golubev Oct 20, 2025
f94425e
Added validation pass after BatchGatherMatmul conversion
v-Golubev Oct 20, 2025
024facb
format+tidy
v-Golubev Oct 21, 2025
3d0c58f
Fix Unsqueeze axis handling in ConvertMoEMatMuls
v-Golubev Oct 21, 2025
37ea942
Fix scales and zp processing
maxnick Oct 21, 2025
d1e35d5
Add 3d FC related reorders for 4bit data types
maxnick Oct 22, 2025
d458b87
Increase bf16 tolerance in tests
maxnick Oct 23, 2025
b932f93
Merge remote-tracking branch 'origin/master' into cpu_moe_op_support
maxnick Oct 23, 2025
cfa6cf6
Process by expert keeping weights in cache
maxnick Oct 23, 2025
7f17828
Enable weights decompression graph checks
maxnick Oct 23, 2025
8ac2b2b
Clang tidy fix
maxnick Oct 23, 2025
9a2d889
Fix clang format
maxnick Oct 23, 2025
4f920ef
clang format fix
maxnick Oct 23, 2025
0f6fc7a
Added check_results for MoESubgraphTest
v-Golubev Oct 23, 2025
9d2a8b7
Refactored test builders to exactly match the real models
v-Golubev Oct 23, 2025
92532f6
ConvertMoEMatMuls fix
v-Golubev Oct 23, 2025
1ef951c
style & tidy fixes
v-Golubev Oct 23, 2025
f5b75dc
Prepare code to optimize prefill via gemm
maxnick Oct 24, 2025
2671609
Use GEMM on prefill
maxnick Oct 27, 2025
706bf0c
Optimize gather GEMM keeping data hot in cache
maxnick Oct 27, 2025
04caa44
Enable bf16 decompression tests
maxnick Oct 27, 2025
6b2a7ea
Add GatherMatmul to bf16 markup
maxnick Oct 28, 2025
cf5404b
Split prefill strategies for amx and avx
maxnick Oct 28, 2025
9e848a0
BF16 support filtering in tests
maxnick Oct 28, 2025
0824a49
Fix pure bf16 use case
maxnick Oct 28, 2025
65bd473
Fix CC build
maxnick Oct 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,14 @@

#pragma once

#include <memory>
#include <tuple>
#include <vector>

#include "openvino/pass/matcher_pass.hpp"
#include "openvino/pass/pattern/matcher.hpp"
#include "ov_ops/fully_connected.hpp"
#include "transformations/pattern_blocks/compressed_weights_block.hpp"
#include "transformations_visibility.hpp"

namespace ov {
Expand All @@ -27,4 +33,26 @@ class ov::pass::ConvertFullyConnectedToFullyConnectedCompressed : public ov::pas
const std::vector<ov::element::Type>& supported_weights_types,
SupportsPredicate supports_config = nullptr,
bool convert_u4zp_to_u8 = false);

/**
* @brief Processes compressed weights from a pattern block and prepares them for compressed operations.
*
* @param weights_block The CompressedWeightsBlock pattern containing the weight compression graph
* @param pattern_map The pattern value map from the matcher containing matched nodes
* @param convert_u4zp_to_u8 Flag indicating whether to convert u4 zero points to u8
* @param has_transpose Flag indicating whether the weights require transpose operation
* @param grouped Flag indicating whether the compression uses grouped quantization
* @param batched_weights Flag indicating whether the weights have a batch dimension
* @param result_nodes Output vector to collect intermediate nodes created during processing
*
* @return A tuple containing processed compressed weights, decompression scales, and decompression zero points.
*/
static std::tuple<std::shared_ptr<ov::Node>, std::shared_ptr<ov::Node>, std::shared_ptr<ov::Node>>
process_compressed_weights(const std::shared_ptr<ov::pass::pattern::op::CompressedWeightsBlock>& weights_block,
const ov::pass::pattern::PatternValueMap& pattern_map,
bool convert_u4zp_to_u8,
bool has_transpose,
bool grouped,
bool batched_weights,
std::vector<std::shared_ptr<ov::Node>>& result_nodes);
};
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <cstddef>
#include <set>
#include <vector>

#include "openvino/pass/matcher_pass.hpp"
#include "openvino/pass/pattern/op/block.hpp"
#include "ov_ops/fully_connected.hpp"
#include "transformations_visibility.hpp"

namespace ov::pass::pattern::op {

class TRANSFORMATIONS_API CompressedWeightsBlock;

} // namespace ov::pass::pattern::op

/**
 * @brief Pattern block matching a compressed-weights decompression subgraph:
 *        Constant(weights) -> Convert [-> Subtract(zero point)] -> Multiply(scale)
 *        with optional trailing Reshape / Transpose / Convert stages.
 *
 * @param supported_weights_types Element types the weights Constant is allowed to have
 * @param supported_weights_ranks Ranks the (post-reshape) weights output is allowed to have
 */
class ov::pass::pattern::op::CompressedWeightsBlock : public ov::pass::pattern::op::Block {
public:
    CompressedWeightsBlock(const std::vector<ov::element::Type>& supported_weights_types,
                           const std::set<size_t>& supported_weights_ranks);
};

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "transformations/pattern_blocks/compressed_weights_block.hpp"

#include <algorithm>
#include <memory>

#include "openvino/core/graph_util.hpp"
#include "openvino/core/rt_info.hpp"
#include "openvino/core/type/element_type.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/op/subtract.hpp"
#include "openvino/op/transpose.hpp"
#include "openvino/pass/pattern/op/optional.hpp"
#include "openvino/pass/pattern/op/or.hpp"
#include "openvino/pass/pattern/op/pattern.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "ov_ops/fully_connected.hpp"
#include "ov_ops/fully_connected_compressed.hpp"
#include "transformations/utils/utils.hpp"

using namespace ov::pass::pattern;
// Builds the weights-decompression pattern:
//   Constant(weights) -> Convert [-> Subtract(zp)] -> Multiply(scale) [-> Reshape] [-> Transpose] [-> Convert]
// The Subtract (zero point) branch, the Reshape (grouped quantization), the Transpose and the
// final Convert are all optional. Named anchors are registered so that transformations consuming
// this block can retrieve the matched sub-nodes by name.
ov::pass::pattern::op::CompressedWeightsBlock::CompressedWeightsBlock(
    const std::vector<ov::element::Type>& supported_weights_types,
    const std::set<size_t>& supported_weights_ranks)
    : Block({}, {}, "CompressedWeightsBlock") {
    auto weights = wrap_type<ov::op::v0::Constant>(type_matches_any(supported_weights_types));
    auto convert = wrap_type<ov::op::v0::Convert>({weights});

    // Optional zero-point subtraction; the zero-point constant may itself be wrapped in a Convert.
    auto sub_const = wrap_type<ov::op::v0::Constant>();
    auto sub_convert_const = wrap_type<ov::op::v0::Convert>({sub_const});
    auto sub_with_convert = wrap_type<ov::op::v1::Subtract>({convert, sub_convert_const});
    auto sub_no_convert = wrap_type<ov::op::v1::Subtract>({convert, sub_const});
    auto subtract = sub_with_convert | sub_no_convert;

    // Decompression scale multiply; the scale constant may be wrapped in a Convert as well.
    auto mul_const = wrap_type<ov::op::v0::Constant>();
    auto mul_convert_const = wrap_type<ov::op::v0::Convert>({mul_const});
    auto mul_scale = mul_const | mul_convert_const;

    auto mul_with_sub = wrap_type<ov::op::v1::Multiply>({subtract, mul_scale});
    auto mul_no_sub = wrap_type<ov::op::v1::Multiply>({convert, mul_scale});
    auto mul = mul_with_sub | mul_no_sub;

    // Grouped quantization keeps an extra (group) dimension which the Reshape collapses,
    // so the rank before the Reshape is the supported rank + 1. Precompute the shifted
    // rank set once here instead of rebuilding it on every predicate invocation.
    std::set<size_t> supported_weights_ranks_before_reshape;
    for (const auto r : supported_weights_ranks) {
        supported_weights_ranks_before_reshape.insert(r + 1);
    }
    // Both sets are captured by value: the predicate outlives this constructor.
    auto reshape_predicate = [supported_weights_ranks_before_reshape,
                              supported_weights_ranks](const ov::Output<ov::Node>& output) {
        const auto& in_ps = output.get_node()->get_input_partial_shape(0);
        const auto& out_ps = output.get_node()->get_output_partial_shape(0);
        return in_ps.rank().is_static() && out_ps.rank().is_static() &&
               supported_weights_ranks_before_reshape.count(in_ps.size()) &&
               supported_weights_ranks.count(out_ps.size());
    };
    auto reshape_const = wrap_type<ov::op::v0::Constant>();
    auto reshape = wrap_type<ov::op::v1::Reshape>({mul, reshape_const}, reshape_predicate);

    auto transpose_input = reshape | mul;
    auto transpose_const = wrap_type<ov::op::v0::Constant>();
    auto transpose = wrap_type<ov::op::v1::Transpose>({transpose_input, transpose_const});

    // Optional final Convert on top of any of the accepted tails.
    auto weights_input = optional<ov::op::v0::Convert>({reshape | transpose | mul});

    // Block initialization
    m_inputs = ov::OutputVector{weights};
    m_outputs = ov::OutputVector{weights_input};
    REGISTER_ANCHORS(this,
                     weights,
                     convert,
                     sub_const,
                     sub_with_convert,
                     sub_no_convert,
                     mul_const,
                     transpose,
                     transpose_const);
}
11 changes: 6 additions & 5 deletions src/core/include/openvino/pass/pattern/op/block.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
namespace {

// _MAKE_ANCHOR is an internal macro for REGISTER_ANCHORS that is not supposed to be used separately.
#define _MAKE_ANCHOR(x) block->register_anchor(#x, x);
#define _MAKE_ANCHOR(block, x) (block)->register_anchor(#x, x);

} // namespace

Expand All @@ -23,9 +23,9 @@ namespace ov::pass::pattern::op {
*
*/

#define REGISTER_ANCHORS(block, ...) \
do { \
FOR_EACH(_MAKE_ANCHOR, __VA_ARGS__) \
#define REGISTER_ANCHORS(block, ...) \
do { \
FOR_EACH(_MAKE_ANCHOR, block, __VA_ARGS__) \
} while (0)

/**
Expand Down Expand Up @@ -95,10 +95,11 @@ class OPENVINO_API Block : public Pattern {
return m_named_anchors;
}

private:
protected:
OutputVector m_inputs;
OutputVector m_outputs;

private:
std::map<std::string, Output<Node>> m_named_anchors;
};

Expand Down
67 changes: 40 additions & 27 deletions src/core/include/openvino/pass/pattern/op/block_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,36 +3,49 @@
namespace {

// FOR_EACH macros up to 16 arguments:
#define FOR_EACH_1(M, x1) M(x1)
#define FOR_EACH_2(M, x1, x2) M(x1) M(x2)
#define FOR_EACH_3(M, x1, x2, x3) M(x1) M(x2) M(x3)
#define FOR_EACH_4(M, x1, x2, x3, x4) M(x1) M(x2) M(x3) M(x4)
#define FOR_EACH_5(M, x1, x2, x3, x4, x5) M(x1) M(x2) M(x3) M(x4) M(x5)
#define FOR_EACH_6(M, x1, x2, x3, x4, x5, x6) M(x1) M(x2) M(x3) M(x4) M(x5) M(x6)
#define FOR_EACH_7(M, x1, x2, x3, x4, x5, x6, x7) M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7)
#define FOR_EACH_8(M, x1, x2, x3, x4, x5, x6, x7, x8) M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8)
#define FOR_EACH_9(M, x1, x2, x3, x4, x5, x6, x7, x8, x9) M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8) M(x9)
#define FOR_EACH_10(M, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) \
M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8) M(x9) M(x10)
#define FOR_EACH_11(M, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) \
M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8) M(x9) M(x10) M(x11)
#define FOR_EACH_12(M, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) \
M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8) M(x9) M(x10) M(x11) M(x12)
#define FOR_EACH_13(M, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) \
M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8) M(x9) M(x10) M(x11) M(x12) M(x13)
#define FOR_EACH_14(M, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) \
M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8) M(x9) M(x10) M(x11) M(x12) M(x13) M(x14)
#define FOR_EACH_15(M, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) \
M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8) M(x9) M(x10) M(x11) M(x12) M(x13) M(x14) M(x15)
#define FOR_EACH_16(M, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) \
M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8) M(x9) M(x10) M(x11) M(x12) M(x13) M(x14) M(x15) M(x16)
#define FOR_EACH_1(M, B, x1) M(B, x1)
#define FOR_EACH_2(M, B, x1, x2) M(B, x1) M(B, x2)
#define FOR_EACH_3(M, B, x1, x2, x3) M(B, x1) M(B, x2) M(B, x3)
#define FOR_EACH_4(M, B, x1, x2, x3, x4) M(B, x1) M(B, x2) M(B, x3) M(B, x4)
#define FOR_EACH_5(M, B, x1, x2, x3, x4, x5) M(B, x1) M(B, x2) M(B, x3) M(B, x4) M(B, x5)
#define FOR_EACH_6(M, B, x1, x2, x3, x4, x5, x6) M(B, x1) M(B, x2) M(B, x3) M(B, x4) M(B, x5) M(B, x6)
#define FOR_EACH_7(M, B, x1, x2, x3, x4, x5, x6, x7) M(B, x1) M(B, x2) M(B, x3) M(B, x4) M(B, x5) M(B, x6) M(B, x7)
#define FOR_EACH_8(M, B, x1, x2, x3, x4, x5, x6, x7, x8) \
M(B, x1) M(B, x2) M(B, x3) M(B, x4) M(B, x5) M(B, x6) M(B, x7) M(B, x8)
#define FOR_EACH_9(M, B, x1, x2, x3, x4, x5, x6, x7, x8, x9) \
M(B, x1) M(B, x2) M(B, x3) M(B, x4) M(B, x5) M(B, x6) M(B, x7) M(B, x8) M(B, x9)
#define FOR_EACH_10(M, B, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) \
M(B, x1) M(B, x2) M(B, x3) M(B, x4) M(B, x5) M(B, x6) M(B, x7) M(B, x8) M(B, x9) M(B, x10)
#define FOR_EACH_11(M, B, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) \
M(B, x1) M(B, x2) M(B, x3) M(B, x4) M(B, x5) M(B, x6) M(B, x7) M(B, x8) M(B, x9) M(B, x10) M(B, x11)
#define FOR_EACH_12(M, B, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) \
M(B, x1) M(B, x2) M(B, x3) M(B, x4) M(B, x5) M(B, x6) M(B, x7) M(B, x8) M(B, x9) M(B, x10) M(B, x11) M(B, x12)
#define FOR_EACH_13(M, B, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) \
M(B, x1) \
M(B, x2) M(B, x3) M(B, x4) M(B, x5) M(B, x6) M(B, x7) M(B, x8) M(B, x9) M(B, x10) M(B, x11) M(B, x12) M(B, x13)
#define FOR_EACH_14(M, B, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) \
M(B, x1) \
M(B, x2) \
M(B, x3) M(B, x4) M(B, x5) M(B, x6) M(B, x7) M(B, x8) M(B, x9) M(B, x10) M(B, x11) M(B, x12) M(B, x13) M(B, x14)
#define FOR_EACH_15(M, B, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) \
M(B, x1) \
M(B, x2) \
M(B, x3) \
M(B, x4) M(B, x5) M(B, x6) M(B, x7) M(B, x8) M(B, x9) M(B, x10) M(B, x11) M(B, x12) M(B, x13) M(B, x14) M(B, x15)
#define FOR_EACH_16(M, B, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) \
M(B, x1) \
M(B, x2) \
M(B, x3) \
M(B, x4) \
M(B, x5) M(B, x6) M(B, x7) M(B, x8) M(B, x9) M(B, x10) M(B, x11) M(B, x12) M(B, x13) M(B, x14) M(B, x15) M(B, x16)

Copy link

Copilot AI Oct 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The GET_MACRO now includes _0 parameter but this parameter is not actually used - it's only added to shift the argument positions. Consider adding a comment explaining this trick for future maintainers, as it's a non-obvious macro implementation detail.

Suggested change
// The _0 parameter in GET_MACRO is intentionally unused.
// It is included to shift the argument positions so that the correct FOR_EACH_N macro is selected
// based on the number of arguments passed to FOR_EACH. This is a common macro trick for variadic macros.

Copilot uses AI. Check for mistakes.
#define GET_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, NAME, ...) NAME
#define GET_MACRO(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, NAME, ...) NAME

#define EXPAND(x) x

#define FOR_EACH(M, ...) \
EXPAND(GET_MACRO(__VA_ARGS__, \
#define FOR_EACH(M, B, ...) \
EXPAND(GET_MACRO(_0, \
__VA_ARGS__, \
FOR_EACH_16, \
FOR_EACH_15, \
FOR_EACH_14, \
Expand All @@ -48,6 +61,6 @@ namespace {
FOR_EACH_4, \
FOR_EACH_3, \
FOR_EACH_2, \
FOR_EACH_1)(M, __VA_ARGS__))
FOR_EACH_1)(M, B, __VA_ARGS__))

} // namespace
5 changes: 4 additions & 1 deletion src/plugins/intel_cpu/src/cpu_types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,9 @@ static const TypeToNameMap& get_type_to_name_tbl() {
{"QKVProjection", Type::QKVProjection},
{"RMS", Type::RMS},
{"SearchSorted", Type::SearchSorted},
{"LoraSubgraph", Type::LoRA}};
{"LoraSubgraph", Type::LoRA},
{"BatchGatherMatmul", Type::GatherMatmul},
{"BatchGatherMatmulCompressed", Type::GatherMatmul}};
return type_to_name_tbl;
}

Expand Down Expand Up @@ -398,6 +400,7 @@ std::string NameFromType(const Type type) {
CASE(SearchSorted);
CASE(SegmentMax);
CASE(LoRA);
CASE(GatherMatmul);
CASE(Unknown);
}
#undef CASE
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_cpu/src/cpu_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ enum class Type : uint8_t {
RMS,
SearchSorted,
SegmentMax,
LoRA
LoRA,
GatherMatmul
};

enum class Algorithm : uint8_t {
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2009,6 +2009,7 @@ void Graph::EnforceInferencePrecision() {
Type::Interpolate, // super resolution nets
Type::PagedAttention, // page attention
Type::QKVProjection,
Type::GatherMatmul,
Type::LLMMLP)) {
continue; // stop at significant nodes
}
Expand Down
3 changes: 0 additions & 3 deletions src/plugins/intel_cpu/src/nodes/fullyconnected.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,6 @@ class FullyConnected : public Node {
this->attrs.weightsNonTransposed = weightsNonTransposed;
}

void fuseDecompressionMultiply(const MemoryCPtr& memory);
void fuseDecompressionSubtract(const MemoryCPtr& memory);

protected:
void toNumaNodeImpl(int numaID) override;

Expand Down
Loading
Loading