Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
0cc7848
First working subgraph test
maxnick Sep 26, 2025
f15e27e
Add din shapes
maxnick Sep 26, 2025
53a406c
Add batch gather matmul internal op
maxnick Oct 10, 2025
abbc740
Use the new transformation name
maxnick Oct 10, 2025
875e60c
Make shapes dynamic and the pattern closer to gpt-oss
maxnick Oct 13, 2025
3ead725
Fix the batch gather matmul shapes check
maxnick Oct 14, 2025
9a3028a
Yet another shape inference fix
maxnick Oct 14, 2025
9628560
Fixed the matmul shape inference reuse
maxnick Oct 14, 2025
ad97d62
Introduce GatherMatmul node
maxnick Oct 14, 2025
86e3126
[CPU] Introduced ConvertMoEMatMuls transformation
v-Golubev Oct 14, 2025
34c5c50
ConvertBatchGatherMatmulToBatchGatherMatmulCompressed
v-Golubev Oct 15, 2025
1a9fd5b
Gather Matmul initial impl
maxnick Oct 16, 2025
a88baea
1. Skip Slice in case of GPT-oss case
v-Golubev Oct 16, 2025
a7148cc
Floating point calculation working state
maxnick Oct 17, 2025
ebf6605
Make GPT-OSS test working
maxnick Oct 17, 2025
55b5e98
Use keep_dims in the top_k normalization
maxnick Oct 17, 2025
db4eb6d
Make fp32 tests working
maxnick Oct 17, 2025
f7d654e
[TESTS] MoECompressedWeightsSubgraphTest class
v-Golubev Oct 16, 2025
461df5a
Apply transformations ToDo
maxnick Oct 17, 2025
a6d5711
Adjust bf16 test threshold
maxnick Oct 20, 2025
e0916c8
Fix decompression support check
maxnick Oct 20, 2025
4f5b60e
MoECompressedWeightsSubgraphTest::check_results fix
v-Golubev Oct 20, 2025
0206e94
GatherMatmulShapeInfer: fixed warning
v-Golubev Oct 20, 2025
50b378a
Move BatchGatherMatmulCompressed conversion to cpu specific transform…
v-Golubev Oct 20, 2025
1bc3b14
BatchGatherMatmulCompressed fixes
v-Golubev Oct 20, 2025
f94425e
Added validation pass after BatchGatherMatmul conversion
v-Golubev Oct 20, 2025
024facb
format+tidy
v-Golubev Oct 21, 2025
3d0c58f
Fix Unsqueeze axis handling in ConvertMoEMatMuls
v-Golubev Oct 21, 2025
37ea942
Fix scales and zp processing
maxnick Oct 21, 2025
d1e35d5
Add 3d FC related reorders for 4bit data types
maxnick Oct 22, 2025
d458b87
Increase bf16 tolerance in tests
maxnick Oct 23, 2025
b932f93
Merge remote-tracking branch 'origin/master' into cpu_moe_op_support
maxnick Oct 23, 2025
cfa6cf6
Process by expert keeping weights in cache
maxnick Oct 23, 2025
7f17828
Enable weights decompression graph checks
maxnick Oct 23, 2025
8ac2b2b
Clang tidy fix
maxnick Oct 23, 2025
9a2d889
Fix clang format
maxnick Oct 23, 2025
4f920ef
clang format fix
maxnick Oct 23, 2025
0f6fc7a
Added check_results for MoESubgraphTest
v-Golubev Oct 23, 2025
9d2a8b7
Refactored test builders to exactly match the real models
v-Golubev Oct 23, 2025
92532f6
ConvertMoEMatMuls fix
v-Golubev Oct 23, 2025
1ef951c
style & tidy fixes
v-Golubev Oct 23, 2025
f5b75dc
Prepare code to optimize prefill via gemm
maxnick Oct 24, 2025
2671609
Use GEMM on prefill
maxnick Oct 27, 2025
706bf0c
Optimize gather GEMM keeping data hot in cache
maxnick Oct 27, 2025
04caa44
Enable bf16 decompression tests
maxnick Oct 27, 2025
6b2a7ea
Add GatherMatmul to bf16 markup
maxnick Oct 28, 2025
cf5404b
Split prefill strategies for amx and avx
maxnick Oct 28, 2025
9e848a0
BF16 support filtering in tests
maxnick Oct 28, 2025
0824a49
Fix pure bf16 use case
maxnick Oct 28, 2025
65bd473
Fix CC build
maxnick Oct 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,14 @@

#pragma once

#include <memory>
#include <tuple>
#include <vector>

#include "openvino/pass/matcher_pass.hpp"
#include "openvino/pass/pattern/matcher.hpp"
#include "ov_ops/fully_connected.hpp"
#include "transformations/pattern_blocks/compressed_weights_block.hpp"
#include "transformations_visibility.hpp"

namespace ov {
Expand All @@ -27,4 +33,26 @@ class ov::pass::ConvertFullyConnectedToFullyConnectedCompressed : public ov::pas
const std::vector<ov::element::Type>& supported_weights_types,
SupportsPredicate supports_config = nullptr,
bool convert_u4zp_to_u8 = false);

/**
* @brief Processes compressed weights from a pattern block and prepares them for compressed operations.
*
* @param weights_block The CompressedWeightsBlock pattern containing the weight compression graph
* @param pattern_map The pattern value map from the matcher containing matched nodes
* @param convert_u4zp_to_u8 Flag indicating whether to convert u4 zero points to u8
* @param has_transpose Flag indicating whether the weights require transpose operation
* @param grouped Flag indicating whether the compression uses grouped quantization
* @param batched_weights Flag indicating whether the weights have a batch dimension
* @param result_nodes Output vector to collect intermediate nodes created during processing
*
* @return A tuple containing processed compressed weights, decompression scales, and decompression zero points.
*/
static std::tuple<std::shared_ptr<ov::Node>, std::shared_ptr<ov::Node>, std::shared_ptr<ov::Node>>
process_compressed_weights(const std::shared_ptr<ov::pass::pattern::op::CompressedWeightsBlock>& weights_block,
const ov::pass::pattern::PatternValueMap& pattern_map,
bool convert_u4zp_to_u8,
bool has_transpose,
bool grouped,
bool batched_weights,
std::vector<std::shared_ptr<ov::Node>>& result_nodes);
};
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <cstddef>
#include <set>
#include <vector>

#include "openvino/pass/matcher_pass.hpp"
#include "openvino/pass/pattern/op/block.hpp"
#include "ov_ops/fully_connected.hpp"
#include "transformations_visibility.hpp"

namespace ov::pass::pattern::op {

class TRANSFORMATIONS_API CompressedWeightsBlock;

} // namespace ov::pass::pattern::op

/**
 * @brief Pattern block matching a compressed-weights decompression subgraph:
 *        Constant(weights) -> Convert [-> Subtract(zero point)] -> Multiply(scale)
 *        with optional trailing Reshape / Transpose / Convert stages.
 *
 * @param supported_weights_types Element types the weights Constant is allowed to have
 * @param supported_weights_ranks Ranks the (post-reshape) weights output is allowed to have
 */
class ov::pass::pattern::op::CompressedWeightsBlock : public ov::pass::pattern::op::Block {
public:
    CompressedWeightsBlock(const std::vector<ov::element::Type>& supported_weights_types,
                           const std::set<size_t>& supported_weights_ranks);
};

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "transformations/pattern_blocks/compressed_weights_block.hpp"

#include <algorithm>
#include <memory>

#include "openvino/core/graph_util.hpp"
#include "openvino/core/rt_info.hpp"
#include "openvino/core/type/element_type.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/op/subtract.hpp"
#include "openvino/op/transpose.hpp"
#include "openvino/pass/pattern/op/optional.hpp"
#include "openvino/pass/pattern/op/or.hpp"
#include "openvino/pass/pattern/op/pattern.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "ov_ops/fully_connected.hpp"
#include "ov_ops/fully_connected_compressed.hpp"
#include "transformations/utils/utils.hpp"

using namespace ov::pass::pattern;
// Builds the weights-decompression pattern:
//   Constant(weights) -> Convert [-> Subtract(zp)] -> Multiply(scale) [-> Reshape] [-> Transpose] [-> Convert]
// The Subtract (zero point) branch, the Reshape (grouped quantization), the Transpose and the
// final Convert are all optional. Named anchors are registered so that transformations consuming
// this block can retrieve the matched sub-nodes by name.
ov::pass::pattern::op::CompressedWeightsBlock::CompressedWeightsBlock(
    const std::vector<ov::element::Type>& supported_weights_types,
    const std::set<size_t>& supported_weights_ranks)
    : Block({}, {}, "CompressedWeightsBlock") {
    auto weights = wrap_type<ov::op::v0::Constant>(type_matches_any(supported_weights_types));
    auto convert = wrap_type<ov::op::v0::Convert>({weights});

    // Optional zero-point subtraction; the zero-point constant may itself be wrapped in a Convert.
    auto sub_const = wrap_type<ov::op::v0::Constant>();
    auto sub_convert_const = wrap_type<ov::op::v0::Convert>({sub_const});
    auto sub_with_convert = wrap_type<ov::op::v1::Subtract>({convert, sub_convert_const});
    auto sub_no_convert = wrap_type<ov::op::v1::Subtract>({convert, sub_const});
    auto subtract = sub_with_convert | sub_no_convert;

    // Decompression scale multiply; the scale constant may be wrapped in a Convert as well.
    auto mul_const = wrap_type<ov::op::v0::Constant>();
    auto mul_convert_const = wrap_type<ov::op::v0::Convert>({mul_const});
    auto mul_scale = mul_const | mul_convert_const;

    auto mul_with_sub = wrap_type<ov::op::v1::Multiply>({subtract, mul_scale});
    auto mul_no_sub = wrap_type<ov::op::v1::Multiply>({convert, mul_scale});
    auto mul = mul_with_sub | mul_no_sub;

    // Grouped quantization keeps an extra (group) dimension which the Reshape collapses,
    // so the rank before the Reshape is the supported rank + 1. Precompute the shifted
    // rank set once here instead of rebuilding it on every predicate invocation.
    std::set<size_t> supported_weights_ranks_before_reshape;
    for (const auto r : supported_weights_ranks) {
        supported_weights_ranks_before_reshape.insert(r + 1);
    }
    // Both sets are captured by value: the predicate outlives this constructor.
    auto reshape_predicate = [supported_weights_ranks_before_reshape,
                              supported_weights_ranks](const ov::Output<ov::Node>& output) {
        const auto& in_ps = output.get_node()->get_input_partial_shape(0);
        const auto& out_ps = output.get_node()->get_output_partial_shape(0);
        return in_ps.rank().is_static() && out_ps.rank().is_static() &&
               supported_weights_ranks_before_reshape.count(in_ps.size()) &&
               supported_weights_ranks.count(out_ps.size());
    };
    auto reshape_const = wrap_type<ov::op::v0::Constant>();
    auto reshape = wrap_type<ov::op::v1::Reshape>({mul, reshape_const}, reshape_predicate);

    auto transpose_input = reshape | mul;
    auto transpose_const = wrap_type<ov::op::v0::Constant>();
    auto transpose = wrap_type<ov::op::v1::Transpose>({transpose_input, transpose_const});

    // Optional final Convert on top of any of the accepted tails.
    auto weights_input = optional<ov::op::v0::Convert>({reshape | transpose | mul});

    // Block initialization
    m_inputs = ov::OutputVector{weights};
    m_outputs = ov::OutputVector{weights_input};
    REGISTER_ANCHORS(this,
                     weights,
                     convert,
                     sub_const,
                     sub_with_convert,
                     sub_no_convert,
                     mul_const,
                     transpose,
                     transpose_const);
}
11 changes: 6 additions & 5 deletions src/core/include/openvino/pass/pattern/op/block.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
namespace {

// _MAKE_ANCHOR is an internal macro for REGISTER_ANCHORS that is not supposed to be used separately.
#define _MAKE_ANCHOR(x) block->register_anchor(#x, x);
#define _MAKE_ANCHOR(block, x) (block)->register_anchor(#x, x);

} // namespace

Expand All @@ -23,9 +23,9 @@ namespace ov::pass::pattern::op {
*
*/

#define REGISTER_ANCHORS(block, ...) \
do { \
FOR_EACH(_MAKE_ANCHOR, __VA_ARGS__) \
#define REGISTER_ANCHORS(block, ...) \
do { \
FOR_EACH(_MAKE_ANCHOR, block, __VA_ARGS__) \
} while (0)

/**
Expand Down Expand Up @@ -95,10 +95,11 @@ class OPENVINO_API Block : public Pattern {
return m_named_anchors;
}

private:
protected:
OutputVector m_inputs;
OutputVector m_outputs;

private:
std::map<std::string, Output<Node>> m_named_anchors;
};

Expand Down
67 changes: 40 additions & 27 deletions src/core/include/openvino/pass/pattern/op/block_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,36 +3,49 @@
namespace {

// FOR_EACH macros up to 16 arguments:
#define FOR_EACH_1(M, x1) M(x1)
#define FOR_EACH_2(M, x1, x2) M(x1) M(x2)
#define FOR_EACH_3(M, x1, x2, x3) M(x1) M(x2) M(x3)
#define FOR_EACH_4(M, x1, x2, x3, x4) M(x1) M(x2) M(x3) M(x4)
#define FOR_EACH_5(M, x1, x2, x3, x4, x5) M(x1) M(x2) M(x3) M(x4) M(x5)
#define FOR_EACH_6(M, x1, x2, x3, x4, x5, x6) M(x1) M(x2) M(x3) M(x4) M(x5) M(x6)
#define FOR_EACH_7(M, x1, x2, x3, x4, x5, x6, x7) M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7)
#define FOR_EACH_8(M, x1, x2, x3, x4, x5, x6, x7, x8) M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8)
#define FOR_EACH_9(M, x1, x2, x3, x4, x5, x6, x7, x8, x9) M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8) M(x9)
#define FOR_EACH_10(M, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) \
M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8) M(x9) M(x10)
#define FOR_EACH_11(M, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) \
M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8) M(x9) M(x10) M(x11)
#define FOR_EACH_12(M, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) \
M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8) M(x9) M(x10) M(x11) M(x12)
#define FOR_EACH_13(M, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) \
M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8) M(x9) M(x10) M(x11) M(x12) M(x13)
#define FOR_EACH_14(M, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) \
M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8) M(x9) M(x10) M(x11) M(x12) M(x13) M(x14)
#define FOR_EACH_15(M, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) \
M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8) M(x9) M(x10) M(x11) M(x12) M(x13) M(x14) M(x15)
#define FOR_EACH_16(M, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) \
M(x1) M(x2) M(x3) M(x4) M(x5) M(x6) M(x7) M(x8) M(x9) M(x10) M(x11) M(x12) M(x13) M(x14) M(x15) M(x16)
#define FOR_EACH_1(M, B, x1) M(B, x1)
#define FOR_EACH_2(M, B, x1, x2) M(B, x1) M(B, x2)
#define FOR_EACH_3(M, B, x1, x2, x3) M(B, x1) M(B, x2) M(B, x3)
#define FOR_EACH_4(M, B, x1, x2, x3, x4) M(B, x1) M(B, x2) M(B, x3) M(B, x4)
#define FOR_EACH_5(M, B, x1, x2, x3, x4, x5) M(B, x1) M(B, x2) M(B, x3) M(B, x4) M(B, x5)
#define FOR_EACH_6(M, B, x1, x2, x3, x4, x5, x6) M(B, x1) M(B, x2) M(B, x3) M(B, x4) M(B, x5) M(B, x6)
#define FOR_EACH_7(M, B, x1, x2, x3, x4, x5, x6, x7) M(B, x1) M(B, x2) M(B, x3) M(B, x4) M(B, x5) M(B, x6) M(B, x7)
#define FOR_EACH_8(M, B, x1, x2, x3, x4, x5, x6, x7, x8) \
M(B, x1) M(B, x2) M(B, x3) M(B, x4) M(B, x5) M(B, x6) M(B, x7) M(B, x8)
#define FOR_EACH_9(M, B, x1, x2, x3, x4, x5, x6, x7, x8, x9) \
M(B, x1) M(B, x2) M(B, x3) M(B, x4) M(B, x5) M(B, x6) M(B, x7) M(B, x8) M(B, x9)
#define FOR_EACH_10(M, B, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) \
M(B, x1) M(B, x2) M(B, x3) M(B, x4) M(B, x5) M(B, x6) M(B, x7) M(B, x8) M(B, x9) M(B, x10)
#define FOR_EACH_11(M, B, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) \
M(B, x1) M(B, x2) M(B, x3) M(B, x4) M(B, x5) M(B, x6) M(B, x7) M(B, x8) M(B, x9) M(B, x10) M(B, x11)
#define FOR_EACH_12(M, B, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) \
M(B, x1) M(B, x2) M(B, x3) M(B, x4) M(B, x5) M(B, x6) M(B, x7) M(B, x8) M(B, x9) M(B, x10) M(B, x11) M(B, x12)
#define FOR_EACH_13(M, B, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) \
M(B, x1) \
M(B, x2) M(B, x3) M(B, x4) M(B, x5) M(B, x6) M(B, x7) M(B, x8) M(B, x9) M(B, x10) M(B, x11) M(B, x12) M(B, x13)
#define FOR_EACH_14(M, B, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) \
M(B, x1) \
M(B, x2) \
M(B, x3) M(B, x4) M(B, x5) M(B, x6) M(B, x7) M(B, x8) M(B, x9) M(B, x10) M(B, x11) M(B, x12) M(B, x13) M(B, x14)
#define FOR_EACH_15(M, B, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) \
M(B, x1) \
M(B, x2) \
M(B, x3) \
M(B, x4) M(B, x5) M(B, x6) M(B, x7) M(B, x8) M(B, x9) M(B, x10) M(B, x11) M(B, x12) M(B, x13) M(B, x14) M(B, x15)
#define FOR_EACH_16(M, B, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) \
M(B, x1) \
M(B, x2) \
M(B, x3) \
M(B, x4) \
M(B, x5) M(B, x6) M(B, x7) M(B, x8) M(B, x9) M(B, x10) M(B, x11) M(B, x12) M(B, x13) M(B, x14) M(B, x15) M(B, x16)

Copy link

Copilot AI Oct 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The GET_MACRO now includes _0 parameter but this parameter is not actually used - it's only added to shift the argument positions. Consider adding a comment explaining this trick for future maintainers, as it's a non-obvious macro implementation detail.

Suggested change
// The _0 parameter in GET_MACRO is intentionally unused.
// It is included to shift the argument positions so that the correct FOR_EACH_N macro is selected
// based on the number of arguments passed to FOR_EACH. This is a common macro trick for variadic macros.

Copilot uses AI. Check for mistakes.
#define GET_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, NAME, ...) NAME
#define GET_MACRO(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, NAME, ...) NAME

#define EXPAND(x) x

#define FOR_EACH(M, ...) \
EXPAND(GET_MACRO(__VA_ARGS__, \
#define FOR_EACH(M, B, ...) \
EXPAND(GET_MACRO(_0, \
__VA_ARGS__, \
FOR_EACH_16, \
FOR_EACH_15, \
FOR_EACH_14, \
Expand All @@ -48,6 +61,6 @@ namespace {
FOR_EACH_4, \
FOR_EACH_3, \
FOR_EACH_2, \
FOR_EACH_1)(M, __VA_ARGS__))
FOR_EACH_1)(M, B, __VA_ARGS__))

} // namespace
5 changes: 4 additions & 1 deletion src/plugins/intel_cpu/src/cpu_types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,9 @@ static const TypeToNameMap& get_type_to_name_tbl() {
{"QKVProjection", Type::QKVProjection},
{"RMS", Type::RMS},
{"SearchSorted", Type::SearchSorted},
{"LoraSubgraph", Type::LoRA}};
{"LoraSubgraph", Type::LoRA},
{"BatchGatherMatmul", Type::GatherMatmul},
{"BatchGatherMatmulCompressed", Type::GatherMatmul}};
return type_to_name_tbl;
}

Expand Down Expand Up @@ -398,6 +400,7 @@ std::string NameFromType(const Type type) {
CASE(SearchSorted);
CASE(SegmentMax);
CASE(LoRA);
CASE(GatherMatmul);
CASE(Unknown);
}
#undef CASE
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_cpu/src/cpu_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ enum class Type : uint8_t {
RMS,
SearchSorted,
SegmentMax,
LoRA
LoRA,
GatherMatmul
};

enum class Algorithm : uint8_t {
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2009,6 +2009,7 @@ void Graph::EnforceInferencePrecision() {
Type::Interpolate, // super resolution nets
Type::PagedAttention, // page attention
Type::QKVProjection,
Type::GatherMatmul,
Type::LLMMLP)) {
continue; // stop at significant nodes
}
Expand Down
3 changes: 0 additions & 3 deletions src/plugins/intel_cpu/src/nodes/fullyconnected.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,6 @@ class FullyConnected : public Node {
this->attrs.weightsNonTransposed = weightsNonTransposed;
}

void fuseDecompressionMultiply(const MemoryCPtr& memory);
void fuseDecompressionSubtract(const MemoryCPtr& memory);

protected:
void toNumaNodeImpl(int numaID) override;

Expand Down
Loading
Loading