src/plugins/intel_gpu/include/intel_gpu/op/moe_compressed.hpp (new file, 65 additions, 0 deletions)
@@ -0,0 +1,65 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"

namespace ov::intel_gpu::op {

/// \brief MOE operation with compressed (quantized) expert weights for the GEMM3_SWIGLU MOE pattern.
class MOECompressed : public ov::op::Op {
public:
    OPENVINO_OP("MOECompressed", "gpu_opset");

    MOECompressed() = default;

    struct Config {
        size_t hidden_size = 0;
        size_t inter_size = 0;
        size_t num_expert = 0;
        size_t top_k = 0;
        size_t group_size = 0;
        ov::element::Type out_type = ov::element::dynamic;  // output element type, typically f16
    };

    /// \brief Constructs a MOECompressed operation from its inputs and a config.
    /// \param args The input tensors, in the following order:
    ///        0: hidden_states - input tensor with hidden representations
    ///        1: routing_weights - [num_experts, ...] normalized weights for the selected experts
    ///           (input to the final multiplication)
    ///        2: router_topk_output_indices - [..., topk] indices of the selected top-k experts
    ///        3: w0_weight - expert weights for the first projection,
    ///           shape [num_experts, inter_size, group_num, group_size]
    ///        4: w0_scale - expert scales for the first projection of compressed experts,
    ///           shape [num_experts, inter_size, group_num, 1]
    ///        5: w0_zp - expert zero-points for the first projection of compressed experts,
    ///           shape [num_experts, inter_size, group_num, 1]
    ///        6: w1_weight - expert weights for the second projection,
    ///           shape [num_experts, inter_size, group_num, group_size]
    ///        7: w1_scale - expert scales for the second projection of compressed experts,
    ///           shape [num_experts, inter_size, group_num, 1]
    ///        8: w1_zp - expert zero-points for the second projection of compressed experts,
    ///           shape [num_experts, inter_size, group_num, 1]
    ///        9: w2_weight - expert weights for the final projection,
    ///           shape [num_experts, hidden_size, group_num, group_size]
    ///        10: w2_scale - expert scales for the final projection of compressed experts,
    ///            shape [num_experts, hidden_size, group_num, 1]
    ///        11: w2_zp - expert zero-points for the final projection of compressed experts,
    ///            shape [num_experts, hidden_size, group_num, 1]
    /// \param config Configuration for the MOE operation
    MOECompressed(const OutputVector& args, const Config& config);

    const Config& get_config() const;
    void set_config(const Config& config);

    bool visit_attributes(AttributeVisitor& visitor) override;
    void validate_and_infer_types() override;
    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;

private:
    Config m_config;
};

} // namespace ov::intel_gpu::op
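
For context, here is a minimal sketch (not part of this diff) of how a transformation pass might construct this op. The Config values, element types, and shapes below are illustrative assumptions; only the 12-input order and the Config fields come from the header above.

#include <memory>

#include "intel_gpu/op/moe_compressed.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/parameter.hpp"

std::shared_ptr<ov::Node> build_moe_compressed_example() {
    using ov::intel_gpu::op::MOECompressed;

    // Illustrative configuration; these values are assumptions, not from the PR.
    MOECompressed::Config cfg;
    cfg.hidden_size = 2048;
    cfg.inter_size = 1024;
    cfg.num_expert = 8;
    cfg.top_k = 2;
    cfg.group_size = 128;
    cfg.out_type = ov::element::f16;

    // group_num for w0/w1 (grouped over hidden_size) and for w2 (grouped over inter_size).
    const size_t g01 = cfg.hidden_size / cfg.group_size;
    const size_t g2 = cfg.inter_size / cfg.group_size;

    // Activations and routing results; dynamic shapes keep the sketch generic.
    auto hidden = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape::dynamic());
    auto routing = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape::dynamic());
    auto indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape::dynamic());

    // Placeholder compressed weights: u4 data plus f16 per-group scales/zero-points,
    // shaped as documented in the constructor comment above (single value broadcast).
    auto qweight = [&](size_t rows, size_t groups) {
        return ov::op::v0::Constant::create(ov::element::u4,
                                            ov::Shape{cfg.num_expert, rows, groups, cfg.group_size}, {0});
    };
    auto qparam = [&](size_t rows, size_t groups) {
        return ov::op::v0::Constant::create(ov::element::f16,
                                            ov::Shape{cfg.num_expert, rows, groups, 1}, {0.0f});
    };

    ov::OutputVector args{hidden, routing, indices,
                          qweight(cfg.inter_size, g01), qparam(cfg.inter_size, g01), qparam(cfg.inter_size, g01),
                          qweight(cfg.inter_size, g01), qparam(cfg.inter_size, g01), qparam(cfg.inter_size, g01),
                          qweight(cfg.hidden_size, g2), qparam(cfg.hidden_size, g2), qparam(cfg.hidden_size, g2)};

    return std::make_shared<MOECompressed>(args, cfg);
}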
@@ -311,3 +311,4 @@ REGISTER_FACTORY(internal, PagedAttentionExtension);
REGISTER_FACTORY(internal, LoraSubgraph);
REGISTER_FACTORY(internal, LoraSubgraphFused);
REGISTER_FACTORY(internal, VLSDPA);
REGISTER_FACTORY(internal, MOECompressed);
@@ -0,0 +1,52 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#include <cstring>
#include <vector>

#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/op/moe_compressed.hpp"
#include "primitive.hpp"

namespace cldnn {
using MOECompressed = ov::intel_gpu::op::MOECompressed;

/// @brief MOE primitive with compressed expert weights.
/// @details Performs the MOECompressed operation (expert routing plus compressed-weight projections) on the GPU.
struct moe_compressed : public primitive_base<moe_compressed> {
    CLDNN_DECLARE_PRIMITIVE(moe_compressed)

    moe_compressed() : primitive_base("", {}) {}

    /// @brief Constructs moe_compressed primitive.
    ///
    /// @param id An identifier of the new primitive.
    /// @param inputs A list of input primitive ids.
    /// @param config MOECompressed configuration (sizes, expert count, top-k, group size, output type).
    moe_compressed(const primitive_id& id, const std::vector<input_info>& inputs, const MOECompressed::Config& config)
        : primitive_base(id, inputs, 1, {optional_data_type()}),
          _config(config) {}

    MOECompressed::Config _config;

    bool operator==(const primitive& rhs) const override {
        if (!compare_common_params(rhs))
            return false;

        auto rhs_casted = downcast<const moe_compressed>(rhs);

        return std::memcmp(&_config, &rhs_casted._config, sizeof(_config)) == 0;
    }

    void save(BinaryOutputBuffer& ob) const override {
        primitive_base<moe_compressed>::save(ob);
        ob << make_data(&_config, sizeof(_config));
    }

    void load(BinaryInputBuffer& ib) override {
        primitive_base<moe_compressed>::load(ib);
        ib >> make_data(&_config, sizeof(_config));
    }
};

} // namespace cldnn
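
And a companion sketch (again assumed, not part of this diff) of how the plugin's op-translation step might instantiate this primitive. The input primitive ids and the include path of the new header are hypothetical; the 12-input order follows the op's constructor documentation.

#include <vector>

#include "intel_gpu/graph/topology.hpp"
#include "intel_gpu/primitives/moe_compressed.hpp"  // assumed location of the header above

// Hypothetical translation step: wire 12 inputs (hidden states, routing weights,
// top-k indices, then weight/scale/zp for each of the three projections) into a
// moe_compressed node and add it to the topology.
void add_moe_compressed_example(cldnn::topology& topology, const cldnn::MOECompressed::Config& config) {
    std::vector<cldnn::input_info> inputs = {
        cldnn::input_info("hidden_states"),
        cldnn::input_info("routing_weights"),
        cldnn::input_info("topk_indices"),
        cldnn::input_info("w0_weight"), cldnn::input_info("w0_scale"), cldnn::input_info("w0_zp"),
        cldnn::input_info("w1_weight"), cldnn::input_info("w1_scale"), cldnn::input_info("w1_zp"),
        cldnn::input_info("w2_weight"), cldnn::input_info("w2_scale"), cldnn::input_info("w2_zp"),
    };
    topology.add(cldnn::moe_compressed("moe", inputs, config));
}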