src/plugins/intel_gpu/include/intel_gpu/op/moe_compressed.hpp (new file, 65 additions, 0 deletions)
@@ -0,0 +1,65 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"

namespace ov::intel_gpu::op {

/// \brief MOE operation with compressed (quantized) expert weights for the GEMM3_SWIGLU MOE pattern.
class MOECompressed : public ov::op::Op {
public:
    OPENVINO_OP("MOECompressed", "gpu_opset");

    MOECompressed() = default;

    struct Config {
        size_t hidden_size = 0;
        size_t inter_size = 0;
        size_t num_expert = 0;
        size_t top_k = 0;
        size_t group_size = 0;
        ov::element::Type out_type = ov::element::dynamic;  // output element type, typically f16
    };

    /// \brief Constructs a MOECompressed operation from its inputs and a config.
    /// \param args The input tensors, in the following order:
    ///        0: hidden_states - input tensor with hidden representations
    ///        1: routing_weights - [num_experts, ...] normalized weights for the selected experts
    ///           (input to the final multiplication)
    ///        2: router_topk_output_indices - [..., topk] indices of the selected top-k experts
    ///        3: w0_weight - expert weights for the first projection,
    ///           shape [num_experts, inter_size, group_num, group_size]
    ///        4: w0_scale - expert scales for the first projection of compressed experts,
    ///           shape [num_experts, inter_size, group_num, 1]
    ///        5: w0_zp - expert zero-points for the first projection of compressed experts,
    ///           shape [num_experts, inter_size, group_num, 1]
    ///        6: w1_weight - expert weights for the second projection,
    ///           shape [num_experts, inter_size, group_num, group_size]
    ///        7: w1_scale - expert scales for the second projection of compressed experts,
    ///           shape [num_experts, inter_size, group_num, 1]
    ///        8: w1_zp - expert zero-points for the second projection of compressed experts,
    ///           shape [num_experts, inter_size, group_num, 1]
    ///        9: w2_weight - expert weights for the final projection,
    ///           shape [num_experts, hidden_size, group_num, group_size]
    ///        10: w2_scale - expert scales for the final projection of compressed experts,
    ///            shape [num_experts, hidden_size, group_num, 1]
    ///        11: w2_zp - expert zero-points for the final projection of compressed experts,
    ///            shape [num_experts, hidden_size, group_num, 1]
    /// \param config Configuration for the MOE operation
    MOECompressed(const OutputVector& args, const Config& config);

    const Config& get_config() const;
    void set_config(const Config& config);

    bool visit_attributes(AttributeVisitor& visitor) override;
    void validate_and_infer_types() override;
    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;

private:
    Config m_config;
};

} // namespace ov::intel_gpu::op
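
For context, here is a minimal sketch (not part of this diff) of how a transformation pass might construct this op. The Config values, element types, and shapes below are illustrative assumptions; only the 12-input order and the Config fields come from the header above.

#include <memory>

#include "intel_gpu/op/moe_compressed.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/parameter.hpp"

std::shared_ptr<ov::Node> build_moe_compressed_example() {
    using ov::intel_gpu::op::MOECompressed;

    // Illustrative configuration; these values are assumptions, not from the PR.
    MOECompressed::Config cfg;
    cfg.hidden_size = 2048;
    cfg.inter_size = 1024;
    cfg.num_expert = 8;
    cfg.top_k = 2;
    cfg.group_size = 128;
    cfg.out_type = ov::element::f16;

    // group_num for w0/w1 (grouped over hidden_size) and for w2 (grouped over inter_size).
    const size_t g01 = cfg.hidden_size / cfg.group_size;
    const size_t g2 = cfg.inter_size / cfg.group_size;

    // Activations and routing results; dynamic shapes keep the sketch generic.
    auto hidden = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape::dynamic());
    auto routing = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape::dynamic());
    auto indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape::dynamic());

    // Placeholder compressed weights: u4 data plus f16 per-group scales/zero-points,
    // shaped as documented in the constructor comment above (single value broadcast).
    auto qweight = [&](size_t rows, size_t groups) {
        return ov::op::v0::Constant::create(ov::element::u4,
                                            ov::Shape{cfg.num_expert, rows, groups, cfg.group_size}, {0});
    };
    auto qparam = [&](size_t rows, size_t groups) {
        return ov::op::v0::Constant::create(ov::element::f16,
                                            ov::Shape{cfg.num_expert, rows, groups, 1}, {0.0f});
    };

    ov::OutputVector args{hidden, routing, indices,
                          qweight(cfg.inter_size, g01), qparam(cfg.inter_size, g01), qparam(cfg.inter_size, g01),
                          qweight(cfg.inter_size, g01), qparam(cfg.inter_size, g01), qparam(cfg.inter_size, g01),
                          qweight(cfg.hidden_size, g2), qparam(cfg.hidden_size, g2), qparam(cfg.hidden_size, g2)};

    return std::make_shared<MOECompressed>(args, cfg);
}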
@@ -311,3 +311,4 @@ REGISTER_FACTORY(internal, PagedAttentionExtension);
REGISTER_FACTORY(internal, LoraSubgraph);
REGISTER_FACTORY(internal, LoraSubgraphFused);
REGISTER_FACTORY(internal, VLSDPA);
REGISTER_FACTORY(internal, MOECompressed);
@@ -0,0 +1,52 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#include <cstring>
#include <vector>

#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/op/moe_compressed.hpp"
#include "primitive.hpp"

namespace cldnn {
using MOECompressed = ov::intel_gpu::op::MOECompressed;

/// @brief MOE primitive with compressed expert weights.
/// @details Performs the MOECompressed operation (expert routing plus compressed-weight projections) on the GPU.
struct moe_compressed : public primitive_base<moe_compressed> {
    CLDNN_DECLARE_PRIMITIVE(moe_compressed)

    moe_compressed() : primitive_base("", {}) {}

    /// @brief Constructs moe_compressed primitive.
    ///
    /// @param id An identifier of the new primitive.
    /// @param inputs A list of input primitive ids.
    /// @param config MOECompressed configuration (sizes, expert count, top-k, group size, output type).
    moe_compressed(const primitive_id& id, const std::vector<input_info>& inputs, const MOECompressed::Config& config)
        : primitive_base(id, inputs, 1, {optional_data_type()}),
          _config(config) {}

    MOECompressed::Config _config;

    bool operator==(const primitive& rhs) const override {
        if (!compare_common_params(rhs))
            return false;

        auto rhs_casted = downcast<const moe_compressed>(rhs);

        return std::memcmp(&_config, &rhs_casted._config, sizeof(_config)) == 0;
    }

    void save(BinaryOutputBuffer& ob) const override {
        primitive_base<moe_compressed>::save(ob);
        ob << make_data(&_config, sizeof(_config));
    }

    void load(BinaryInputBuffer& ib) override {
        primitive_base<moe_compressed>::load(ib);
        ib >> make_data(&_config, sizeof(_config));
    }
};

} // namespace cldnn
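
And a companion sketch (again assumed, not part of this diff) of how the plugin's op-translation step might instantiate this primitive. The input primitive ids and the include path of the new header are hypothetical; the 12-input order follows the op's constructor documentation.

#include <vector>

#include "intel_gpu/graph/topology.hpp"
#include "intel_gpu/primitives/moe_compressed.hpp"  // assumed location of the header above

// Hypothetical translation step: wire 12 inputs (hidden states, routing weights,
// top-k indices, then weight/scale/zp for each of the three projections) into a
// moe_compressed node and add it to the topology.
void add_moe_compressed_example(cldnn::topology& topology, const cldnn::MOECompressed::Config& config) {
    std::vector<cldnn::input_info> inputs = {
        cldnn::input_info("hidden_states"),
        cldnn::input_info("routing_weights"),
        cldnn::input_info("topk_indices"),
        cldnn::input_info("w0_weight"), cldnn::input_info("w0_scale"), cldnn::input_info("w0_zp"),
        cldnn::input_info("w1_weight"), cldnn::input_info("w1_scale"), cldnn::input_info("w1_zp"),
        cldnn::input_info("w2_weight"), cldnn::input_info("w2_scale"), cldnn::input_info("w2_zp"),
    };
    topology.add(cldnn::moe_compressed("moe", inputs, config));
}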