Changes from 40 commits
Commits
55 commits
f56435a
eagle impl with top-1 proposal
songbell Sep 17, 2025
1d74c03
enable cb benchmark for eagle3
songbell Sep 17, 2025
0b09a65
add benchmarking, apply copilot review comments
songbell Sep 18, 2025
b36ecf7
fix case build failure
songbell Sep 18, 2025
d86e5a7
fix SDL
songbell Sep 18, 2025
f0aa2c7
typo
songbell Sep 18, 2025
bbfa8ad
opt hidden state transfer with ROI tensor
songbell Sep 18, 2025
35e6376
opt roi copy interface
songbell Sep 19, 2025
f6a8872
Merge branch 'master' into bell/eagle_cb_impl
songbell Sep 19, 2025
8a480a4
parse eagle info from draft model
songbell Sep 23, 2025
cd753f6
do not need seperate eagle sample
songbell Sep 23, 2025
6aa4965
opt constructor for eagledecodingimpl
songbell Sep 23, 2025
5efbada
remove hardcoding of eagle layers
songbell Sep 23, 2025
96276fe
apply copilot comment
songbell Sep 30, 2025
146f5c3
Merge branch 'master' of https://github.com/openvinotoolkit/openvino.…
songbell Sep 30, 2025
300996d
Merge branch 'master' into bell/eagle_cb_impl
songbell Oct 10, 2025
6c3876e
share weights, rt info update
songbell Oct 11, 2025
1af6771
Merge branch 'bell/eagle_cb_impl' of https://github.com/songbell/open…
songbell Oct 11, 2025
f33491f
reuse spec app for eagle
songbell Oct 11, 2025
264540f
fix build warning
songbell Oct 11, 2025
cfe77f8
enable test
songbell Oct 13, 2025
2f5d080
skip eagle3 test for now
songbell Oct 14, 2025
70cd1c1
Merge branch 'master' of https://github.com/openvinotoolkit/openvino.…
songbell Oct 15, 2025
ff9b50e
align default num assistant tokens
songbell Oct 15, 2025
723e3f8
udpate test ticket
songbell Oct 15, 2025
8b72711
apply review comment
songbell Oct 16, 2025
b31411a
move eagle3 tests to seperate file
songbell Oct 16, 2025
3a3ad17
refine hs state management
songbell Oct 16, 2025
b333045
apply copilot comment
songbell Oct 17, 2025
7c9cd2b
Merge branch 'master' of https://github.com/openvinotoolkit/openvino.…
songbell Oct 17, 2025
62320a4
fix build failure
songbell Oct 17, 2025
84b589e
fallback unchanged file
songbell Oct 17, 2025
766d72f
Merge branch 'master' into bell/eagle_cb_impl
songbell Oct 20, 2025
baa890b
Update site/docs/supported-models/_components/llm-models-table/models.ts
songbell Oct 20, 2025
c5e474b
apply copilot
songbell Oct 20, 2025
21b4108
Merge branch 'bell/eagle_cb_impl' of https://github.com/songbell/open…
songbell Oct 20, 2025
75ea28e
fix typo for samples
songbell Oct 20, 2025
fdeb3f0
apply review comments part-1, mainly on code
songbell Oct 21, 2025
5fd9b71
Merge branch 'bell/eagle_cb_impl' of https://github.com/songbell/open…
songbell Oct 21, 2025
0931c09
reuse common codes between spec decode and eagle3 decode
songbell Oct 21, 2025
3c97718
add missing default
songbell Oct 21, 2025
5f16e65
enable test interface first
songbell Oct 21, 2025
bb8a39d
apply review comments
songbell Oct 22, 2025
428c25e
enable test for local val first
songbell Oct 22, 2025
efd854b
use warning logger instead of cout
songbell Oct 22, 2025
a9f68db
update per model d2t changes
songbell Oct 23, 2025
15f6a4b
Merge branch 'master' of https://github.com/openvinotoolkit/openvino.…
songbell Oct 23, 2025
3bcdfc3
add missing line for unchanged file
songbell Oct 23, 2025
c4d1b6c
move test
songbell Oct 23, 2025
fc2d11f
try trigger test
songbell Oct 24, 2025
6397b63
Merge branch 'master' of https://github.com/openvinotoolkit/openvino.…
songbell Oct 27, 2025
1903c31
upgrade version
songbell Oct 27, 2025
3e5ec43
apply review comments
songbell Oct 29, 2025
433d8b3
revert the tokenizer params
songbell Oct 30, 2025
27a6b03
Merge branch 'master' of https://github.com/openvinotoolkit/openvino.…
songbell Oct 31, 2025
2 changes: 1 addition & 1 deletion .github/workflows/linux.yml
@@ -522,7 +522,7 @@ jobs:
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).continuous_batching.test }}
timeout: 240
- name: 'LLM & VLM'
cmd: 'python -m pytest -v ./tests/python_tests/test_llm_pipeline.py tests/python_tests/test_llm_pipeline_static.py ./tests/python_tests/test_vlm_pipeline.py tests/python_tests/test_structured_output.py --override-ini cache_dir=/mount/caches/pytest/'
cmd: 'python -m pytest -v ./tests/python_tests/test_llm_pipeline.py tests/python_tests/test_llm_pipeline_static.py ./tests/python_tests/test_vlm_pipeline.py tests/python_tests/test_structured_output.py tests/python_tests/test_eagle3.py --override-ini cache_dir=/mount/caches/pytest/'
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test || fromJSON(needs.smart_ci.outputs.affected_components).LLM.test }}
timeout: 180
- name: 'GGUF Reader tests'
2 changes: 1 addition & 1 deletion .github/workflows/windows.yml
@@ -623,7 +623,7 @@ jobs:
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).continuous_batching.test }}
timeout: 240
- name: 'LLM & VLM'
cmd: 'python -m pytest -s -v tests/python_tests/test_llm_pipeline.py tests/python_tests/test_llm_pipeline_static.py tests/python_tests/test_vlm_pipeline.py tests/python_tests/test_structured_output.py --override-ini cache_dir=/mount/caches/pytest/'
cmd: 'python -m pytest -s -v tests/python_tests/test_llm_pipeline.py tests/python_tests/test_llm_pipeline_static.py tests/python_tests/test_vlm_pipeline.py tests/python_tests/test_structured_output.py tests/python_tests/test_eagle3.py --override-ini cache_dir=/mount/caches/pytest/'
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test || fromJSON(needs.smart_ci.outputs.affected_components).LLM.test }}
timeout: 180
- name: 'GGUF Reader tests'
@@ -65,13 +65,18 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
class ContinuousBatchingImpl;

class ContinuousBatchingForSpeculativeDecodingImpl;
class ContinuousBatchingForEagle3DecodingImpl;
class ContinuousBatchingForPromptLookupImpl;
class SpeculativeDecodingImpl;
class Eagle3DecodingImpl;
class PromptLookupImpl;

friend class ContinuousBatchingForSpeculativeDecodingImpl;

friend class ContinuousBatchingForPromptLookupImpl;
friend class ContinuousBatchingForEagle3DecodingImpl;
friend class SpeculativeDecodingImpl;
friend class Eagle3DecodingImpl;
friend class PromptLookupImpl;

std::shared_ptr<IContinuousBatchingPipeline> m_impl;
245 changes: 243 additions & 2 deletions src/cpp/src/continuous_batching/model_runner.hpp

Large diffs are not rendered by default.

106 changes: 104 additions & 2 deletions src/cpp/src/continuous_batching/pipeline.cpp
@@ -11,14 +11,55 @@
#include "openvino/genai/tokenizer.hpp"
#include "continuous_batching/pipeline_impl.hpp"
#include "speculative_decoding/speculative_decoding_impl.hpp"
#include "speculative_decoding/speculative_decoding_eagle3_impl.hpp"
#include "prompt_lookup/prompt_lookup_impl.hpp"
#include "continuous_batching/timer.hpp"
#include "utils.hpp"
#include "visual_language/inputs_embedder.hpp"
#include "safe_tensor_wrapper.hpp"
#include "json_utils.hpp"

using namespace ov::genai;

namespace {
struct Eagle3RTInfo {
bool eagle3_mode = false;
std::vector<int> hidden_layers_list;
std::filesystem::path dt_mapping_table;
};

Eagle3RTInfo
extract_eagle_mode_from_config(ov::AnyMap& config, const std::filesystem::path& models_path) {
Eagle3RTInfo eagle_rt_info;
if (config.find("eagle3_mode") != config.end()) {
eagle_rt_info.eagle3_mode = config.at("eagle3_mode").as<bool>();
config.erase("eagle3_mode");
if (config.find("hidden_layers_list") != config.end()) {
eagle_rt_info.hidden_layers_list = config.at("hidden_layers_list").as<std::vector<int>>();
config.erase("hidden_layers_list");
} else {
// compute the layers from number of hidden layers
auto config_file_path = models_path / "config.json";
if (!std::filesystem::exists(config_file_path))
OPENVINO_THROW("cannot deduce layers for hidden layer extraction");
std::ifstream file(config_file_path);

nlohmann::json data = nlohmann::json::parse(file);
using ov::genai::utils::read_json_param;
int num_decoder_layers = 0;
read_json_param(data, "num_hidden_layers", num_decoder_layers);
OPENVINO_ASSERT(num_decoder_layers > 3, "num_decoder_layers is too small to deduce hidden layers for extraction");
eagle_rt_info.hidden_layers_list = { 2, num_decoder_layers / 2, num_decoder_layers - 3 };
}
if (config.find("dt_mapping_path") != config.end()) {
eagle_rt_info.dt_mapping_table = config.at("dt_mapping_path").as<std::filesystem::path>();
eagle_rt_info.dt_mapping_table = eagle_rt_info.dt_mapping_table / "eagle3.safetensors";
config.erase("dt_mapping_path");
}
}
return eagle_rt_info;
}
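For reference, the fallback branch above selects three hidden layers from the main model's depth when `hidden_layers_list` is not supplied. A standalone sketch of that heuristic (the free-function name and plain-`int` interface are illustrative, not part of the PR):

```cpp
#include <stdexcept>
#include <vector>

// Standalone sketch of the fallback in extract_eagle_mode_from_config:
// when "hidden_layers_list" is absent, pick an early, a middle and a late
// decoder layer as {2, N/2, N-3}. Requires N > 3, as asserted in the PR.
std::vector<int> deduce_hidden_layers(int num_decoder_layers) {
    if (num_decoder_layers <= 3)
        throw std::invalid_argument("num_decoder_layers too small to deduce hidden layers");
    return {2, num_decoder_layers / 2, num_decoder_layers - 3};
}
```

For a 32-layer main model this yields layers 2, 16 and 29.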

bool
extract_prompt_lookup_from_config(ov::AnyMap& config) {
bool res = false;
@@ -45,6 +45,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
auto properties_without_draft_model = properties;
auto draft_model_desr = utils::extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
auto eagle_rt_info = extract_eagle_mode_from_config(draft_model_desr.properties, models_path);

auto model = utils::read_model(models_path, properties);
auto [properties_without_draft_model_without_gguf, enable_save_ov_model] = utils::extract_gguf_properties(properties_without_draft_model);
@@ -61,6 +103,26 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model_without_gguf, generation_config);
} else if (draft_model_desr.model != nullptr && eagle_rt_info.eagle3_mode) {
OPENVINO_ASSERT(embedder == nullptr, "Eagle speculative decoding is not supported for models with embeddings");
// Eagle speculative decoding does not support dynamic_split_fuse mode,
// because it needs the hidden states produced by the main model to be
// handed to the draft model, which is not implemented for that mode yet.
SchedulerConfig scheduler_config_copy = scheduler_config;
if (scheduler_config.dynamic_split_fuse) {
std::cout << "Note: disabling dynamic split fuse for eagle3 speculative decoding" << std::endl;
scheduler_config_copy.dynamic_split_fuse = false;
}
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config_copy, generation_config);
m_impl = std::make_shared<Eagle3DecodingImpl>(main_model_descr, draft_model_desr, eagle_rt_info.hidden_layers_list);
// parse d2t from safe tensors
if (std::filesystem::exists(eagle_rt_info.dt_mapping_table)) {
ConstantMap constant_tensors = safetensor_to_constant_map(ov::read_tensor_data(eagle_rt_info.dt_mapping_table));
if (constant_tensors.find("d2t") != constant_tensors.end()) { // d2t map can be optional
std::dynamic_pointer_cast<Eagle3DecodingImpl>(m_impl)->set_d2t_for_draft_decoding(constant_tensors["d2t"]);
}
}
} else if (draft_model_desr.model != nullptr) {
OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
@@ -85,13 +147,12 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
auto properties_without_draft_model = properties;
auto draft_model_desr = utils::extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);

auto eagle_rt_info = extract_eagle_mode_from_config(draft_model_desr.properties, models_path);
auto model = utils::read_model(models_path, properties_without_draft_model);
auto [properties_without_draft_model_without_gguf, enable_save_ov_model] = utils::extract_gguf_properties(properties_without_draft_model);
properties_without_draft_model_without_gguf[ov::cache_model_path.name()] = models_path;

auto generation_config = utils::from_config_json_if_exists(models_path);

std::shared_ptr<InputsEmbedder> embedder;
if (std::filesystem::exists(models_path / "openvino_text_embeddings_model.xml")) {
embedder = std::make_shared<InputsEmbedder>(models_path, device, properties_without_draft_model_without_gguf);
@@ -101,6 +162,26 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model_without_gguf, generation_config);
} else if (draft_model_desr.model != nullptr && eagle_rt_info.eagle3_mode) {
OPENVINO_ASSERT(embedder == nullptr, "Eagle speculative decoding is not supported for models with embeddings");
// Eagle speculative decoding does not support dynamic_split_fuse mode,
// because it needs the hidden states produced by the main model to be
// handed to the draft model, which is not implemented for that mode yet.
SchedulerConfig scheduler_config_copy = scheduler_config;
if (scheduler_config.dynamic_split_fuse) {
std::cout << "Note: disabling dynamic split fuse for eagle3 speculative decoding" << std::endl;
scheduler_config_copy.dynamic_split_fuse = false;
}
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config_copy, generation_config);
m_impl = std::make_shared<Eagle3DecodingImpl>(main_model_descr, draft_model_desr, eagle_rt_info.hidden_layers_list);
// parse d2t from safe tensors
if (std::filesystem::exists(eagle_rt_info.dt_mapping_table)) {
ConstantMap constant_tensors = safetensor_to_constant_map(ov::read_tensor_data(eagle_rt_info.dt_mapping_table));
if (constant_tensors.find("d2t") != constant_tensors.end()) { // d2t map can be optional
std::dynamic_pointer_cast<Eagle3DecodingImpl>(m_impl)->set_d2t_for_draft_decoding(constant_tensors["d2t"]);
}
}
} else if (draft_model_desr.model != nullptr) {
OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
@@ -127,6 +208,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
auto properties_without_draft_model = properties;
auto draft_model_desr = utils::extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
auto eagle_rt_info = extract_eagle_mode_from_config(draft_model_desr.properties, std::filesystem::path(model_str));
auto model = utils::singleton_core().read_model(model_str, weights_tensor);

auto rt_info = model->get_rt_info();
@@ -144,6 +226,26 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");
m_impl = std::make_shared<PromptLookupImpl>(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config);
} else if (draft_model_desr.model != nullptr && eagle_rt_info.eagle3_mode) {
OPENVINO_ASSERT(embedder == nullptr, "Eagle speculative decoding is not supported for models with embeddings");
// Eagle speculative decoding does not support dynamic_split_fuse mode,
// because it needs the hidden states produced by the main model to be
// handed to the draft model, which is not implemented for that mode yet.
SchedulerConfig scheduler_config_copy = scheduler_config;
if (scheduler_config.dynamic_split_fuse) {
std::cout << "Note: disabling dynamic split fuse for eagle3 speculative decoding" << std::endl;
scheduler_config_copy.dynamic_split_fuse = false;
}
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config_copy, generation_config);
m_impl = std::make_shared<Eagle3DecodingImpl>(main_model_descr, draft_model_desr, eagle_rt_info.hidden_layers_list);
// parse d2t from safe tensors
if (std::filesystem::exists(eagle_rt_info.dt_mapping_table)) {
ConstantMap constant_tensors = safetensor_to_constant_map(ov::read_tensor_data(eagle_rt_info.dt_mapping_table));
if (constant_tensors.find("d2t") != constant_tensors.end()) { // d2t map can be optional
std::dynamic_pointer_cast<Eagle3DecodingImpl>(m_impl)->set_d2t_for_draft_decoding(constant_tensors["d2t"]);
}
}
} else if (draft_model_desr.model != nullptr) {
OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
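The optional `d2t` tensor loaded above relates the draft model's vocabulary to the target model's. A minimal sketch of how such a table is commonly applied in Eagle-3 style decoding — the additive-offset semantics and names here are an assumption for illustration; the actual handling is behind `set_d2t_for_draft_decoding`:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Assumed d2t semantics (typical for Eagle-3 implementations): the table
// stores one additive offset per draft-vocab id, so a proposed draft token
// maps to target_id = draft_id + d2t[draft_id]. Illustrative only.
int64_t draft_to_target(const std::vector<int64_t>& d2t, int64_t draft_id) {
    return draft_id + d2t.at(static_cast<std::size_t>(draft_id));
}
```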
19 changes: 19 additions & 0 deletions src/cpp/src/llm/pipeline.cpp
@@ -34,6 +34,23 @@ std::pair<std::string, Any> generation_config(const GenerationConfig& config) {
return {utils::CONFIG_ARG_NAME, Any::make<GenerationConfig>(config)};
}

inline void apply_eagle_rt_info(std::shared_ptr<ov::Model>& model, ov::AnyMap& properties, const std::filesystem::path& mapping_path) {
if (model->has_rt_info("eagle3_mode") && model->get_rt_info<bool>("eagle3_mode")) {
properties["eagle3_mode"] = true;
if (model->has_rt_info("hidden_layers_list"))
properties["hidden_layers_list"] = model->get_rt_info<std::vector<int>>("hidden_layers_list");
if (!mapping_path.empty()) {
properties["dt_mapping_path"] = mapping_path; // d2t mapping path
}
}
}

inline void apply_eagle_rt_info(std::shared_ptr<ov::Model>& model,
ov::AnyMap& properties,
const std::string& mapping_path) {
apply_eagle_rt_info(model, properties, std::filesystem::path(mapping_path));
}

std::pair<std::string, Any> draft_model(
const std::filesystem::path& models_path,
const std::string& device,
@@ -42,6 +59,7 @@ std::pair<std::string, Any> draft_model(

std::filesystem::path openvino_model_name = "openvino_model.xml";
auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, plugin_config);
apply_eagle_rt_info(model, plugin_config, models_path);
auto generation_config = utils::from_config_json_if_exists(models_path);
auto tokenizer = ov::genai::Tokenizer(models_path);
return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
@@ -57,6 +75,7 @@ std::pair<std::string, Any> draft_model(
auto [plugin_config, scheduler_config] = utils::extract_scheduler_config(properties);

auto model = utils::singleton_core().read_model(model_str, weights_tensor);
apply_eagle_rt_info(model, plugin_config, model_str);
return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
}

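The two `apply_eagle_rt_info` overloads above copy Eagle-3 markers from the draft model's `rt_info` into the draft model's property map, which `extract_eagle_mode_from_config` later consumes. A simplified stand-in showing just that key flow (`std::map` of strings replaces `ov::AnyMap` and the `rt_info` lookups; names are illustrative):

```cpp
#include <map>
#include <string>

// Simplified stand-in for apply_eagle_rt_info: propagate the eagle3 marker
// and the d2t mapping path into the draft model's properties. The real code
// reads these from the model's rt_info and stores them in an ov::AnyMap.
void propagate_eagle_rt_info(bool eagle3_mode,
                             const std::string& mapping_path,
                             std::map<std::string, std::string>& properties) {
    if (!eagle3_mode)
        return;  // non-eagle draft models get no extra properties
    properties["eagle3_mode"] = "true";
    if (!mapping_path.empty())
        properties["dt_mapping_path"] = mapping_path;  // later joined with "eagle3.safetensors"
}
```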
56 changes: 1 addition & 55 deletions src/cpp/src/lora/adapter.cpp
@@ -40,13 +40,10 @@
#include "openvino/genai/lora_adapter.hpp"

#include "utils.hpp"
#include "safe_tensor_wrapper.hpp"
#include "lora/common.hpp"
#include "lora/names_mapping.hpp"

extern "C" {
#include "safetensors.h"
}

// FIXME: Remove or move to a dedicated common header
#ifdef NDEBUG
#define DEBUG_PRINT(X) do {} while(false)
@@ -69,57 +66,6 @@ using ConstantVector = std::vector<std::shared_ptr<v0::Constant>>;
using LoRANode = LoRAParts<std::shared_ptr<ov::Node>>;
using LoRAPartsParser = LoRAParts<std::function<std::optional<std::string>(const std::string& name)>>;

// Converts Safetensors element type to OV element type. Only part of the types are supported.
ov::element::Type safetensors_to_ov_element_type (int dtype) {
switch(dtype) {
case SAFETENSORS_F32:
return ov::element::f32;
case SAFETENSORS_F16:
return ov::element::f16;
case SAFETENSORS_BF16:
return ov::element::bf16;
default:
OPENVINO_THROW("Not supported safetensors dtype: ", dtype);
}
}

using ConstantMap = std::map<std::string, std::shared_ptr<ov::op::v0::Constant>>;

// Safetensor file parser that deallocates temporary buffers automatically.
// Drop-in replacement for the third party safetensors_File struct.
struct AutoSafetensor: public safetensors_File {
~AutoSafetensor () {
std::free(tensors);
std::free(metadata);
}
};

// The key in the map is a tensor name and the Constant uses a region of memory from the memory block.
// Each Constant holds a shared pointer to the block in the runtime info.
// The memory block will be deallocated when the last Constant is destroyed.
ConstantMap safetensor_to_constant_map(const ov::Tensor& safetensor) {
AutoSafetensor safe_tensors_file{};

OPENVINO_ASSERT(safetensors_file_init(safetensor.data<char>(), safetensor.get_byte_size(), &safe_tensors_file) == nullptr,
"Cannot parse safetensor as a Safetensors file format. Safetensors file format is supported only"
);

ConstantMap tensors;
for (int i = 0; i < safe_tensors_file.num_tensors; i++) {
safetensors_TensorDescriptor tensor = safe_tensors_file.tensors[i];
std::string name(tensor.name.ptr, tensor.name.ptr + tensor.name.len);
ov::Shape shape(tensor.shape, tensor.shape + tensor.n_dimensions);
void* ptr = tensor.ptr; // FIXME: needs a non-constant pointer because Tensor doesn't accept a constant pointer

auto type = safetensors_to_ov_element_type(tensor.dtype);
auto constant =
std::make_shared<v0::Constant>(type, shape, ptr, nullptr); // wraps existing memory, no ownership
constant->get_rt_info()["__safetensors_buffer_holder"] = safetensor; // to automatically deallocate underlying memory buffer when last constant that holds it is destroyed
tensors[name] = constant;
}
return tensors;
}

// Reads a file with a given filename expecting Safetensors file format.
// The file data is mmaped to tensor.
ConstantMap read_safetensors(const std::filesystem::path& filename) {