update per model d2t changes

songbell · songbell · commit a9f68dbe63e9 · 2025-10-24T00:03:47.000+08:00
Signed-off-by: fishbell <bell.song@intel.com>
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
@@ -522,7 +522,7 @@ jobs:
             run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).continuous_batching.test }}
             timeout: 240
           - name: 'LLM & VLM'
-            cmd: 'python -m pytest -v ./tests/python_tests/test_llm_pipeline.py tests/python_tests/test_llm_pipeline_static.py ./tests/python_tests/test_vlm_pipeline.py tests/python_tests/test_structured_output.py tests/python_tests/test_eagle3.py --override-ini cache_dir=/mount/caches/pytest/'
+            cmd: 'python -m pytest -v ./tests/python_tests/test_llm_pipeline.py tests/python_tests/test_llm_pipeline_static.py ./tests/python_tests/test_vlm_pipeline.py tests/python_tests/test_structured_output.py --override-ini cache_dir=/mount/caches/pytest/'
             run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test || fromJSON(needs.smart_ci.outputs.affected_components).LLM.test }}
             timeout: 180
           - name: 'GGUF Reader tests'
@@ -551,6 +551,12 @@ jobs:
               python -m pytest -v ./tools/who_what_benchmark/tests -m nanollava
             run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).WWB.test }}
             timeout: 90
+          - name: 'EAGLE3 speculative decoding tests'
+            cmd: |
+              python -m pip install git+https://github.com/xufang-lisa/optimum-intel.git@e67abb1a20fb190b39c1dc0216cddb65b300210f
+              python -m pytest -v ./tests/python_tests/test_eagle3.py
+            run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).eagle3_speculative_decoding.test }}
+            timeout: 90
     defaults:
       run:
         shell: bash
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
@@ -623,7 +623,7 @@ jobs:
             run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).continuous_batching.test }}
             timeout: 240
           - name: 'LLM & VLM'
-            cmd: 'python -m pytest -s -v tests/python_tests/test_llm_pipeline.py tests/python_tests/test_llm_pipeline_static.py tests/python_tests/test_vlm_pipeline.py tests/python_tests/test_structured_output.py tests/python_tests/test_eagle3.py --override-ini cache_dir=/mount/caches/pytest/'
+            cmd: 'python -m pytest -s -v tests/python_tests/test_llm_pipeline.py tests/python_tests/test_llm_pipeline_static.py tests/python_tests/test_vlm_pipeline.py tests/python_tests/test_structured_output.py --override-ini cache_dir=/mount/caches/pytest/'
             run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test || fromJSON(needs.smart_ci.outputs.affected_components).LLM.test }}
             timeout: 180
           - name: 'GGUF Reader tests'
@@ -652,6 +652,12 @@ jobs:
               python -m pytest -v ./tools/who_what_benchmark/tests -m nanollava
             run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).WWB.test }}
             timeout: 90
+          - name: 'EAGLE3 speculative decoding tests'
+            cmd: |
+              python -m pip install git+https://github.com/xufang-lisa/optimum-intel.git@e67abb1a20fb190b39c1dc0216cddb65b300210f
+              python -m pytest -v ./tests/python_tests/test_eagle3.py
+            run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).eagle3_speculative_decoding.test }}
+            timeout: 90
     defaults:
       run:
         shell: pwsh
diff --git a/src/cpp/src/continuous_batching/pipeline.cpp b/src/cpp/src/continuous_batching/pipeline.cpp
@@ -16,7 +16,6 @@
 #include "continuous_batching/timer.hpp"
 #include "utils.hpp"
 #include "visual_language/inputs_embedder.hpp"
-#include "safe_tensor_wrapper.hpp"
 #include "json_utils.hpp"
 
 using namespace ov::genai;
@@ -51,11 +50,6 @@ extract_eagle_mode_from_config(ov::AnyMap& config, const std::filesystem::path&
             OPENVINO_ASSERT(num_decoder_layers > 3, "num_decoder_layers is too small to deduce hidden layers for extraction");
             eagle_rt_info.hidden_layers_list = { 2, num_decoder_layers / 2, num_decoder_layers - 3 };
         }
-        if (config.find("dt_mapping_path") != config.end()) {
-            eagle_rt_info.dt_mapping_table = config.at("dt_mapping_path").as<std::filesystem::path>();
-            eagle_rt_info.dt_mapping_table = eagle_rt_info.dt_mapping_table / "eagle3.safetensors";
-            config.erase("dt_mapping_path");
-        }
     }
     return eagle_rt_info;
 }
@@ -107,13 +101,6 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
         OPENVINO_ASSERT(embedder == nullptr, "Eagle speculative decoding is not supported for models with embeddings");
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
         m_impl = std::make_shared<Eagle3DecodingImpl>(main_model_descr, draft_model_desr, eagle_rt_info.hidden_layers_list);
-        // parse d2t from safe tensors
-        if (std::filesystem::exists(eagle_rt_info.dt_mapping_table)) {
-            ConstantMap constant_tensors = safetensor_to_constant_map(ov::read_tensor_data(eagle_rt_info.dt_mapping_table));
-            if (constant_tensors.find("d2t") != constant_tensors.end()) { // d2t map can be optional
-                std::dynamic_pointer_cast<Eagle3DecodingImpl>(m_impl)->set_d2t_for_draft_decoding(constant_tensors["d2t"]);
-            }
-        }
     } else if (draft_model_desr.model != nullptr) {
         OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
@@ -160,13 +147,6 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
         // to be implemented future
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
         m_impl = std::make_shared<Eagle3DecodingImpl>(main_model_descr, draft_model_desr, eagle_rt_info.hidden_layers_list);
-        // parse d2t from safe tensors
-        if (std::filesystem::exists(eagle_rt_info.dt_mapping_table)) {
-            ConstantMap constant_tensors = safetensor_to_constant_map(ov::read_tensor_data(eagle_rt_info.dt_mapping_table));
-            if (constant_tensors.find("d2t") != constant_tensors.end()) { // d2t map can be optional
-                std::dynamic_pointer_cast<Eagle3DecodingImpl>(m_impl)->set_d2t_for_draft_decoding(constant_tensors["d2t"]);
-            }
-        }
     } else if (draft_model_desr.model != nullptr) {
         OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
@@ -215,13 +195,6 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
         OPENVINO_ASSERT(embedder == nullptr, "Eagle speculative decoding is not supported for models with embeddings");
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
         m_impl = std::make_shared<Eagle3DecodingImpl>(main_model_descr, draft_model_desr, eagle_rt_info.hidden_layers_list);
-         // parse d2t from safe tensors
-        if (std::filesystem::exists(eagle_rt_info.dt_mapping_table)) {
-            ConstantMap constant_tensors = safetensor_to_constant_map(ov::read_tensor_data(eagle_rt_info.dt_mapping_table));
-            if (constant_tensors.find("d2t") != constant_tensors.end()) { // d2t map can be optional
-                std::dynamic_pointer_cast<Eagle3DecodingImpl>(m_impl)->set_d2t_for_draft_decoding(constant_tensors["d2t"]);
-            }
-        }
     } else if (draft_model_desr.model != nullptr) {
         OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
         auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
diff --git a/src/cpp/src/llm/pipeline.cpp b/src/cpp/src/llm/pipeline.cpp
@@ -39,9 +39,6 @@ inline void apply_eagle_rt_info(std::shared_ptr<ov::Model>& model, ov::AnyMap& p
         properties["eagle3_mode"] = true;
         if (model->has_rt_info("hidden_layers_list"))
             properties["hidden_layers_list"] = model->get_rt_info<std::vector<int>>("hidden_layers_list");
-        if (!mapping_path.empty()) {
-            properties["dt_mapping_path"] = mapping_path; // d2t mapping path
-        }
     }
 }
 
diff --git a/src/cpp/src/lora/adapter.cpp b/src/cpp/src/lora/adapter.cpp
@@ -40,10 +40,13 @@
 #include "openvino/genai/lora_adapter.hpp"
 
 #include "utils.hpp"
-#include "safe_tensor_wrapper.hpp"
 #include "lora/common.hpp"
 #include "lora/names_mapping.hpp"
 
+extern "C" {
+    #include "safetensors.h"
+}
+
 // FIXME: Remove or move to a dedicated common header
 #ifdef NDEBUG
     #define DEBUG_PRINT(X) do {} while(false)
@@ -66,6 +69,57 @@ using ConstantVector = std::vector<std::shared_ptr<v0::Constant>>;
 using LoRANode = LoRAParts<std::shared_ptr<ov::Node>>;
 using LoRAPartsParser = LoRAParts<std::function<std::optional<std::string>(const std::string& name)>>;
 
+// Maps a Safetensors dtype code onto the matching OpenVINO element type.
+// Only F32, F16 and BF16 are recognised; any other code is reported through OPENVINO_THROW.
+ov::element::Type safetensors_to_ov_element_type (int dtype) {
+    if (dtype == SAFETENSORS_F32) {
+        return ov::element::f32;
+    }
+    if (dtype == SAFETENSORS_F16) {
+        return ov::element::f16;
+    }
+    if (dtype == SAFETENSORS_BF16) {
+        return ov::element::bf16;
+    }
+    OPENVINO_THROW("Not supported safetensors dtype: ", dtype);
+}
+
+using ConstantMap = std::map<std::string, std::shared_ptr<ov::op::v0::Constant>>;
+
+// Safetensor file parser that deallocates temporary buffers automatically.
+// Drop-in replacement for the third party safetensors_File struct.
+// The destructor frees `tensors` and `metadata`, so the type is non-copyable: a copy would
+// carry the same two pointers and free them a second time when it is destroyed.
+struct AutoSafetensor: public safetensors_File {
+    // Declared explicitly because the deleted copy constructor below suppresses the implicit one;
+    // `AutoSafetensor x{}` still zero-initialises the inherited fields.
+    AutoSafetensor() = default;
+    AutoSafetensor(const AutoSafetensor&) = delete;
+    AutoSafetensor& operator=(const AutoSafetensor&) = delete;
+
+    ~AutoSafetensor () {
+        std::free(tensors);
+        std::free(metadata);
+    }
+};
+
+// Parses an in-memory Safetensors image and returns a map from tensor name to Constant.
+// Each Constant wraps the tensor's region inside `safetensor` without copying it and stores a copy
+// of the `safetensor` handle in its runtime info under "__safetensors_buffer_holder", so the
+// underlying buffer is released only after the last such Constant is destroyed.
+// Throws (via OPENVINO_ASSERT) when the buffer cannot be parsed, and (via
+// safetensors_to_ov_element_type) when a tensor uses an unsupported dtype.
+ConstantMap safetensor_to_constant_map(const ov::Tensor& safetensor) {
+    AutoSafetensor parsed_file{};
+
+    // The parser returns nullptr on success.
+    OPENVINO_ASSERT(safetensors_file_init(safetensor.data<char>(), safetensor.get_byte_size(), &parsed_file) == nullptr,
+        "Cannot parse safetensor as a Safetensors file format. Safetensors file format is supported only"
+    );
+
+    ConstantMap result;
+    for (int idx = 0; idx < parsed_file.num_tensors; ++idx) {
+        const safetensors_TensorDescriptor& descr = parsed_file.tensors[idx];
+
+        std::string tensor_name(descr.name.ptr, descr.name.ptr + descr.name.len);
+        ov::Shape tensor_shape(descr.shape, descr.shape + descr.n_dimensions);
+        void* data = descr.ptr;     // FIXME: needs a non-constant pointer because Tensor doesn't accept a constant pointer
+        auto element_type = safetensors_to_ov_element_type(descr.dtype);
+
+        // Wraps existing memory; the Constant itself takes no ownership of it.
+        auto wrapped = std::make_shared<v0::Constant>(element_type, tensor_shape, data, nullptr);
+        // Keeps the source buffer alive for as long as this Constant exists.
+        wrapped->get_rt_info()["__safetensors_buffer_holder"] = safetensor;
+        result[tensor_name] = wrapped;
+    }
+    return result;
+}
+
 // Reads a file with a given filename expecting Safetensors file format.
 // The file data is mmaped to tensor.
 ConstantMap read_safetensors(const std::filesystem::path& filename) {
@@ -1713,4 +1767,4 @@ void AdapterConfig::set_adapters_and_alphas(const std::vector<std::pair<Adapter,
 
 
 }  // namespace genai
-}  // namespace ov
+}  // namespace ov
diff --git a/src/cpp/src/lora/safetensors.c b/src/cpp/src/lora/safetensors.c
diff --git a/src/cpp/src/safe_tensor_wrapper.cpp b/src/cpp/src/safe_tensor_wrapper.cpp
diff --git a/src/cpp/src/safe_tensor_wrapper.hpp b/src/cpp/src/safe_tensor_wrapper.hpp
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_eagle3_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_eagle3_impl.cpp
@@ -51,6 +51,16 @@ void share_embedding_weights(std::shared_ptr<ov::Model>& main_model, std::shared
     }
 }
 
+// Searches the outputs of `model` for a d2t mapping table.
+// Returns the first Constant that directly feeds a Result node and whose friendly name contains
+// "d2t", or nullptr when no Result is fed by such a Constant.
+std::shared_ptr<ov::op::v0::Constant> extract_d2t_mapping_table(std::shared_ptr<ov::Model>& model) {
+    for (const auto& result : model->get_results()) {
+        const auto producer = result->input_value(0).get_node_shared_ptr();
+        // as_type_ptr yields nullptr when the producer is not a Constant.
+        const auto constant = ov::as_type_ptr<ov::op::v0::Constant>(producer);
+        if (constant && producer->get_friendly_name().find("d2t") != std::string::npos) {
+            return constant;
+        }
+    }
+    return nullptr;
+}
 void extract_hidden_state_generic(std::shared_ptr<ov::Model>& model,
                                   const std::vector<int>& hidden_layers_to_abstract) {
     ov::pass::Manager pm;
@@ -317,7 +327,9 @@ ContinuousBatchingPipeline::Eagle3DecodingImpl::Eagle3DecodingImpl(const ov::gen
     m_draft_pipeline->raw_perf_metrics.m_inference_durations = {{ MicroSeconds(0.0f) }};
 
     // specific params update for eagle pipeline
-    update_eagle_pipeline_params();
+    // check draft_model, retrieve d2t table if exists
+    auto d2t_tensor = extract_d2t_mapping_table(draft_model);
+    update_eagle_pipeline_params(d2t_tensor);
 }
 
 ov::Tensor ContinuousBatchingPipeline::Eagle3DecodingImpl::create_draft_input_ids(const ov::Tensor& original_input_ids) {
@@ -339,7 +351,7 @@ ov::Tensor ContinuousBatchingPipeline::Eagle3DecodingImpl::create_draft_input_id
     return draft_input_ids;
 }
 
-void ContinuousBatchingPipeline::Eagle3DecodingImpl::update_eagle_pipeline_params() {
+void ContinuousBatchingPipeline::Eagle3DecodingImpl::update_eagle_pipeline_params(std::shared_ptr<ov::op::v0::Constant>& d2t_tensor) {
     auto m_main_eagle_pipeline  = std::dynamic_pointer_cast<ContinuousBatchingForEagle3DecodingImpl>(m_main_pipeline);
     auto m_draft_eagle_pipeline = std::dynamic_pointer_cast<ContinuousBatchingForEagle3DecodingImpl>(m_draft_pipeline);
     m_main_eagle_pipeline->set_hidden_state_export_needed(true);
@@ -348,6 +360,7 @@ void ContinuousBatchingPipeline::Eagle3DecodingImpl::update_eagle_pipeline_param
     m_draft_eagle_pipeline->set_hidden_state_internal_needed(true);
     m_draft_eagle_pipeline->set_adjust_factor(
         m_hidden_layers_to_abstract.size() > 0 ? m_hidden_layers_to_abstract.size() : 1);
+    m_draft_eagle_pipeline->set_d2t_for_draft_decoding(d2t_tensor);
 }
 
 GenerationHandle
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_eagle3_impl.hpp b/src/cpp/src/speculative_decoding/speculative_decoding_eagle3_impl.hpp
@@ -44,13 +44,8 @@ class ContinuousBatchingPipeline::Eagle3DecodingImpl : public ContinuousBatching
     GenerationHandle add_request(uint64_t request_id,
                                  const std::string& prompt,
                                  ov::genai::GenerationConfig sampling_params) override;
-
-    void set_d2t_for_draft_decoding(std::shared_ptr<ov::op::v0::Constant>& d2t_tensor) {
-        auto eagle_impl = std::static_pointer_cast<ContinuousBatchingForEagle3DecodingImpl>(m_draft_pipeline);
-        eagle_impl->set_d2t_for_draft_decoding(d2t_tensor);
-    };
 protected:
-    void update_eagle_pipeline_params();
+    void update_eagle_pipeline_params(std::shared_ptr<ov::op::v0::Constant>& d2t_tensor);
     ov::Tensor create_draft_input_ids(const ov::Tensor& original_input_ids);
     std::vector<int> m_hidden_layers_to_abstract;
 };
diff --git a/tests/python_tests/test_eagle3.py b/tests/python_tests/test_eagle3.py
@@ -22,7 +22,6 @@ def add(a, b):
 @pytest.mark.parametrize("main_model,draft_model,prompt", eagle_models_and_input)
 @pytest.mark.parametrize("main_device,draft_device", devices)
 @pytest.mark.precommit
-@pytest.mark.skip(reason="CVS-174959 enable model conversion for eagle3 and enable the test")
 def test_eagle3_sd_string_inputs(main_model, main_device, draft_model, draft_device, prompt):
     # Download and convert model:
     main_opt_model, main_hf_tokenizer, main_model_path = download_and_convert_model(main_model)
@@ -48,7 +47,6 @@ def test_eagle3_sd_string_inputs(main_model, main_device, draft_model, draft_dev
 @pytest.mark.parametrize("main_model,draft_model,prompt", eagle_models_and_input)
 @pytest.mark.parametrize("main_device,draft_device", devices)
 @pytest.mark.precommit
-@pytest.mark.skip(reason="CVS-174959 enable model conversion for eagle3 and enable the test")
 def test_eagle3_sd_extended_perf_metrics(main_model, main_device, draft_model, draft_device, prompt):
     import time
     extended_perf_metrics = None

Original file line number	Diff line number	Diff line change
`@@ -39,9 +39,6 @@ inline void apply_eagle_rt_info(std::shared_ptr<ov::Model>& model, ov::AnyMap& p`
`39`	`39`	`properties["eagle3_mode"] = true;`
`40`	`40`	`if (model->has_rt_info("hidden_layers_list"))`
`41`	`41`	`properties["hidden_layers_list"] = model->get_rt_info<std::vector<int>>("hidden_layers_list");`
`42`		`- if (!mapping_path.empty()) {`
`43`		`- properties["dt_mapping_path"] = mapping_path; // d2t mapping path`
`44`		`- }`
`45`	`42`	`}`
`46`	`43`	`}`
`47`	`44`