Skip to content

Commit a9f68db

Browse files
committed
update per model d2t changes
Signed-off-by: fishbell <[email protected]>
1 parent efd854b commit a9f68db

File tree

11 files changed

+86
-114
lines changed

11 files changed

+86
-114
lines changed

.github/workflows/linux.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -522,7 +522,7 @@ jobs:
522522
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).continuous_batching.test }}
523523
timeout: 240
524524
- name: 'LLM & VLM'
525-
cmd: 'python -m pytest -v ./tests/python_tests/test_llm_pipeline.py tests/python_tests/test_llm_pipeline_static.py ./tests/python_tests/test_vlm_pipeline.py tests/python_tests/test_structured_output.py tests/python_tests/test_eagle3.py --override-ini cache_dir=/mount/caches/pytest/'
525+
cmd: 'python -m pytest -v ./tests/python_tests/test_llm_pipeline.py tests/python_tests/test_llm_pipeline_static.py ./tests/python_tests/test_vlm_pipeline.py tests/python_tests/test_structured_output.py --override-ini cache_dir=/mount/caches/pytest/'
526526
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test || fromJSON(needs.smart_ci.outputs.affected_components).LLM.test }}
527527
timeout: 180
528528
- name: 'GGUF Reader tests'
@@ -551,6 +551,12 @@ jobs:
551551
python -m pytest -v ./tools/who_what_benchmark/tests -m nanollava
552552
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).WWB.test }}
553553
timeout: 90
554+
- name: 'EAGLE3 speculative decoding tests'
555+
# Installs optimum-intel from a fork pinned to a fixed commit, then runs the EAGLE3 test module.
cmd: |
556+
python -m pip install git+https://github.com/xufang-lisa/optimum-intel.git@e67abb1a20fb190b39c1dc0216cddb65b300210f
557+
python -m pytest -v ./tests/python_tests/test_eagle3.py
558+
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).eagle3_speculative_decoding.test }}
559+
timeout: 90
554560
defaults:
555561
run:
556562
shell: bash

.github/workflows/windows.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -623,7 +623,7 @@ jobs:
623623
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).continuous_batching.test }}
624624
timeout: 240
625625
- name: 'LLM & VLM'
626-
cmd: 'python -m pytest -s -v tests/python_tests/test_llm_pipeline.py tests/python_tests/test_llm_pipeline_static.py tests/python_tests/test_vlm_pipeline.py tests/python_tests/test_structured_output.py tests/python_tests/test_eagle3.py --override-ini cache_dir=/mount/caches/pytest/'
626+
cmd: 'python -m pytest -s -v tests/python_tests/test_llm_pipeline.py tests/python_tests/test_llm_pipeline_static.py tests/python_tests/test_vlm_pipeline.py tests/python_tests/test_structured_output.py --override-ini cache_dir=/mount/caches/pytest/'
627627
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test || fromJSON(needs.smart_ci.outputs.affected_components).LLM.test }}
628628
timeout: 180
629629
- name: 'GGUF Reader tests'
@@ -652,6 +652,12 @@ jobs:
652652
python -m pytest -v ./tools/who_what_benchmark/tests -m nanollava
653653
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).WWB.test }}
654654
timeout: 90
655+
- name: 'EAGLE3 speculative decoding tests'
656+
# Installs optimum-intel from a fork pinned to a fixed commit, then runs the EAGLE3 test module.
cmd: |
657+
python -m pip install git+https://github.com/xufang-lisa/optimum-intel.git@e67abb1a20fb190b39c1dc0216cddb65b300210f
658+
python -m pytest -v ./tests/python_tests/test_eagle3.py
659+
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).eagle3_speculative_decoding.test }}
660+
timeout: 90
655661
defaults:
656662
run:
657663
shell: pwsh

src/cpp/src/continuous_batching/pipeline.cpp

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
#include "continuous_batching/timer.hpp"
1717
#include "utils.hpp"
1818
#include "visual_language/inputs_embedder.hpp"
19-
#include "safe_tensor_wrapper.hpp"
2019
#include "json_utils.hpp"
2120

2221
using namespace ov::genai;
@@ -51,11 +50,6 @@ extract_eagle_mode_from_config(ov::AnyMap& config, const std::filesystem::path&
5150
OPENVINO_ASSERT(num_decoder_layers > 3, "num_decoder_layers is too small to deduce hidden layers for extraction");
5251
eagle_rt_info.hidden_layers_list = { 2, num_decoder_layers / 2, num_decoder_layers - 3 };
5352
}
54-
if (config.find("dt_mapping_path") != config.end()) {
55-
eagle_rt_info.dt_mapping_table = config.at("dt_mapping_path").as<std::filesystem::path>();
56-
eagle_rt_info.dt_mapping_table = eagle_rt_info.dt_mapping_table / "eagle3.safetensors";
57-
config.erase("dt_mapping_path");
58-
}
5953
}
6054
return eagle_rt_info;
6155
}
@@ -107,13 +101,6 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
107101
OPENVINO_ASSERT(embedder == nullptr, "Eagle speculative decoding is not supported for models with embeddings");
108102
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
109103
m_impl = std::make_shared<Eagle3DecodingImpl>(main_model_descr, draft_model_desr, eagle_rt_info.hidden_layers_list);
110-
// parse d2t from safe tensors
111-
if (std::filesystem::exists(eagle_rt_info.dt_mapping_table)) {
112-
ConstantMap constant_tensors = safetensor_to_constant_map(ov::read_tensor_data(eagle_rt_info.dt_mapping_table));
113-
if (constant_tensors.find("d2t") != constant_tensors.end()) { // d2t map can be optional
114-
std::dynamic_pointer_cast<Eagle3DecodingImpl>(m_impl)->set_d2t_for_draft_decoding(constant_tensors["d2t"]);
115-
}
116-
}
117104
} else if (draft_model_desr.model != nullptr) {
118105
OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
119106
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
@@ -160,13 +147,6 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
160147
// to be implemented future
161148
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
162149
m_impl = std::make_shared<Eagle3DecodingImpl>(main_model_descr, draft_model_desr, eagle_rt_info.hidden_layers_list);
163-
// parse d2t from safe tensors
164-
if (std::filesystem::exists(eagle_rt_info.dt_mapping_table)) {
165-
ConstantMap constant_tensors = safetensor_to_constant_map(ov::read_tensor_data(eagle_rt_info.dt_mapping_table));
166-
if (constant_tensors.find("d2t") != constant_tensors.end()) { // d2t map can be optional
167-
std::dynamic_pointer_cast<Eagle3DecodingImpl>(m_impl)->set_d2t_for_draft_decoding(constant_tensors["d2t"]);
168-
}
169-
}
170150
} else if (draft_model_desr.model != nullptr) {
171151
OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
172152
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model_without_gguf, scheduler_config, generation_config);
@@ -215,13 +195,6 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
215195
OPENVINO_ASSERT(embedder == nullptr, "Eagle speculative decoding is not supported for models with embeddings");
216196
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
217197
m_impl = std::make_shared<Eagle3DecodingImpl>(main_model_descr, draft_model_desr, eagle_rt_info.hidden_layers_list);
218-
// parse d2t from safe tensors
219-
if (std::filesystem::exists(eagle_rt_info.dt_mapping_table)) {
220-
ConstantMap constant_tensors = safetensor_to_constant_map(ov::read_tensor_data(eagle_rt_info.dt_mapping_table));
221-
if (constant_tensors.find("d2t") != constant_tensors.end()) { // d2t map can be optional
222-
std::dynamic_pointer_cast<Eagle3DecodingImpl>(m_impl)->set_d2t_for_draft_decoding(constant_tensors["d2t"]);
223-
}
224-
}
225198
} else if (draft_model_desr.model != nullptr) {
226199
OPENVINO_ASSERT(embedder == nullptr, "Speculative decoding is not supported for models with embeddings");
227200
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);

src/cpp/src/llm/pipeline.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,6 @@ inline void apply_eagle_rt_info(std::shared_ptr<ov::Model>& model, ov::AnyMap& p
3939
properties["eagle3_mode"] = true;
4040
if (model->has_rt_info("hidden_layers_list"))
4141
properties["hidden_layers_list"] = model->get_rt_info<std::vector<int>>("hidden_layers_list");
42-
if (!mapping_path.empty()) {
43-
properties["dt_mapping_path"] = mapping_path; // d2t mapping path
44-
}
4542
}
4643
}
4744

src/cpp/src/lora/adapter.cpp

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,13 @@
4040
#include "openvino/genai/lora_adapter.hpp"
4141

4242
#include "utils.hpp"
43-
#include "safe_tensor_wrapper.hpp"
4443
#include "lora/common.hpp"
4544
#include "lora/names_mapping.hpp"
4645

46+
extern "C" {
47+
#include "safetensors.h"
48+
}
49+
4750
// FIXME: Remove or move to a dedicated common header
4851
#ifdef NDEBUG
4952
#define DEBUG_PRINT(X) do {} while(false)
@@ -66,6 +69,57 @@ using ConstantVector = std::vector<std::shared_ptr<v0::Constant>>;
6669
using LoRANode = LoRAParts<std::shared_ptr<ov::Node>>;
6770
using LoRAPartsParser = LoRAParts<std::function<std::optional<std::string>(const std::string& name)>>;
6871

72+
// Converts a Safetensors element type code to the matching OV element type.
// Only SAFETENSORS_F32, SAFETENSORS_F16 and SAFETENSORS_BF16 are handled;
// any other dtype value is reported via OPENVINO_THROW, with the numeric
// dtype code included in the message.
73+
ov::element::Type safetensors_to_ov_element_type (int dtype) {
74+
switch(dtype) {
75+
case SAFETENSORS_F32:
76+
return ov::element::f32;
77+
case SAFETENSORS_F16:
78+
return ov::element::f16;
79+
case SAFETENSORS_BF16:
80+
return ov::element::bf16;
81+
default:
82+
OPENVINO_THROW("Not supported safetensors dtype: ", dtype);
83+
}
84+
}
85+
86+
using ConstantMap = std::map<std::string, std::shared_ptr<ov::op::v0::Constant>>;
87+
88+
// Safetensor file parser that deallocates temporary buffers automatically.
89+
// Drop-in replacement for the third party safetensors_File struct.
// The destructor releases the inherited `tensors` and `metadata` members with std::free.
// NOTE(review): no copy/move operations are declared, so copying an instance would
// make two destructors free the same pointers — confirm instances are never copied.
90+
struct AutoSafetensor: public safetensors_File {
91+
~AutoSafetensor () {
92+
std::free(tensors);
93+
std::free(metadata);
94+
}
95+
};
96+
97+
// Parses an in-memory Safetensors image and returns one Constant per tensor it describes.
// The key in the map is a tensor name and the Constant uses a region of memory from the memory block.
98+
// Each Constant stores the `safetensor` tensor in its runtime info under
// "__safetensors_buffer_holder" so the underlying buffer stays referenced.
99+
// The memory block will be deallocated when the last Constant is destroyed.
// Fails an OPENVINO_ASSERT if safetensors_file_init returns a non-null value, and throws
// (via safetensors_to_ov_element_type) for dtypes other than F32/F16/BF16.
100+
ConstantMap safetensor_to_constant_map(const ov::Tensor& safetensor) {
101+
// Value-initialized descriptor; its destructor frees the parser's temporary arrays on scope exit.
AutoSafetensor safe_tensors_file{};
102+
103+
// safetensors_file_init is expected to return nullptr here; any other result is treated as a parse failure.
OPENVINO_ASSERT(safetensors_file_init(safetensor.data<char>(), safetensor.get_byte_size(), &safe_tensors_file) == nullptr,
104+
"Cannot parse safetensor as a Safetensors file format. Safetensors file format is supported only"
105+
);
106+
107+
ConstantMap tensors;
108+
for (int i = 0; i < safe_tensors_file.num_tensors; i++) {
109+
safetensors_TensorDescriptor tensor = safe_tensors_file.tensors[i];
110+
// The name is given as pointer + length, so copy it into an owning std::string.
std::string name(tensor.name.ptr, tensor.name.ptr + tensor.name.len);
111+
ov::Shape shape(tensor.shape, tensor.shape + tensor.n_dimensions);
112+
void* ptr = tensor.ptr; // FIXME: needs a non-constant pointer because Tensor doesn't accept a constant pointer
113+
114+
auto type = safetensors_to_ov_element_type(tensor.dtype);
115+
auto constant =
116+
std::make_shared<v0::Constant>(type, shape, ptr, nullptr); // wraps existing memory, no ownership
117+
constant->get_rt_info()["__safetensors_buffer_holder"] = safetensor; // to automatically deallocate underlying memory buffer when last constant that holds it is destroyed
118+
// A later tensor with the same name overwrites the earlier entry.
tensors[name] = constant;
119+
}
120+
return tensors;
121+
}
122+
69123
// Reads a file with a given filename expecting Safetensors file format.
70124
// The file data is mmaped to tensor.
71125
ConstantMap read_safetensors(const std::filesystem::path& filename) {
@@ -1713,4 +1767,4 @@ void AdapterConfig::set_adapters_and_alphas(const std::vector<std::pair<Adapter,
17131767

17141768

17151769
} // namespace genai
1716-
} // namespace ov
1770+
} // namespace ov
File renamed without changes.

src/cpp/src/safe_tensor_wrapper.cpp

Lines changed: 0 additions & 43 deletions
This file was deleted.

src/cpp/src/safe_tensor_wrapper.hpp

Lines changed: 0 additions & 27 deletions
This file was deleted.

src/cpp/src/speculative_decoding/speculative_decoding_eagle3_impl.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,16 @@ void share_embedding_weights(std::shared_ptr<ov::Model>& main_model, std::shared
5151
}
5252
}
5353

54+
// Looks for a d2t mapping table exposed as a model output.
// Walks the model's Result nodes and returns the first Constant that feeds a Result
// directly (input 0) and whose friendly name contains "d2t".
// Returns nullptr when no such Constant is found, so callers must handle a null table.
std::shared_ptr<ov::op::v0::Constant> extract_d2t_mapping_table(std::shared_ptr<ov::Model>& model) {
55+
// extract result nodes from model
56+
for (const auto& result : model->get_results()) {
57+
auto input_node = result->input_value(0).get_node_shared_ptr();
58+
// Substring match on the friendly name; only Constant producers qualify.
if (ov::is_type<ov::op::v0::Constant>(input_node) && input_node->get_friendly_name().find("d2t") != std::string::npos) {
59+
return ov::as_type_ptr<ov::op::v0::Constant>(input_node);
60+
}
61+
}
62+
return nullptr;
63+
}
5464
void extract_hidden_state_generic(std::shared_ptr<ov::Model>& model,
5565
const std::vector<int>& hidden_layers_to_abstract) {
5666
ov::pass::Manager pm;
@@ -317,7 +327,9 @@ ContinuousBatchingPipeline::Eagle3DecodingImpl::Eagle3DecodingImpl(const ov::gen
317327
m_draft_pipeline->raw_perf_metrics.m_inference_durations = {{ MicroSeconds(0.0f) }};
318328

319329
// specific params update for eagle pipeline
320-
update_eagle_pipeline_params();
330+
// check draft_model, retrieve d2t table if exists
331+
auto d2t_tensor = extract_d2t_mapping_table(draft_model);
332+
update_eagle_pipeline_params(d2t_tensor);
321333
}
322334

323335
ov::Tensor ContinuousBatchingPipeline::Eagle3DecodingImpl::create_draft_input_ids(const ov::Tensor& original_input_ids) {
@@ -339,7 +351,7 @@ ov::Tensor ContinuousBatchingPipeline::Eagle3DecodingImpl::create_draft_input_id
339351
return draft_input_ids;
340352
}
341353

342-
void ContinuousBatchingPipeline::Eagle3DecodingImpl::update_eagle_pipeline_params() {
354+
void ContinuousBatchingPipeline::Eagle3DecodingImpl::update_eagle_pipeline_params(std::shared_ptr<ov::op::v0::Constant>& d2t_tensor) {
343355
auto m_main_eagle_pipeline = std::dynamic_pointer_cast<ContinuousBatchingForEagle3DecodingImpl>(m_main_pipeline);
344356
auto m_draft_eagle_pipeline = std::dynamic_pointer_cast<ContinuousBatchingForEagle3DecodingImpl>(m_draft_pipeline);
345357
m_main_eagle_pipeline->set_hidden_state_export_needed(true);
@@ -348,6 +360,7 @@ void ContinuousBatchingPipeline::Eagle3DecodingImpl::update_eagle_pipeline_param
348360
m_draft_eagle_pipeline->set_hidden_state_internal_needed(true);
349361
m_draft_eagle_pipeline->set_adjust_factor(
350362
m_hidden_layers_to_abstract.size() > 0 ? m_hidden_layers_to_abstract.size() : 1);
363+
m_draft_eagle_pipeline->set_d2t_for_draft_decoding(d2t_tensor);
351364
}
352365

353366
GenerationHandle

src/cpp/src/speculative_decoding/speculative_decoding_eagle3_impl.hpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,8 @@ class ContinuousBatchingPipeline::Eagle3DecodingImpl : public ContinuousBatching
4444
GenerationHandle add_request(uint64_t request_id,
4545
const std::string& prompt,
4646
ov::genai::GenerationConfig sampling_params) override;
47-
48-
void set_d2t_for_draft_decoding(std::shared_ptr<ov::op::v0::Constant>& d2t_tensor) {
49-
auto eagle_impl = std::static_pointer_cast<ContinuousBatchingForEagle3DecodingImpl>(m_draft_pipeline);
50-
eagle_impl->set_d2t_for_draft_decoding(d2t_tensor);
51-
};
5247
protected:
53-
void update_eagle_pipeline_params();
48+
void update_eagle_pipeline_params(std::shared_ptr<ov::op::v0::Constant>& d2t_tensor);
5449
ov::Tensor create_draft_input_ids(const ov::Tensor& original_input_ids);
5550
std::vector<int> m_hidden_layers_to_abstract;
5651
};

0 commit comments

Comments
 (0)