From a4f09c74d0992d29cbec77d14319d078d4613bd4 Mon Sep 17 00:00:00 2001 From: Ekaterina Shiryaeva Date: Wed, 26 Mar 2025 12:40:46 +0000 Subject: [PATCH 01/15] Switch NPU Whisper to ov::genai::WhisperStatefulImpl --- src/cpp/src/whisper/models/decoder.cpp | 3 ++ .../src/whisper/models/statefull_decoder.cpp | 17 ++++++++++- .../src/whisper/models/statefull_decoder.hpp | 1 + src/cpp/src/whisper/pipeline.cpp | 30 +++++++++++++++++-- 4 files changed, 47 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp index 9f6cc9224f..4aa733f678 100644 --- a/src/cpp/src/whisper/models/decoder.cpp +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -16,6 +16,9 @@ std::shared_ptr WhisperDecoder::from_path(const std::filesystem: bool has_decoder_with_past = std::filesystem::exists(models_path / "openvino_decoder_with_past_model.xml"); if (has_decoder_with_past) { + if (device == "NPU") { + OPENVINO_THROW("Please use STATIC_PIPELINE config option for NPU for 3-model whisper pipeline."); + } return std::make_shared(models_path, device, properties); } diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index 802e67a9c2..aaaaa3a826 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -15,7 +15,22 @@ WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& mo utils::apply_slice_before_matmul_transformation(model); - auto compiled_model = core.compile_model(model, device, properties); + if (device.find("NPU") != std::string::npos) { + m_is_npu = true; + } + + ov::CompiledModel compiled_model; + if (m_is_npu) { + auto kv_pos = ov::genai::utils::get_kv_axes_pos(model); + + utils::KVDesc kv_desc; + // Check max_prompt_length and min_response_length + std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu( + model, properties, kv_pos, models_path / "openvino_model.xml" + ); + } else { + compiled_model = core.compile_model(model, device, properties); + } utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); m_request = compiled_model.create_infer_request(); diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp index 6cfeebb6cf..24c421861c 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.hpp +++ b/src/cpp/src/whisper/models/statefull_decoder.hpp @@ -27,5 +27,6 @@ class WhisperStatefullDecoder : public WhisperDecoder { private: ov::InferRequest m_request; + bool m_is_npu = false; }; } // namespace ov::genai diff --git a/src/cpp/src/whisper/pipeline.cpp b/src/cpp/src/whisper/pipeline.cpp index 3e1d2483db..339be635fe 100644 --- a/src/cpp/src/whisper/pipeline.cpp +++ b/src/cpp/src/whisper/pipeline.cpp @@ -42,6 +42,23 @@ ov::InferRequest init_model(ov::CompiledModel& compiled) { } } +void reshape_to_static_encoder(std::shared_ptr model, const size_t feature_size) { + std::map new_shapes; + for (auto input : model->inputs()) { + const auto& input_name = input.get_any_name(); + ov::PartialShape new_shape; + if (input_name.find("input_features") != std::string::npos) { + const auto& partial_shape = input.get_partial_shape(); + OPENVINO_ASSERT(partial_shape.size() >= 3); + new_shape = partial_shape; + new_shape[0] = 1; // batch_dim + new_shape[1] = feature_size; + } + new_shapes.emplace(input_name, new_shape); + } + model->reshape(new_shapes); +} + } // namespace namespace ov { @@ -55,9 +72,16 @@ class 
WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi : WhisperPipelineImplBase{models_path}, m_sampler(m_tokenizer) { ov::Core core = utils::singleton_core(); + ov::CompiledModel compiled_model; + if (device == "NPU") { + auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, properties); + // NB: only batch_size == 1 is supported now for NPU + reshape_to_static_encoder(encoder_model, m_feature_extractor.feature_size); + compiled_model = core.compile_model(encoder_model, "NPU", properties); + } else { + compiled_model = core.compile_model(models_path / "openvino_encoder_model.xml", device, properties); + } - ov::CompiledModel compiled_model = - core.compile_model(models_path / "openvino_encoder_model.xml", device, properties); ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper encoder model"); m_encoder = init_model(compiled_model); @@ -154,7 +178,7 @@ ov::genai::WhisperPipeline::WhisperPipeline(const std::filesystem::path& models_ const std::string& device, const ov::AnyMap& properties) { auto start_time = std::chrono::steady_clock::now(); - if (device == "NPU") { + if (device == "NPU" && properties.count("STATIC_PIPELINE")) { m_impl = std::make_unique(models_path, properties); } else { m_impl = std::make_unique(models_path, device, properties); From 1ba556080be281601a8e246f31e008fc750f358e Mon Sep 17 00:00:00 2001 From: Ekaterina Shiryaeva Date: Wed, 4 Jun 2025 17:47:11 +0100 Subject: [PATCH 02/15] Update Whisper stateful pipeline for NPU --- src/cpp/src/utils.cpp | 31 ++++++++++++++++--- src/cpp/src/utils.hpp | 3 +- src/cpp/src/whisper/models/decoder.cpp | 5 +-- src/cpp/src/whisper/models/decoder.hpp | 3 +- .../src/whisper/models/statefull_decoder.cpp | 30 +++++++++++------- .../src/whisper/models/statefull_decoder.hpp | 4 +-- src/cpp/src/whisper/pipeline.cpp | 4 +-- src/cpp/src/whisper/whisper.cpp | 5 +-- 8 files changed, 58 insertions(+), 27 deletions(-) diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index c23131b4f1..121c61d6c6 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -74,7 +74,6 @@ std::optional pop_int_and_cast(ov::AnyMap& config, const std::string& } void update_npu_config(ov::AnyMap& config, - const std::shared_ptr& model, const ov::genai::utils::KVAxesPosition& kv_pos, const ov::genai::utils::KVDesc& kv_desc) { update_config(config, {"NPU_USE_NPUW", "YES"}); @@ -97,6 +96,20 @@ void update_npu_config(ov::AnyMap& config, rename_key(config, "++SHARED_HEAD_CONFIG", "++NPUW_LLM_SHARED_HEAD_CONFIG"); } +void update_npu_config_whisper(ov::AnyMap& config, + const ov::genai::utils::KVAxesPosition& kv_pos, + const ov::genai::utils::KVDesc& kv_desc) { + update_config(config, {"NPU_USE_NPUW", "YES"}); + update_config(config, {"NPUW_ONLINE_PIPELINE", "NONE"}); + update_config(config, {"NPUW_LLM", "YES"}); + + update_config(config, {"NPUW_LLM_BATCH_DIM", kv_pos.batch}); + update_config(config, {"NPUW_LLM_SEQ_LEN_DIM", kv_pos.seq_len}); + + update_config(config, {"NPUW_LLM_MAX_PROMPT_LEN", kv_desc.max_prompt_len}); + update_config(config, {"NPUW_LLM_MIN_RESPONSE_LEN", kv_desc.min_response_len}); +} + inline bool is_paged_attention_available() { #if defined(OPENVINO_ARCH_X86_64) || defined(OPENVINO_ARCH_ARM64) return true; @@ -546,7 +559,8 @@ void print_gguf_debug_info(const std::string &debug_info) { std::pair compile_decoder_for_npu(const std::shared_ptr& model, const ov::AnyMap& config, - const KVAxesPosition& kv_pos) { + const KVAxesPosition& kv_pos, + const bool is_whisper) { 
ov::CompiledModel compiled; ov::AnyMap properties = config; KVDesc kv_desc; @@ -567,9 +581,16 @@ compile_decoder_for_npu(const std::shared_ptr& model, kv_desc.max_prompt_len = compiled.get_property("NPUW_LLM_MAX_PROMPT_LEN").as(); kv_desc.min_response_len = compiled.get_property("NPUW_LLM_MIN_RESPONSE_LEN").as(); } else { - kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u); - kv_desc.min_response_len = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u); - update_npu_config(properties, model, kv_pos, kv_desc); + if (is_whisper) { + kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(4u); + // kvcache size for Whisper = 448u (MAX_PROMPT_LEN + MIN_RESPONSE_LEN) + kv_desc.min_response_len = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(444u); + update_npu_config_whisper(properties, kv_pos, kv_desc); + } else { + kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u); + kv_desc.min_response_len = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u); + update_npu_config(properties, kv_pos, kv_desc); + } compiled = ov::genai::utils::singleton_core().compile_model(model, "NPU", properties); // Also export compiled model if required if (export_blob) { diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 9bcfd1361f..4000acbcc7 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -190,7 +190,8 @@ struct KVDesc { std::pair compile_decoder_for_npu(const std::shared_ptr& model, const ov::AnyMap& config, - const KVAxesPosition& kv_pos); + const KVAxesPosition& kv_pos, + const bool is_whisper = false); /// @brief SharedOptional is a wrapper around a reference to an existing object and an optional shared alternative value. /// The difference from std::optional is that the default state is not empty and contains a reference to an existing object outside the class. 
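For context, a short sketch of the decoder compile properties that the new update_npu_config_whisper() path effectively produces on NPU. This is not part of the patch: `core`, `decoder_model` and `kv_pos` are assumed from the surrounding decoder code, and the length values are simply the Whisper defaults introduced above.

    // Sketch of the effective NPUW property set for the stateful Whisper decoder:
    ov::AnyMap npuw_props = {
        {"NPU_USE_NPUW", "YES"},
        {"NPUW_ONLINE_PIPELINE", "NONE"},
        {"NPUW_LLM", "YES"},
        {"NPUW_LLM_BATCH_DIM", kv_pos.batch},        // from get_kv_axes_pos(model)
        {"NPUW_LLM_SEQ_LEN_DIM", kv_pos.seq_len},
        {"NPUW_LLM_MAX_PROMPT_LEN", 4u},             // enough for the forced decoder prefix
                                                     // (<|startoftranscript|>, language, task, <|notimestamps|>)
        {"NPUW_LLM_MIN_RESPONSE_LEN", 444u},         // 4 + 444 == 448, Whisper's max_target_positions
    };
    ov::CompiledModel compiled = core.compile_model(decoder_model, "NPU", npuw_props);

The 4/444 split therefore fixes the KV-cache at 448 tokens, which is the Whisper decoder's maximum sequence length; user-supplied MAX_PROMPT_LEN / MIN_RESPONSE_LEN override these defaults.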
diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp index 4aa733f678..26010c9d09 100644 --- a/src/cpp/src/whisper/models/decoder.cpp +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -12,7 +12,8 @@ namespace ov::genai { std::shared_ptr WhisperDecoder::from_path(const std::filesystem::path& models_path, const std::string& device, - const ov::AnyMap& properties) { + const ov::AnyMap& properties, + const ov::PartialShape& lhs_shape) { bool has_decoder_with_past = std::filesystem::exists(models_path / "openvino_decoder_with_past_model.xml"); if (has_decoder_with_past) { @@ -22,7 +23,7 @@ std::shared_ptr WhisperDecoder::from_path(const std::filesystem: return std::make_shared(models_path, device, properties); } - return std::make_shared(models_path, device, properties); + return std::make_shared(models_path, device, properties, lhs_shape); } std::pair WhisperDecoder::detect_language(const ov::Tensor& encoder_hidden_state, diff --git a/src/cpp/src/whisper/models/decoder.hpp b/src/cpp/src/whisper/models/decoder.hpp index 3bdcf4d347..42bc9c39d8 100644 --- a/src/cpp/src/whisper/models/decoder.hpp +++ b/src/cpp/src/whisper/models/decoder.hpp @@ -13,7 +13,8 @@ class WhisperDecoder { public: static std::shared_ptr from_path(const std::filesystem::path& models_path, const std::string& device, - const ov::AnyMap& properties); + const ov::AnyMap& properties, + const ov::PartialShape& lhs_shape); std::pair detect_language(const Tensor& encoder_hidden_state, const int64_t decoder_start_token_id); diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index aaaaa3a826..38d1561139 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -5,30 +5,36 @@ #include "utils.hpp" +namespace { +void reshape_hidden_states_to_static(std::shared_ptr model, const ov::PartialShape& lhstates_shape) { + ov::PartialShape new_shape = model->input("encoder_hidden_states").get_partial_shape(); + new_shape[1] = lhstates_shape[1]; + std::map name_to_shape{{"encoder_hidden_states", new_shape}}; + model->reshape(name_to_shape); +} + +} // anonymous + namespace ov::genai { WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& models_path, const std::string& device, - const ov::AnyMap& properties) { + const ov::AnyMap& properties, + const ov::PartialShape& lhs_shape) { ov::Core core = utils::singleton_core(); auto model = core.read_model(models_path / "openvino_decoder_model.xml", {}, properties); - utils::apply_slice_before_matmul_transformation(model); - - if (device.find("NPU") != std::string::npos) { - m_is_npu = true; - } - ov::CompiledModel compiled_model; - if (m_is_npu) { + if (device == "NPU") { auto kv_pos = ov::genai::utils::get_kv_axes_pos(model); + reshape_hidden_states_to_static(model, lhs_shape); + utils::KVDesc kv_desc; - // Check max_prompt_length and min_response_length - std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu( - model, properties, kv_pos, models_path / "openvino_model.xml" - ); + std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu(model, properties, kv_pos, true); } else { + utils::apply_slice_before_matmul_transformation(model); + compiled_model = core.compile_model(model, device, properties); } diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp index 24c421861c..13bfc5792e 100644 --- 
a/src/cpp/src/whisper/models/statefull_decoder.hpp +++ b/src/cpp/src/whisper/models/statefull_decoder.hpp @@ -12,7 +12,8 @@ class WhisperStatefullDecoder : public WhisperDecoder { public: WhisperStatefullDecoder(const std::filesystem::path& models_path, const std::string& device, - const ov::AnyMap& properties); + const ov::AnyMap& properties, + const ov::PartialShape& lhs_shape); void start_async(const Tensor& encoder_hidden_state, const Tensor& input_ids, const Tensor& beam_idx) override; @@ -27,6 +28,5 @@ class WhisperStatefullDecoder : public WhisperDecoder { private: ov::InferRequest m_request; - bool m_is_npu = false; }; } // namespace ov::genai diff --git a/src/cpp/src/whisper/pipeline.cpp b/src/cpp/src/whisper/pipeline.cpp index 339be635fe..0a15a6d03c 100644 --- a/src/cpp/src/whisper/pipeline.cpp +++ b/src/cpp/src/whisper/pipeline.cpp @@ -77,7 +77,7 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, properties); // NB: only batch_size == 1 is supported now for NPU reshape_to_static_encoder(encoder_model, m_feature_extractor.feature_size); - compiled_model = core.compile_model(encoder_model, "NPU", properties); + compiled_model = core.compile_model(encoder_model, "NPU"); } else { compiled_model = core.compile_model(models_path / "openvino_encoder_model.xml", device, properties); } @@ -85,7 +85,7 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper encoder model"); m_encoder = init_model(compiled_model); - m_decoder = WhisperDecoder::from_path(models_path, device, properties); + m_decoder = WhisperDecoder::from_path(models_path, device, properties, m_encoder.get_compiled_model().output("last_hidden_state").get_partial_shape()); // If eos_token_id was not provided, take value if (m_generation_config.eos_token_id == -1) { diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 70c1f84152..785e389b8e 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -202,7 +202,6 @@ ov::Tensor encode(ov::InferRequest& request, ". Actual size: ", mel_data.size(), "."); - ov::Tensor input_tensor(ov::element::f32, {1, feature_size, nb_max_frames}, mel_data.data()); request.set_tensor("input_features", input_tensor); @@ -213,7 +212,9 @@ ov::Tensor encode(ov::InferRequest& request, raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); // reset input tensor - request.set_tensor("input_features", ov::Tensor(ov::element::f32, {0, feature_size, nb_max_frames})); + auto m_is_npu = true; + uint8_t batch_size = m_is_npu ? 
1 : 0; + request.set_tensor("input_features", ov::Tensor(ov::element::f32, {batch_size, feature_size, nb_max_frames})); return request.get_tensor("last_hidden_state"); } From 7ac58ec2229f038b6564f35d0491e5683c84c908 Mon Sep 17 00:00:00 2001 From: Ekaterina Shiryaeva Date: Fri, 11 Jul 2025 15:40:15 +0100 Subject: [PATCH 03/15] Fixes for Whisper stateful/static pipelines - fix encode reset based on used device in stateful pipeline - fix STATIC_PIPELINE property option usage for Whisper static --- src/cpp/src/whisper/pipeline_static.cpp | 18 +++++++++++------- src/cpp/src/whisper/whisper.cpp | 4 ++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/cpp/src/whisper/pipeline_static.cpp b/src/cpp/src/whisper/pipeline_static.cpp index 15f5a95de9..6193f47d1a 100644 --- a/src/cpp/src/whisper/pipeline_static.cpp +++ b/src/cpp/src/whisper/pipeline_static.cpp @@ -990,7 +990,11 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys , m_sampler(m_tokenizer) { ov::Core core = utils::singleton_core(); - auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, properties); + // Remove "STATIC_PIPELINE" as we don't need to pass it further + auto model_properties = properties; + utils::pop_option(model_properties, "STATIC_PIPELINE"); + + auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, model_properties); reshape_to_static_encoder(encoder_model, m_feature_extractor.feature_size); auto last_hidden_state_shape = get_encoder_hidden_state_shape(encoder_model); @@ -998,10 +1002,10 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys std::shared_ptr decoder_with_past_model; if (std::filesystem::exists(models_path / "openvino_decoder_with_past_model.xml") ) { - decoder_model = core.read_model(models_path / "openvino_decoder_model.xml", {}, properties); - decoder_with_past_model = core.read_model(models_path / "openvino_decoder_with_past_model.xml", {}, properties); + decoder_model = core.read_model(models_path / "openvino_decoder_model.xml", {}, model_properties); + decoder_with_past_model = core.read_model(models_path / "openvino_decoder_with_past_model.xml", {}, model_properties); } else { - auto model = core.read_model(models_path / "openvino_decoder_model.xml", {}, properties); + auto model = core.read_model(models_path / "openvino_decoder_model.xml", {}, model_properties); ov::pass::StatefulToStateless().run_on_model(model); decoder_model = prepare_decoder_model(model); @@ -1030,15 +1034,15 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys preprocess_decoder(decoder_with_past_model); ov::CompiledModel compiled_model; - compiled_model = core.compile_model(encoder_model, "NPU", properties); + compiled_model = core.compile_model(encoder_model, "NPU", model_properties); ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper encoder model"); m_models.encoder = compiled_model.create_infer_request(); - compiled_model = core.compile_model(decoder_with_past_model, "NPU", properties); + compiled_model = core.compile_model(decoder_with_past_model, "NPU", model_properties); ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper decoder with past model"); m_models.decoder_with_past = compiled_model.create_infer_request(); - compiled_model = core.compile_model(decoder_model, "NPU", properties); + compiled_model = core.compile_model(decoder_model, "NPU", model_properties); 
ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper decoder model"); m_models.decoder = compiled_model.create_infer_request(); diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 785e389b8e..76252d4d39 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -212,8 +212,8 @@ ov::Tensor encode(ov::InferRequest& request, raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); // reset input tensor - auto m_is_npu = true; - uint8_t batch_size = m_is_npu ? 1 : 0; + auto devices = request.get_compiled_model().get_property(ov::execution_devices); + uint8_t batch_size = (devices[0] == "NPU") ? 1 : 0; request.set_tensor("input_features", ov::Tensor(ov::element::f32, {batch_size, feature_size, nb_max_frames})); return request.get_tensor("last_hidden_state"); From 2c31ff4c00ea8c0bf461646f136dd93597aabb97 Mon Sep 17 00:00:00 2001 From: Ekaterina Shiryaeva Date: Mon, 11 Aug 2025 11:54:31 +0100 Subject: [PATCH 04/15] Fix whisper decoder config --- src/cpp/src/utils.cpp | 5 +++++ src/cpp/src/whisper/pipeline.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 121c61d6c6..460abf28a4 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -101,6 +101,8 @@ void update_npu_config_whisper(ov::AnyMap& config, const ov::genai::utils::KVDesc& kv_desc) { update_config(config, {"NPU_USE_NPUW", "YES"}); update_config(config, {"NPUW_ONLINE_PIPELINE", "NONE"}); + update_config(config, {"NPUW_FUNCALL_FOR_ALL", "NO"}); + update_config(config, {"NPUW_FOLD", "NO"}); update_config(config, {"NPUW_LLM", "YES"}); update_config(config, {"NPUW_LLM_BATCH_DIM", kv_pos.batch}); @@ -108,6 +110,9 @@ void update_npu_config_whisper(ov::AnyMap& config, update_config(config, {"NPUW_LLM_MAX_PROMPT_LEN", kv_desc.max_prompt_len}); update_config(config, {"NPUW_LLM_MIN_RESPONSE_LEN", kv_desc.min_response_len}); + + // To disable chunking + update_config(config, {"NPUW_LLM_PREFILL_HINT", "STATIC"}); } inline bool is_paged_attention_available() { diff --git a/src/cpp/src/whisper/pipeline.cpp b/src/cpp/src/whisper/pipeline.cpp index 0a15a6d03c..ac8bdf99ca 100644 --- a/src/cpp/src/whisper/pipeline.cpp +++ b/src/cpp/src/whisper/pipeline.cpp @@ -77,7 +77,7 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, properties); // NB: only batch_size == 1 is supported now for NPU reshape_to_static_encoder(encoder_model, m_feature_extractor.feature_size); - compiled_model = core.compile_model(encoder_model, "NPU"); + compiled_model = core.compile_model(encoder_model, "NPU", properties); } else { compiled_model = core.compile_model(models_path / "openvino_encoder_model.xml", device, properties); } From 9faf09a2220ed352868a8921608fbefd0fb4ecc3 Mon Sep 17 00:00:00 2001 From: Ekaterina Shiryaeva Date: Sun, 12 Oct 2025 20:24:19 +0100 Subject: [PATCH 05/15] Added new NPUW_WHISPER option --- src/cpp/src/utils.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 460abf28a4..fe809b83af 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -104,6 +104,7 @@ void update_npu_config_whisper(ov::AnyMap& config, update_config(config, {"NPUW_FUNCALL_FOR_ALL", "NO"}); update_config(config, {"NPUW_FOLD", "NO"}); update_config(config, {"NPUW_LLM", "YES"}); + update_config(config, {"NPUW_WHISPER", "YES"}); 
update_config(config, {"NPUW_LLM_BATCH_DIM", kv_pos.batch}); update_config(config, {"NPUW_LLM_SEQ_LEN_DIM", kv_pos.seq_len}); From 935e63d0183c67e07ad02f3f0711396fe1bb4441 Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Fri, 17 Oct 2025 22:05:26 +0200 Subject: [PATCH 06/15] Fixed missed `cache_position` input in StaticWhisperPipeline and WhisperStatefulImpl, fixed boolean attention_mask handling in StaticWhisperPipeline --- src/cpp/src/whisper/models/decoder.cpp | 3 +- .../src/whisper/models/statefull_decoder.cpp | 14 ++- .../src/whisper/models/statefull_decoder.hpp | 3 +- src/cpp/src/whisper/pipeline.cpp | 20 ++-- src/cpp/src/whisper/pipeline_static.cpp | 93 ++++++++++++++----- src/cpp/src/whisper/whisper.cpp | 3 +- src/cpp/src/whisper/whisper_utils.cpp | 10 +- src/cpp/src/whisper/whisper_utils.hpp | 4 +- 8 files changed, 114 insertions(+), 36 deletions(-) diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp index 26010c9d09..83b4b729ce 100644 --- a/src/cpp/src/whisper/models/decoder.cpp +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -18,7 +18,8 @@ std::shared_ptr WhisperDecoder::from_path(const std::filesystem: if (has_decoder_with_past) { if (device == "NPU") { - OPENVINO_THROW("Please use STATIC_PIPELINE config option for NPU for 3-model whisper pipeline."); + OPENVINO_THROW("For NPU, 3-model whisper pipeline works only with STATIC_PIPELINE : YES configuration " + "(which is default for NPU)."); } return std::make_shared(models_path, device, properties); } diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index 38d1561139..a7f3313025 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -1,13 +1,15 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "statefull_decoder.hpp" #include "utils.hpp" +#include "whisper/whisper_utils.hpp" namespace { void reshape_hidden_states_to_static(std::shared_ptr model, const ov::PartialShape& lhstates_shape) { ov::PartialShape new_shape = model->input("encoder_hidden_states").get_partial_shape(); + OPENVINO_ASSERT(new_shape.size() > 1 && lhstates_shape.size() > 1); new_shape[1] = lhstates_shape[1]; std::map name_to_shape{{"encoder_hidden_states", new_shape}}; model->reshape(name_to_shape); @@ -24,6 +26,8 @@ WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& mo auto model = core.read_model(models_path / "openvino_decoder_model.xml", {}, properties); + m_has_cache_position = ov::genai::utils::input_exists(model, "cache_position"); + ov::CompiledModel compiled_model; if (device == "NPU") { auto kv_pos = ov::genai::utils::get_kv_axes_pos(model); @@ -50,7 +54,9 @@ void WhisperStatefullDecoder::start_async(const Tensor& encoder_hidden_state, _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size, m_request); - _set_cache_position_tensor(seq_len); + if (m_has_cache_position) { + _set_cache_position_tensor(seq_len); + } m_request.set_tensor("input_ids", input_ids); m_request.set_tensor("beam_idx", beam_idx); @@ -79,7 +85,9 @@ Tensor WhisperStatefullDecoder::wait() { void WhisperStatefullDecoder::reset_state() { m_request.reset_state(); - m_request.set_tensor("cache_position", create_host_tensor(ov::element::i64, {0})); + if (m_has_cache_position) { + m_request.set_tensor("cache_position", create_host_tensor(ov::element::i64, {0})); + } Shape 
encoder_hidden_states_shape{m_request.get_tensor("encoder_hidden_states").get_shape()}; encoder_hidden_states_shape[0] = 0; diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp index 13bfc5792e..b0fbff2131 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.hpp +++ b/src/cpp/src/whisper/models/statefull_decoder.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once @@ -28,5 +28,6 @@ class WhisperStatefullDecoder : public WhisperDecoder { private: ov::InferRequest m_request; + bool m_has_cache_position = true; }; } // namespace ov::genai diff --git a/src/cpp/src/whisper/pipeline.cpp b/src/cpp/src/whisper/pipeline.cpp index ac8bdf99ca..a3c18fe0ce 100644 --- a/src/cpp/src/whisper/pipeline.cpp +++ b/src/cpp/src/whisper/pipeline.cpp @@ -42,7 +42,9 @@ ov::InferRequest init_model(ov::CompiledModel& compiled) { } } -void reshape_to_static_encoder(std::shared_ptr model, const size_t feature_size) { +void reshape_to_static_encoder(std::shared_ptr model, + const size_t batch_size, + const size_t feature_size) { std::map new_shapes; for (auto input : model->inputs()) { const auto& input_name = input.get_any_name(); @@ -51,10 +53,10 @@ void reshape_to_static_encoder(std::shared_ptr model, const size_t fe const auto& partial_shape = input.get_partial_shape(); OPENVINO_ASSERT(partial_shape.size() >= 3); new_shape = partial_shape; - new_shape[0] = 1; // batch_dim + new_shape[0] = batch_size; // batch_dim new_shape[1] = feature_size; + new_shapes.emplace(input_name, new_shape); } - new_shapes.emplace(input_name, new_shape); } model->reshape(new_shapes); } @@ -76,7 +78,7 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi if (device == "NPU") { auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, properties); // NB: only batch_size == 1 is supported now for NPU - reshape_to_static_encoder(encoder_model, m_feature_extractor.feature_size); + reshape_to_static_encoder(encoder_model, 1, m_feature_extractor.feature_size); compiled_model = core.compile_model(encoder_model, "NPU", properties); } else { compiled_model = core.compile_model(models_path / "openvino_encoder_model.xml", device, properties); @@ -178,8 +180,14 @@ ov::genai::WhisperPipeline::WhisperPipeline(const std::filesystem::path& models_ const std::string& device, const ov::AnyMap& properties) { auto start_time = std::chrono::steady_clock::now(); - if (device == "NPU" && properties.count("STATIC_PIPELINE")) { - m_impl = std::make_unique(models_path, properties); + if (device == "NPU") { + auto properties_copy = properties; + const bool use_static_pipeline = utils::pop_or_default(properties_copy, "STATIC_PIPELINE", true); + if (!use_static_pipeline) { + m_impl = std::make_unique(models_path, device, properties_copy); + } else { + m_impl = std::make_unique(models_path, properties_copy); + } } else { m_impl = std::make_unique(models_path, device, properties); } diff --git a/src/cpp/src/whisper/pipeline_static.cpp b/src/cpp/src/whisper/pipeline_static.cpp index 6193f47d1a..8dfd83cd30 100644 --- a/src/cpp/src/whisper/pipeline_static.cpp +++ b/src/cpp/src/whisper/pipeline_static.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "whisper/pipeline_static.hpp" @@ -18,6 +18,7 @@ #include 
"openvino/core/preprocess/pre_post_process.hpp" #include "openvino/pass/pattern/matcher.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/optional.hpp" #include "openvino/pass/graph_rewrite.hpp" #include "openvino/pass/manager.hpp" #include "openvino/op/range.hpp" @@ -407,7 +408,8 @@ void add_attention_mask_input(std::shared_ptr model) { auto unsqueeze1 = wrap_type({range, any_input()}); auto unsqueeze2 = wrap_type({unsqueeze1, any_input()}); auto unsqueeze3 = wrap_type({unsqueeze2, any_input()}); - auto lessequal = wrap_type({unsqueeze3, any_input()}); + auto opt_convert = optional({unsqueeze3->output(0)}); + auto lessequal = wrap_type({opt_convert, any_input()}); register_matcher(std::make_shared(lessequal, this->get_type_info().name), [model](Matcher& m) { auto node = m.get_match_root(); @@ -470,13 +472,6 @@ void add_attention_mask_input(std::shared_ptr model, bool transform_c attention_mask->get_output_tensor(0).set_names({"attention_mask"}); model->add_parameters({attention_mask}); - auto slice = self_attn_nodes[0]->input(kAttnMaskPort).get_source_output().get_node(); - auto cvt = std::make_shared(attention_mask->output(0), ov::element::f32); - auto add = std::make_shared(slice->output(0), cvt->output(0)); - - auto trps = std::make_shared(cvt->output(0), v0::Constant::create(ov::element::i32, ov::Shape{2}, std::vector{1, 0})); - auto mtpl = std::make_shared(trps->output(0), add->output(0)); - auto cst_ninf = std::make_shared( ov::element::f32, ov::Shape{1}, @@ -493,6 +488,18 @@ void add_attention_mask_input(std::shared_ptr model, bool transform_c std::vector{0} ); + auto slice = self_attn_nodes[0]->input(kAttnMaskPort).get_source_output().get_node_shared_ptr(); + std::shared_ptr slice_f32; + if (slice->get_element_type() == ov::element::boolean) { + slice_f32 = std::make_shared(slice->output(0), cst_0->output(0), cst_ninf->output(0)); + } else { + slice_f32 = slice; + } + auto cvt = std::make_shared(attention_mask->output(0), ov::element::f32); + auto add = std::make_shared(slice_f32->output(0), cvt->output(0)); + auto trps = std::make_shared(cvt->output(0), v0::Constant::create(ov::element::i32, ov::Shape{2}, std::vector{1, 0})); + auto mtpl = std::make_shared(trps->output(0), add->output(0)); + auto equal = std::make_shared(mtpl->output(0), cst_1->output(0)); auto select = std::make_shared( equal->output(0), cst_0->output(0), cst_ninf->output(0) @@ -548,6 +555,46 @@ void add_attention_mask_input(std::shared_ptr model, bool transform_c pm.run_passes(model); } +void add_cache_position_input(std::shared_ptr model) { + using namespace ov::pass::pattern; + using namespace ov::op; + class CachePositionInput : public ov::pass::MatcherPass { + public: + OPENVINO_MATCHER_PASS_RTTI("CachePositionInput"); + + CachePositionInput(std::shared_ptr model) { + auto gather = wrap_type({any_input(), any_input(), any_input()}); + auto add = wrap_type({gather, any_input()}); + auto range = wrap_type({gather, add, any_input()}); + auto unsqueeze = wrap_type({range, any_input()}); + auto tile = wrap_type({unsqueeze, any_input()}); + + register_matcher(std::make_shared(tile, this->get_type_info().name), + [model, unsqueeze](Matcher& m) { + auto& node_to_output = m.get_pattern_value_map(); + auto unsqueeze_node = node_to_output.at(unsqueeze).get_node_shared_ptr(); + auto matched_unsqueeze = std::static_pointer_cast(unsqueeze_node); + + auto cache_position = std::make_shared(ov::element::i64, ov::Shape{1}); + 
cache_position->get_output_tensor(0).set_names({"cache_position"}); + cache_position->set_friendly_name("cache_position"); + model->add_parameters({cache_position}); + // If cache_position input is missed in the model, it means that position is calculated + // by the model itself using fp32 range constructed from the shapes of inputs. + // So operations below this range expect fp32 input. + auto cache_position_f32 = std::make_shared(cache_position, ov::element::f32); + + matched_unsqueeze->input(0).replace_source_output(cache_position_f32->output(0)); + return false; + }); + } + }; + + ov::pass::Manager pm; + pm.register_pass(model); + pm.run_passes(model); + model->validate_nodes_and_infer_types(); +} ov::PartialShape get_encoder_hidden_state_shape(const std::shared_ptr& encoder) { return encoder->output("last_hidden_state").get_partial_shape(); @@ -956,8 +1003,10 @@ std::shared_ptr prepare_decoder_model(std::shared_ptr& mod remove_input_kv_tensors(decoder_model); // 3) Expose all states that requires initialization on the first run as outputs expose_runtime_states_as_outputs(decoder_model); - // 4) Remove cache_position input - remove_cache_position(decoder_model); + // 4) Remove cache_position input if it exists + if (ov::genai::utils::input_exists(decoder_model, "cache_position")) { + remove_cache_position(decoder_model); + } // 5) Normalize output names - should be done in stateful_to_stateless_transformation normalize_output_key_value_names(decoder_model); @@ -972,6 +1021,10 @@ std::shared_ptr prepare_decoder_with_past_model(std::shared_ptrreshape({{"input_ids", ov::PartialShape({-1, 1})}}); decoder_with_past_model->set_friendly_name("Model6"); @@ -990,11 +1043,7 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys , m_sampler(m_tokenizer) { ov::Core core = utils::singleton_core(); - // Remove "STATIC_PIPELINE" as we don't need to pass it further - auto model_properties = properties; - utils::pop_option(model_properties, "STATIC_PIPELINE"); - - auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, model_properties); + auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, properties); reshape_to_static_encoder(encoder_model, m_feature_extractor.feature_size); auto last_hidden_state_shape = get_encoder_hidden_state_shape(encoder_model); @@ -1002,10 +1051,10 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys std::shared_ptr decoder_with_past_model; if (std::filesystem::exists(models_path / "openvino_decoder_with_past_model.xml") ) { - decoder_model = core.read_model(models_path / "openvino_decoder_model.xml", {}, model_properties); - decoder_with_past_model = core.read_model(models_path / "openvino_decoder_with_past_model.xml", {}, model_properties); + decoder_model = core.read_model(models_path / "openvino_decoder_model.xml", {}, properties); + decoder_with_past_model = core.read_model(models_path / "openvino_decoder_with_past_model.xml", {}, properties); } else { - auto model = core.read_model(models_path / "openvino_decoder_model.xml", {}, model_properties); + auto model = core.read_model(models_path / "openvino_decoder_model.xml", {}, properties); ov::pass::StatefulToStateless().run_on_model(model); decoder_model = prepare_decoder_model(model); @@ -1034,15 +1083,15 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys preprocess_decoder(decoder_with_past_model); ov::CompiledModel compiled_model; - compiled_model = 
core.compile_model(encoder_model, "NPU", model_properties); + compiled_model = core.compile_model(encoder_model, "NPU", properties); ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper encoder model"); m_models.encoder = compiled_model.create_infer_request(); - compiled_model = core.compile_model(decoder_with_past_model, "NPU", model_properties); + compiled_model = core.compile_model(decoder_with_past_model, "NPU", properties); ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper decoder with past model"); m_models.decoder_with_past = compiled_model.create_infer_request(); - compiled_model = core.compile_model(decoder_model, "NPU", model_properties); + compiled_model = core.compile_model(decoder_model, "NPU", properties); ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper decoder model"); m_models.decoder = compiled_model.create_infer_request(); diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 76252d4d39..c5ec745c20 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -213,7 +213,8 @@ ov::Tensor encode(ov::InferRequest& request, // reset input tensor auto devices = request.get_compiled_model().get_property(ov::execution_devices); - uint8_t batch_size = (devices[0] == "NPU") ? 1 : 0; + OPENVINO_ASSERT(devices.size() > 0, "No execution devices found!"); + size_t batch_size = (devices[0] == "NPU") ? 1 : 0; request.set_tensor("input_features", ov::Tensor(ov::element::f32, {batch_size, feature_size, nb_max_frames})); return request.get_tensor("last_hidden_state"); diff --git a/src/cpp/src/whisper/whisper_utils.cpp b/src/cpp/src/whisper/whisper_utils.cpp index 3f925c6ead..42cf6fa5fd 100644 --- a/src/cpp/src/whisper/whisper_utils.cpp +++ b/src/cpp/src/whisper/whisper_utils.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "whisper/whisper_utils.hpp" @@ -57,6 +57,14 @@ int64_t argmax(const ov::Tensor& logits, const size_t batch_idx) { return out_token; } +bool input_exists(const std::shared_ptr& model, const std::string& name) { + auto inputs = model->inputs(); + auto it = std::find_if(inputs.begin(), inputs.end(), [&](const auto& port) { + return port.get_names().count(name) != 0; + }); + return it != inputs.end(); +} + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/whisper/whisper_utils.hpp b/src/cpp/src/whisper/whisper_utils.hpp index 8fd0a080c6..ba66d18e0f 100644 --- a/src/cpp/src/whisper/whisper_utils.hpp +++ b/src/cpp/src/whisper/whisper_utils.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once @@ -19,6 +19,8 @@ void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics, int64_t argmax(const ov::Tensor& logits, const size_t batch_idx); +bool input_exists(const std::shared_ptr& model, const std::string& name); + } // namespace utils } // namespace genai } // namespace ov From 9556acd9694d7916a91643949cd1847190d1dbbf Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Mon, 20 Oct 2025 12:05:44 +0200 Subject: [PATCH 07/15] Test new requirements.txt --- tests/python_tests/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index e24bf2e160..28735702e6 100644 --- 
a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,10 +1,10 @@ --extra-index-url https://download.pytorch.org/whl/cpu diffusers==0.35.2 -optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@04db016571d1a19c14918553365ee4c05c8b4697 +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@3130e907fb7960653039d138493cbb075e128f6a numpy==1.26.4; platform_system == "Darwin" and platform_machine == "x86_64" safetensors==0.6.2; platform_system == "Darwin" and platform_machine == "x86_64" pytest==8.4.2 -transformers==4.53.3 +transformers==4.55.4 hf_transfer==0.1.9 gguf==0.17.1 From 14db9c7d8ab78efbbfe5fa97b784c83bc4b16ce1 Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Mon, 20 Oct 2025 19:08:17 +0200 Subject: [PATCH 08/15] Fix for StaticWhisperPipeline to work with transformers_4.55.4 ieline_static.cpp --- src/cpp/src/whisper/pipeline_static.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/cpp/src/whisper/pipeline_static.cpp b/src/cpp/src/whisper/pipeline_static.cpp index 8dfd83cd30..d53f5b2166 100644 --- a/src/cpp/src/whisper/pipeline_static.cpp +++ b/src/cpp/src/whisper/pipeline_static.cpp @@ -579,12 +579,14 @@ void add_cache_position_input(std::shared_ptr model) { cache_position->get_output_tensor(0).set_names({"cache_position"}); cache_position->set_friendly_name("cache_position"); model->add_parameters({cache_position}); - // If cache_position input is missed in the model, it means that position is calculated - // by the model itself using fp32 range constructed from the shapes of inputs. - // So operations below this range expect fp32 input. - auto cache_position_f32 = std::make_shared(cache_position, ov::element::f32); + std::shared_ptr cache_pos_unsqueeze_arg; + if (unsqueeze_node->input(0).get_element_type() == ov::element::f32) { + auto cache_pos_unsqueeze_arg = std::make_shared(cache_position, ov::element::f32); + } else { + cache_pos_unsqueeze_arg = cache_position; + } - matched_unsqueeze->input(0).replace_source_output(cache_position_f32->output(0)); + matched_unsqueeze->input(0).replace_source_output(cache_pos_unsqueeze_arg->output(0)); return false; }); } From 5536a6eee45edc0efb95944717416e6c6127ba09 Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Wed, 22 Oct 2025 20:02:06 +0100 Subject: [PATCH 09/15] Fixed review comments --- src/cpp/src/whisper/pipeline_static.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/whisper/pipeline_static.cpp b/src/cpp/src/whisper/pipeline_static.cpp index d53f5b2166..bc373abea6 100644 --- a/src/cpp/src/whisper/pipeline_static.cpp +++ b/src/cpp/src/whisper/pipeline_static.cpp @@ -581,7 +581,7 @@ void add_cache_position_input(std::shared_ptr model) { model->add_parameters({cache_position}); std::shared_ptr cache_pos_unsqueeze_arg; if (unsqueeze_node->input(0).get_element_type() == ov::element::f32) { - auto cache_pos_unsqueeze_arg = std::make_shared(cache_position, ov::element::f32); + cache_pos_unsqueeze_arg = std::make_shared(cache_position, ov::element::f32); } else { cache_pos_unsqueeze_arg = cache_position; } From 68e16f6e9f18b67f2947ec5a2b511e92c178286c Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 22 Oct 2025 21:12:15 +0200 Subject: [PATCH 10/15] Skip tests --- tests/python_tests/test_whisper_pipeline.py | 45 ++------------------- 1 file changed, 3 insertions(+), 42 deletions(-) diff --git a/tests/python_tests/test_whisper_pipeline.py 
b/tests/python_tests/test_whisper_pipeline.py index 655f527852..68caee5068 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -55,13 +55,11 @@ def get_whisper_models_list(tiny_only=False): # used whisper models are relatively small # cache them in memory to speedup tests @functools.lru_cache() -def read_whisper_model(params, stateful=True): +def read_whisper_model(params): model_id, path = params - if not stateful: - path = pathlib.Path(f"{path}_with_past") if not (path / "openvino_encoder_model.xml").exists(): - save_model(model_id=model_id, tmp_path=path, stateful=stateful) + save_model(model_id=model_id, tmp_path=path) opt_model = retry_request(lambda: OVModelForSpeechSeq2Seq.from_pretrained( path, @@ -93,7 +91,7 @@ def read_whisper_model(params, stateful=True): ) -def save_model(model_id: str, tmp_path: pathlib.Path, stateful=True): +def save_model(model_id: str, tmp_path: pathlib.Path): tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)) ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer( tokenizer, @@ -111,7 +109,6 @@ def save_model(model_id: str, tmp_path: pathlib.Path, stateful=True): model_id, export=True, trust_remote_code=True, - stateful=stateful, compile=False, device="CPU", load_in_8bit=False, @@ -226,9 +223,6 @@ def run_pipeline_with_ref( streamer: typing.Callable[[str], bool] | None = None, ): _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path)) - _, _, _, genai_with_past_pipe = read_whisper_model( - (model_id, tmp_path), stateful=False - ) if type(sample) is np.ndarray and len(sample.shape) == 1: sample = np.expand_dims(sample, 0) @@ -239,12 +233,6 @@ def run_pipeline_with_ref( compare_results(hf_result, genai_result) - genai_with_past_result = run_genai( - genai_with_past_pipe, _sample, generation_config, streamer - ) - - compare_results(hf_result, genai_with_past_result) - def compare_results(hf_result, genai_result): assert genai_result.texts[0] == hf_result["text"] @@ -510,33 +498,6 @@ def test_longform_audio(model_descr, sample_from_dataset): assert "".join(streamer_result) == hf_result["text"] -@pytest.mark.parametrize("model_descr", get_whisper_models_list()) -@pytest.mark.parametrize("sample_from_dataset", [*get_fixture_params_for_n_whisper_dataset_samples(n=2, long_form=True)], indirect=True) -@pytest.mark.precommit -@pytest.mark.xfail(condition=(sys.platform == "darwin"), reason="Ticket - 173169") -def test_longform_audio_with_past(model_descr, sample_from_dataset): - _, _, hf_pipe, genai_pipe = read_whisper_model(model_descr, stateful=True) - - streamer_result = [] - - genai_result = run_genai( - genai_pipe, - sample_from_dataset, - config=ov_genai.WhisperGenerationConfig(return_timestamps=True), - streamer=lambda x: streamer_result.append(x), - ) - - hf_result = run_huggingface( - hf_pipe, - sample_from_dataset, - config=ov_genai.WhisperGenerationConfig(return_timestamps=True), - ) - - compare_results(hf_result, genai_result) - - assert "".join(streamer_result) == hf_result["text"] - - @pytest.mark.parametrize("model_descr", get_whisper_models_list()) @pytest.mark.precommit @pytest.mark.xfail(condition=(sys.platform == "darwin"), reason="Ticket - 173169") From 7e29a70a4eeb775b3e8db0dac68b71e97ee96b29 Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Thu, 23 Oct 2025 12:49:31 +0100 Subject: [PATCH 11/15] Added handling of `cache_position` for stateless Whisper pipeline --- src/cpp/src/utils.cpp | 8 ++++++++ 
src/cpp/src/utils.hpp | 5 +++++ src/cpp/src/whisper/models/statefull_decoder.cpp | 3 +-- src/cpp/src/whisper/models/with_past_decoder.cpp | 11 ++++++++--- src/cpp/src/whisper/models/with_past_decoder.hpp | 1 + src/cpp/src/whisper/pipeline_static.cpp | 10 +++++----- src/cpp/src/whisper/whisper_utils.cpp | 8 -------- src/cpp/src/whisper/whisper_utils.hpp | 2 -- 8 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 13fbbd116a..c26a1830cf 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -840,6 +840,14 @@ void export_model(ov::CompiledModel& compiled_model, const std::filesystem::path out.close(); } +bool has_input(const std::shared_ptr& model, const std::string& name) { + auto inputs = model->inputs(); + auto it = std::find_if(inputs.begin(), inputs.end(), [&](const auto& port) { + return port.get_names().count(name) != 0; + }); + return it != inputs.end(); +} + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 9cba89e05d..14106ef8f7 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -309,6 +309,11 @@ ov::CompiledModel import_model(const std::filesystem::path& blob_path, */ void export_model(ov::CompiledModel& compiled_model, const std::filesystem::path& blob_path); +/** + * @brief Checks if the model has an input with the specified name. + */ +bool has_input(const std::shared_ptr& model, const std::string& name); + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index a7f3313025..3c1a8c4e20 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -4,7 +4,6 @@ #include "statefull_decoder.hpp" #include "utils.hpp" -#include "whisper/whisper_utils.hpp" namespace { void reshape_hidden_states_to_static(std::shared_ptr model, const ov::PartialShape& lhstates_shape) { @@ -26,7 +25,7 @@ WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& mo auto model = core.read_model(models_path / "openvino_decoder_model.xml", {}, properties); - m_has_cache_position = ov::genai::utils::input_exists(model, "cache_position"); + m_has_cache_position = utils::has_input(model, "cache_position"); ov::CompiledModel compiled_model; if (device == "NPU") { diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index 559c96d952..adf72454b5 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -86,8 +86,11 @@ WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& mode "To obtain stateful decoder model use latest `optimum-intel` package:\n" "pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git@main\n" "optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny whisper-tiny"); + ov::Core core = utils::singleton_core(); + m_has_cache_position = utils::has_input(core.read_model(models_path / "openvino_decoder_model.xml"), "cache_position"); + auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); m_request_decoder = compiled_model.create_infer_request(); @@ -110,9 +113,11 @@ void WhisperWithPastDecoder::start_async(const Tensor& 
encoder_hidden_state, request.set_tensor("input_ids", input_ids); if (!is_initial_step) { - ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); - cache_position_tensor.set_shape({1}); - cache_position_tensor.data()[0] = m_cache_position; + if (m_has_cache_position) { + ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); + cache_position_tensor.set_shape({1}); + cache_position_tensor.data()[0] = m_cache_position; + } } _set_past_key_value(beam_idx); diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp index b268903802..47375ffdf2 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.hpp +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -26,6 +26,7 @@ class WhisperWithPastDecoder : public WhisperDecoder { size_t m_cache_position = 0; bool m_initial_past_key_value_set = false; bool m_past_key_value_linked = false; + bool m_has_cache_position = true; void _set_past_key_value(const Tensor& beam_idx); }; diff --git a/src/cpp/src/whisper/pipeline_static.cpp b/src/cpp/src/whisper/pipeline_static.cpp index bc373abea6..bc04f45bbe 100644 --- a/src/cpp/src/whisper/pipeline_static.cpp +++ b/src/cpp/src/whisper/pipeline_static.cpp @@ -1006,7 +1006,7 @@ std::shared_ptr prepare_decoder_model(std::shared_ptr& mod // 3) Expose all states that requires initialization on the first run as outputs expose_runtime_states_as_outputs(decoder_model); // 4) Remove cache_position input if it exists - if (ov::genai::utils::input_exists(decoder_model, "cache_position")) { + if (ov::genai::utils::has_input(decoder_model, "cache_position")) { remove_cache_position(decoder_model); } // 5) Normalize output names - should be done in stateful_to_stateless_transformation @@ -1023,10 +1023,6 @@ std::shared_ptr prepare_decoder_with_past_model(std::shared_ptrreshape({{"input_ids", ov::PartialShape({-1, 1})}}); decoder_with_past_model->set_friendly_name("Model6"); @@ -1066,6 +1062,10 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys if (!decoder_model || !decoder_with_past_model) OPENVINO_THROW("Decoder/decoder_with_past model is not valid !"); + if (!ov::genai::utils::has_input(decoder_with_past_model, "cache_position")) { + add_cache_position_input(decoder_with_past_model); + } + add_attention_mask_input(decoder_model, true /* transform_cross_attn */, last_hidden_state_shape[1].get_length()); // NB: Note, there is no need to transform cross attention for decoder_with_past_model // as it accepts only single token and there can't be any padding. 
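The same guard pattern is reused wherever `cache_position` may be absent, since models exported with newer optimum-intel/transformers may not expose that input. A minimal sketch of the pattern (not part of the patch; `model`, `request` and `next_position` are assumed to exist in the caller, as with m_cache_position in WhisperWithPastDecoder):

    // Only drive cache_position explicitly when the exported model actually has the input.
    const bool has_cache_position = ov::genai::utils::has_input(model, "cache_position");
    if (has_cache_position) {
        ov::Tensor cache_position(ov::element::i64, {1});
        cache_position.data<int64_t>()[0] = next_position;  // current decode position
        request.set_tensor("cache_position", cache_position);
    }

When the input is missing, the position is derived inside the model graph itself (or re-added by add_cache_position_input() for the static pipeline), so no tensor needs to be set.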
diff --git a/src/cpp/src/whisper/whisper_utils.cpp b/src/cpp/src/whisper/whisper_utils.cpp index 42cf6fa5fd..60714059d6 100644 --- a/src/cpp/src/whisper/whisper_utils.cpp +++ b/src/cpp/src/whisper/whisper_utils.cpp @@ -57,14 +57,6 @@ int64_t argmax(const ov::Tensor& logits, const size_t batch_idx) { return out_token; } -bool input_exists(const std::shared_ptr& model, const std::string& name) { - auto inputs = model->inputs(); - auto it = std::find_if(inputs.begin(), inputs.end(), [&](const auto& port) { - return port.get_names().count(name) != 0; - }); - return it != inputs.end(); -} - } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/whisper/whisper_utils.hpp b/src/cpp/src/whisper/whisper_utils.hpp index ba66d18e0f..d7205aa5bd 100644 --- a/src/cpp/src/whisper/whisper_utils.hpp +++ b/src/cpp/src/whisper/whisper_utils.hpp @@ -19,8 +19,6 @@ void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics, int64_t argmax(const ov::Tensor& logits, const size_t batch_idx); -bool input_exists(const std::shared_ptr& model, const std::string& name); - } // namespace utils } // namespace genai } // namespace ov From 6614e94b7f32086cf67e09f206ef0a336b7b73d0 Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Thu, 23 Oct 2025 13:11:52 +0100 Subject: [PATCH 12/15] Fixed model name to check `cache_position` input --- src/cpp/src/whisper/models/with_past_decoder.cpp | 4 ++-- src/cpp/src/whisper/whisper_utils.cpp | 2 +- src/cpp/src/whisper/whisper_utils.hpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index adf72454b5..2b1ec783a7 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -89,12 +89,12 @@ WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& mode ov::Core core = utils::singleton_core(); - m_has_cache_position = utils::has_input(core.read_model(models_path / "openvino_decoder_model.xml"), "cache_position"); - auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); m_request_decoder = compiled_model.create_infer_request(); + m_has_cache_position = + utils::has_input(core.read_model(models_path / "openvino_decoder_with_past_model.xml"), "cache_position"); compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, properties); utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model"); m_request_decoder_with_past = compiled_model.create_infer_request(); diff --git a/src/cpp/src/whisper/whisper_utils.cpp b/src/cpp/src/whisper/whisper_utils.cpp index 60714059d6..3f925c6ead 100644 --- a/src/cpp/src/whisper/whisper_utils.cpp +++ b/src/cpp/src/whisper/whisper_utils.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024-2025 Intel Corporation +// Copyright (C) 2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "whisper/whisper_utils.hpp" diff --git a/src/cpp/src/whisper/whisper_utils.hpp b/src/cpp/src/whisper/whisper_utils.hpp index d7205aa5bd..8fd0a080c6 100644 --- a/src/cpp/src/whisper/whisper_utils.hpp +++ b/src/cpp/src/whisper/whisper_utils.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024-2025 Intel Corporation +// Copyright (C) 2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once From 04c89a218d860e481f707033bd56302ee72b2339 
Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Thu, 23 Oct 2025 13:14:49 +0100 Subject: [PATCH 13/15] Polishing --- src/cpp/src/whisper/models/with_past_decoder.cpp | 4 ++-- src/cpp/src/whisper/models/with_past_decoder.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index 2b1ec783a7..f3c1f3fab5 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -93,7 +93,7 @@ WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& mode utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); m_request_decoder = compiled_model.create_infer_request(); - m_has_cache_position = + m_past_decoder_has_cache_position = utils::has_input(core.read_model(models_path / "openvino_decoder_with_past_model.xml"), "cache_position"); compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, properties); utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model"); @@ -113,7 +113,7 @@ void WhisperWithPastDecoder::start_async(const Tensor& encoder_hidden_state, request.set_tensor("input_ids", input_ids); if (!is_initial_step) { - if (m_has_cache_position) { + if (m_past_decoder_has_cache_position) { ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); cache_position_tensor.set_shape({1}); cache_position_tensor.data()[0] = m_cache_position; diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp index 47375ffdf2..89b804393a 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.hpp +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -26,7 +26,7 @@ class WhisperWithPastDecoder : public WhisperDecoder { size_t m_cache_position = 0; bool m_initial_past_key_value_set = false; bool m_past_key_value_linked = false; - bool m_has_cache_position = true; + bool m_past_decoder_has_cache_position = true; void _set_past_key_value(const Tensor& beam_idx); }; From a303604659c05f997c651abc346f4eb051b8c11a Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Thu, 23 Oct 2025 13:22:19 +0100 Subject: [PATCH 14/15] Fixed review comments --- .../src/whisper/models/with_past_decoder.cpp | 10 ++--- src/cpp/src/whisper/pipeline_static.cpp | 2 +- tests/python_tests/test_whisper_pipeline.py | 45 +++++++++++++++++-- 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index f3c1f3fab5..11c25f2118 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -112,12 +112,10 @@ void WhisperWithPastDecoder::start_async(const Tensor& encoder_hidden_state, _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size, request); request.set_tensor("input_ids", input_ids); - if (!is_initial_step) { - if (m_past_decoder_has_cache_position) { - ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); - cache_position_tensor.set_shape({1}); - cache_position_tensor.data()[0] = m_cache_position; - } + if (!is_initial_step && m_past_decoder_has_cache_position) { + ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); + cache_position_tensor.set_shape({1}); + cache_position_tensor.data()[0] = m_cache_position; } _set_past_key_value(beam_idx); diff --git 
index bc04f45bbe..293e00091a 100644
--- a/src/cpp/src/whisper/pipeline_static.cpp
+++ b/src/cpp/src/whisper/pipeline_static.cpp
@@ -580,7 +580,7 @@ void add_cache_position_input(std::shared_ptr<ov::Model> model) {
     cache_position->set_friendly_name("cache_position");
     model->add_parameters({cache_position});
     std::shared_ptr<ov::Node> cache_pos_unsqueeze_arg;
-    if (unsqueeze_node->input(0).get_element_type() == ov::element::f32) {
+    if (matched_unsqueeze->input(0).get_element_type() == ov::element::f32) {
         cache_pos_unsqueeze_arg = std::make_shared<ov::op::v0::Convert>(cache_position, ov::element::f32);
     } else {
         cache_pos_unsqueeze_arg = cache_position;
diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py
index 68caee5068..655f527852 100644
--- a/tests/python_tests/test_whisper_pipeline.py
+++ b/tests/python_tests/test_whisper_pipeline.py
@@ -55,11 +55,13 @@ def get_whisper_models_list(tiny_only=False):
 # used whisper models are relatively small
 # cache them in memory to speedup tests
 @functools.lru_cache()
-def read_whisper_model(params):
+def read_whisper_model(params, stateful=True):
     model_id, path = params
+    if not stateful:
+        path = pathlib.Path(f"{path}_with_past")
 
     if not (path / "openvino_encoder_model.xml").exists():
-        save_model(model_id=model_id, tmp_path=path)
+        save_model(model_id=model_id, tmp_path=path, stateful=stateful)
 
     opt_model = retry_request(lambda: OVModelForSpeechSeq2Seq.from_pretrained(
         path,
@@ -91,7 +93,7 @@ def read_whisper_model(params):
     )
 
 
-def save_model(model_id: str, tmp_path: pathlib.Path):
+def save_model(model_id: str, tmp_path: pathlib.Path, stateful=True):
     tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_id, trust_remote_code=True))
     ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(
         tokenizer,
@@ -109,6 +111,7 @@ def save_model(model_id: str, tmp_path: pathlib.Path):
         model_id,
         export=True,
         trust_remote_code=True,
+        stateful=stateful,
         compile=False,
         device="CPU",
         load_in_8bit=False,
@@ -223,6 +226,9 @@ def run_pipeline_with_ref(
     streamer: typing.Callable[[str], bool] | None = None,
 ):
     _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path))
+    _, _, _, genai_with_past_pipe = read_whisper_model(
+        (model_id, tmp_path), stateful=False
+    )
 
     if type(sample) is np.ndarray and len(sample.shape) == 1:
         sample = np.expand_dims(sample, 0)
@@ -233,6 +239,12 @@ def run_pipeline_with_ref(
 
         compare_results(hf_result, genai_result)
 
+        genai_with_past_result = run_genai(
+            genai_with_past_pipe, _sample, generation_config, streamer
+        )
+
+        compare_results(hf_result, genai_with_past_result)
+
 
 def compare_results(hf_result, genai_result):
     assert genai_result.texts[0] == hf_result["text"]
@@ -498,6 +510,33 @@ def test_longform_audio(model_descr, sample_from_dataset):
     assert "".join(streamer_result) == hf_result["text"]
 
 
+@pytest.mark.parametrize("model_descr", get_whisper_models_list())
+@pytest.mark.parametrize("sample_from_dataset", [*get_fixture_params_for_n_whisper_dataset_samples(n=2, long_form=True)], indirect=True)
+@pytest.mark.precommit
+@pytest.mark.xfail(condition=(sys.platform == "darwin"), reason="Ticket - 173169")
+def test_longform_audio_with_past(model_descr, sample_from_dataset):
+    _, _, hf_pipe, genai_pipe = read_whisper_model(model_descr, stateful=False)
+
+    streamer_result = []
+
+    genai_result = run_genai(
+        genai_pipe,
+        sample_from_dataset,
+        config=ov_genai.WhisperGenerationConfig(return_timestamps=True),
+        streamer=lambda x: streamer_result.append(x),
+    )
+
+    hf_result = run_huggingface(
+        hf_pipe,
+        sample_from_dataset,
+        config=ov_genai.WhisperGenerationConfig(return_timestamps=True),
+    )
+
+    compare_results(hf_result, genai_result)
+
+    assert "".join(streamer_result) == hf_result["text"]
+
+
 @pytest.mark.parametrize("model_descr", get_whisper_models_list())
 @pytest.mark.precommit
 @pytest.mark.xfail(condition=(sys.platform == "darwin"), reason="Ticket - 173169")

From 816938a9df94e27f7e86a34db19e6ddb64caa1b6 Mon Sep 17 00:00:00 2001
From: "Anastasiya(Asya) Pronina"
Date: Fri, 24 Oct 2025 18:32:34 +0200
Subject: [PATCH 15/15] Reverted update of requirements.txt

---
 tests/python_tests/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index 28735702e6..e24bf2e160 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -1,10 +1,10 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 diffusers==0.35.2
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@3130e907fb7960653039d138493cbb075e128f6a
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@04db016571d1a19c14918553365ee4c05c8b4697
 numpy==1.26.4; platform_system == "Darwin" and platform_machine == "x86_64"
 safetensors==0.6.2; platform_system == "Darwin" and platform_machine == "x86_64"
 pytest==8.4.2
-transformers==4.55.4
+transformers==4.53.3
 hf_transfer==0.1.9
 gguf==0.17.1
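
Illustrative usage sketch (not part of the patch series): the snippet below mirrors how the tests above drive the pipeline from Python. WhisperPipeline, WhisperGenerationConfig(return_timestamps=True) and the result.texts field come from the patched tests; the model directory name, the "CPU" device string and the librosa-based audio loading are assumptions made only for this example.

# Sketch only: model path, device and audio loading are assumptions,
# not something prescribed by the patches.
import librosa                       # assumed helper for reading the audio file
import openvino_genai as ov_genai

models_path = "whisper-tiny-ov"      # assumed directory with the exported OpenVINO Whisper model
raw_speech, _ = librosa.load("sample.wav", sr=16000)   # Whisper expects 16 kHz mono input

pipe = ov_genai.WhisperPipeline(models_path, "CPU")    # another device string, e.g. "NPU", can be passed here
config = ov_genai.WhisperGenerationConfig(return_timestamps=True)

result = pipe.generate(raw_speech.tolist(), config)    # raw speech as a plain list of float samples
print(result.texts[0])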