From a4f09c74d0992d29cbec77d14319d078d4613bd4 Mon Sep 17 00:00:00 2001 From: Ekaterina Shiryaeva Date: Wed, 26 Mar 2025 12:40:46 +0000 Subject: [PATCH 01/15] Switch NPU Whisper to ov::genai::WhisperStatefulImpl --- src/cpp/src/whisper/models/decoder.cpp | 3 ++ .../src/whisper/models/statefull_decoder.cpp | 17 ++++++++++- .../src/whisper/models/statefull_decoder.hpp | 1 + src/cpp/src/whisper/pipeline.cpp | 30 +++++++++++++++++-- 4 files changed, 47 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp index 9f6cc9224f..4aa733f678 100644 --- a/src/cpp/src/whisper/models/decoder.cpp +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -16,6 +16,9 @@ std::shared_ptr WhisperDecoder::from_path(const std::filesystem: bool has_decoder_with_past = std::filesystem::exists(models_path / "openvino_decoder_with_past_model.xml"); if (has_decoder_with_past) { + if (device == "NPU") { + OPENVINO_THROW("Please use STATIC_PIPELINE config option for NPU for 3-model whisper pipeline."); + } return std::make_shared(models_path, device, properties); } diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index 802e67a9c2..aaaaa3a826 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -15,7 +15,22 @@ WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& mo utils::apply_slice_before_matmul_transformation(model); - auto compiled_model = core.compile_model(model, device, properties); + if (device.find("NPU") != std::string::npos) { + m_is_npu = true; + } + + ov::CompiledModel compiled_model; + if (m_is_npu) { + auto kv_pos = ov::genai::utils::get_kv_axes_pos(model); + + utils::KVDesc kv_desc; + // Check max_prompt_length and min_response_length + std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu( + model, properties, kv_pos, models_path / "openvino_model.xml" + ); + } else { + compiled_model = core.compile_model(model, device, properties); + } utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); m_request = compiled_model.create_infer_request(); diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp index 6cfeebb6cf..24c421861c 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.hpp +++ b/src/cpp/src/whisper/models/statefull_decoder.hpp @@ -27,5 +27,6 @@ class WhisperStatefullDecoder : public WhisperDecoder { private: ov::InferRequest m_request; + bool m_is_npu = false; }; } // namespace ov::genai diff --git a/src/cpp/src/whisper/pipeline.cpp b/src/cpp/src/whisper/pipeline.cpp index 3e1d2483db..339be635fe 100644 --- a/src/cpp/src/whisper/pipeline.cpp +++ b/src/cpp/src/whisper/pipeline.cpp @@ -42,6 +42,23 @@ ov::InferRequest init_model(ov::CompiledModel& compiled) { } } +void reshape_to_static_encoder(std::shared_ptr model, const size_t feature_size) { + std::map new_shapes; + for (auto input : model->inputs()) { + const auto& input_name = input.get_any_name(); + ov::PartialShape new_shape; + if (input_name.find("input_features") != std::string::npos) { + const auto& partial_shape = input.get_partial_shape(); + OPENVINO_ASSERT(partial_shape.size() >= 3); + new_shape = partial_shape; + new_shape[0] = 1; // batch_dim + new_shape[1] = feature_size; + } + new_shapes.emplace(input_name, new_shape); + } + model->reshape(new_shapes); +} + } // namespace namespace ov { @@ -55,9 +72,16 @@ class 
WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi : WhisperPipelineImplBase{models_path}, m_sampler(m_tokenizer) { ov::Core core = utils::singleton_core(); + ov::CompiledModel compiled_model; + if (device == "NPU") { + auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, properties); + // NB: only batch_size == 1 is supported now for NPU + reshape_to_static_encoder(encoder_model, m_feature_extractor.feature_size); + compiled_model = core.compile_model(encoder_model, "NPU", properties); + } else { + compiled_model = core.compile_model(models_path / "openvino_encoder_model.xml", device, properties); + } - ov::CompiledModel compiled_model = - core.compile_model(models_path / "openvino_encoder_model.xml", device, properties); ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper encoder model"); m_encoder = init_model(compiled_model); @@ -154,7 +178,7 @@ ov::genai::WhisperPipeline::WhisperPipeline(const std::filesystem::path& models_ const std::string& device, const ov::AnyMap& properties) { auto start_time = std::chrono::steady_clock::now(); - if (device == "NPU") { + if (device == "NPU" && properties.count("STATIC_PIPELINE")) { m_impl = std::make_unique(models_path, properties); } else { m_impl = std::make_unique(models_path, device, properties); From 1ba556080be281601a8e246f31e008fc750f358e Mon Sep 17 00:00:00 2001 From: Ekaterina Shiryaeva Date: Wed, 4 Jun 2025 17:47:11 +0100 Subject: [PATCH 02/15] Update Whisper stateful pipeline for NPU --- src/cpp/src/utils.cpp | 31 ++++++++++++++++--- src/cpp/src/utils.hpp | 3 +- src/cpp/src/whisper/models/decoder.cpp | 5 +-- src/cpp/src/whisper/models/decoder.hpp | 3 +- .../src/whisper/models/statefull_decoder.cpp | 30 +++++++++++------- .../src/whisper/models/statefull_decoder.hpp | 4 +-- src/cpp/src/whisper/pipeline.cpp | 4 +-- src/cpp/src/whisper/whisper.cpp | 5 +-- 8 files changed, 58 insertions(+), 27 deletions(-) diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index c23131b4f1..121c61d6c6 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -74,7 +74,6 @@ std::optional pop_int_and_cast(ov::AnyMap& config, const std::string& } void update_npu_config(ov::AnyMap& config, - const std::shared_ptr& model, const ov::genai::utils::KVAxesPosition& kv_pos, const ov::genai::utils::KVDesc& kv_desc) { update_config(config, {"NPU_USE_NPUW", "YES"}); @@ -97,6 +96,20 @@ void update_npu_config(ov::AnyMap& config, rename_key(config, "++SHARED_HEAD_CONFIG", "++NPUW_LLM_SHARED_HEAD_CONFIG"); } +void update_npu_config_whisper(ov::AnyMap& config, + const ov::genai::utils::KVAxesPosition& kv_pos, + const ov::genai::utils::KVDesc& kv_desc) { + update_config(config, {"NPU_USE_NPUW", "YES"}); + update_config(config, {"NPUW_ONLINE_PIPELINE", "NONE"}); + update_config(config, {"NPUW_LLM", "YES"}); + + update_config(config, {"NPUW_LLM_BATCH_DIM", kv_pos.batch}); + update_config(config, {"NPUW_LLM_SEQ_LEN_DIM", kv_pos.seq_len}); + + update_config(config, {"NPUW_LLM_MAX_PROMPT_LEN", kv_desc.max_prompt_len}); + update_config(config, {"NPUW_LLM_MIN_RESPONSE_LEN", kv_desc.min_response_len}); +} + inline bool is_paged_attention_available() { #if defined(OPENVINO_ARCH_X86_64) || defined(OPENVINO_ARCH_ARM64) return true; @@ -546,7 +559,8 @@ void print_gguf_debug_info(const std::string &debug_info) { std::pair compile_decoder_for_npu(const std::shared_ptr& model, const ov::AnyMap& config, - const KVAxesPosition& kv_pos) { + const KVAxesPosition& kv_pos, + const bool is_whisper) { 
ov::CompiledModel compiled; ov::AnyMap properties = config; KVDesc kv_desc; @@ -567,9 +581,16 @@ compile_decoder_for_npu(const std::shared_ptr& model, kv_desc.max_prompt_len = compiled.get_property("NPUW_LLM_MAX_PROMPT_LEN").as(); kv_desc.min_response_len = compiled.get_property("NPUW_LLM_MIN_RESPONSE_LEN").as(); } else { - kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u); - kv_desc.min_response_len = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u); - update_npu_config(properties, model, kv_pos, kv_desc); + if (is_whisper) { + kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(4u); + // kvcache size for Whisper = 448u (MAX_PROMPT_LEN + MIN_RESPONSE_LEN) + kv_desc.min_response_len = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(444u); + update_npu_config_whisper(properties, kv_pos, kv_desc); + } else { + kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u); + kv_desc.min_response_len = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u); + update_npu_config(properties, kv_pos, kv_desc); + } compiled = ov::genai::utils::singleton_core().compile_model(model, "NPU", properties); // Also export compiled model if required if (export_blob) { diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 9bcfd1361f..4000acbcc7 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -190,7 +190,8 @@ struct KVDesc { std::pair compile_decoder_for_npu(const std::shared_ptr& model, const ov::AnyMap& config, - const KVAxesPosition& kv_pos); + const KVAxesPosition& kv_pos, + const bool is_whisper = false); /// @brief SharedOptional is a wrapper around a reference to an existing object and an optional shared alternative value. /// The difference from std::optional is that the default state is not empty and contains a reference to an existing object outside the class. 
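For context, a short sketch of the decoder compile properties that the new update_npu_config_whisper() path effectively produces on NPU. This is not part of the patch: `core`, `decoder_model` and `kv_pos` are assumed from the surrounding decoder code, and the length values are simply the Whisper defaults introduced above.

    // Sketch of the effective NPUW property set for the stateful Whisper decoder:
    ov::AnyMap npuw_props = {
        {"NPU_USE_NPUW", "YES"},
        {"NPUW_ONLINE_PIPELINE", "NONE"},
        {"NPUW_LLM", "YES"},
        {"NPUW_LLM_BATCH_DIM", kv_pos.batch},        // from get_kv_axes_pos(model)
        {"NPUW_LLM_SEQ_LEN_DIM", kv_pos.seq_len},
        {"NPUW_LLM_MAX_PROMPT_LEN", 4u},             // enough for the forced decoder prefix
                                                     // (<|startoftranscript|>, language, task, <|notimestamps|>)
        {"NPUW_LLM_MIN_RESPONSE_LEN", 444u},         // 4 + 444 == 448, Whisper's max_target_positions
    };
    ov::CompiledModel compiled = core.compile_model(decoder_model, "NPU", npuw_props);

The 4/444 split therefore fixes the KV-cache at 448 tokens, which is the Whisper decoder's maximum sequence length; user-supplied MAX_PROMPT_LEN / MIN_RESPONSE_LEN override these defaults.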
diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp index 4aa733f678..26010c9d09 100644 --- a/src/cpp/src/whisper/models/decoder.cpp +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -12,7 +12,8 @@ namespace ov::genai { std::shared_ptr WhisperDecoder::from_path(const std::filesystem::path& models_path, const std::string& device, - const ov::AnyMap& properties) { + const ov::AnyMap& properties, + const ov::PartialShape& lhs_shape) { bool has_decoder_with_past = std::filesystem::exists(models_path / "openvino_decoder_with_past_model.xml"); if (has_decoder_with_past) { @@ -22,7 +23,7 @@ std::shared_ptr WhisperDecoder::from_path(const std::filesystem: return std::make_shared(models_path, device, properties); } - return std::make_shared(models_path, device, properties); + return std::make_shared(models_path, device, properties, lhs_shape); } std::pair WhisperDecoder::detect_language(const ov::Tensor& encoder_hidden_state, diff --git a/src/cpp/src/whisper/models/decoder.hpp b/src/cpp/src/whisper/models/decoder.hpp index 3bdcf4d347..42bc9c39d8 100644 --- a/src/cpp/src/whisper/models/decoder.hpp +++ b/src/cpp/src/whisper/models/decoder.hpp @@ -13,7 +13,8 @@ class WhisperDecoder { public: static std::shared_ptr from_path(const std::filesystem::path& models_path, const std::string& device, - const ov::AnyMap& properties); + const ov::AnyMap& properties, + const ov::PartialShape& lhs_shape); std::pair detect_language(const Tensor& encoder_hidden_state, const int64_t decoder_start_token_id); diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index aaaaa3a826..38d1561139 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -5,30 +5,36 @@ #include "utils.hpp" +namespace { +void reshape_hidden_states_to_static(std::shared_ptr model, const ov::PartialShape& lhstates_shape) { + ov::PartialShape new_shape = model->input("encoder_hidden_states").get_partial_shape(); + new_shape[1] = lhstates_shape[1]; + std::map name_to_shape{{"encoder_hidden_states", new_shape}}; + model->reshape(name_to_shape); +} + +} // anonymous + namespace ov::genai { WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& models_path, const std::string& device, - const ov::AnyMap& properties) { + const ov::AnyMap& properties, + const ov::PartialShape& lhs_shape) { ov::Core core = utils::singleton_core(); auto model = core.read_model(models_path / "openvino_decoder_model.xml", {}, properties); - utils::apply_slice_before_matmul_transformation(model); - - if (device.find("NPU") != std::string::npos) { - m_is_npu = true; - } - ov::CompiledModel compiled_model; - if (m_is_npu) { + if (device == "NPU") { auto kv_pos = ov::genai::utils::get_kv_axes_pos(model); + reshape_hidden_states_to_static(model, lhs_shape); + utils::KVDesc kv_desc; - // Check max_prompt_length and min_response_length - std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu( - model, properties, kv_pos, models_path / "openvino_model.xml" - ); + std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu(model, properties, kv_pos, true); } else { + utils::apply_slice_before_matmul_transformation(model); + compiled_model = core.compile_model(model, device, properties); } diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp index 24c421861c..13bfc5792e 100644 --- 
a/src/cpp/src/whisper/models/statefull_decoder.hpp +++ b/src/cpp/src/whisper/models/statefull_decoder.hpp @@ -12,7 +12,8 @@ class WhisperStatefullDecoder : public WhisperDecoder { public: WhisperStatefullDecoder(const std::filesystem::path& models_path, const std::string& device, - const ov::AnyMap& properties); + const ov::AnyMap& properties, + const ov::PartialShape& lhs_shape); void start_async(const Tensor& encoder_hidden_state, const Tensor& input_ids, const Tensor& beam_idx) override; @@ -27,6 +28,5 @@ class WhisperStatefullDecoder : public WhisperDecoder { private: ov::InferRequest m_request; - bool m_is_npu = false; }; } // namespace ov::genai diff --git a/src/cpp/src/whisper/pipeline.cpp b/src/cpp/src/whisper/pipeline.cpp index 339be635fe..0a15a6d03c 100644 --- a/src/cpp/src/whisper/pipeline.cpp +++ b/src/cpp/src/whisper/pipeline.cpp @@ -77,7 +77,7 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, properties); // NB: only batch_size == 1 is supported now for NPU reshape_to_static_encoder(encoder_model, m_feature_extractor.feature_size); - compiled_model = core.compile_model(encoder_model, "NPU", properties); + compiled_model = core.compile_model(encoder_model, "NPU"); } else { compiled_model = core.compile_model(models_path / "openvino_encoder_model.xml", device, properties); } @@ -85,7 +85,7 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper encoder model"); m_encoder = init_model(compiled_model); - m_decoder = WhisperDecoder::from_path(models_path, device, properties); + m_decoder = WhisperDecoder::from_path(models_path, device, properties, m_encoder.get_compiled_model().output("last_hidden_state").get_partial_shape()); // If eos_token_id was not provided, take value if (m_generation_config.eos_token_id == -1) { diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 70c1f84152..785e389b8e 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -202,7 +202,6 @@ ov::Tensor encode(ov::InferRequest& request, ". Actual size: ", mel_data.size(), "."); - ov::Tensor input_tensor(ov::element::f32, {1, feature_size, nb_max_frames}, mel_data.data()); request.set_tensor("input_features", input_tensor); @@ -213,7 +212,9 @@ ov::Tensor encode(ov::InferRequest& request, raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); // reset input tensor - request.set_tensor("input_features", ov::Tensor(ov::element::f32, {0, feature_size, nb_max_frames})); + auto m_is_npu = true; + uint8_t batch_size = m_is_npu ? 
1 : 0; + request.set_tensor("input_features", ov::Tensor(ov::element::f32, {batch_size, feature_size, nb_max_frames})); return request.get_tensor("last_hidden_state"); } From 7ac58ec2229f038b6564f35d0491e5683c84c908 Mon Sep 17 00:00:00 2001 From: Ekaterina Shiryaeva Date: Fri, 11 Jul 2025 15:40:15 +0100 Subject: [PATCH 03/15] Fixes for Whisper stateful/static pipelines - fix encode reset based on used device in stateful pipeline - fix STATIC_PIPELINE property option usage for Whisper static --- src/cpp/src/whisper/pipeline_static.cpp | 18 +++++++++++------- src/cpp/src/whisper/whisper.cpp | 4 ++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/cpp/src/whisper/pipeline_static.cpp b/src/cpp/src/whisper/pipeline_static.cpp index 15f5a95de9..6193f47d1a 100644 --- a/src/cpp/src/whisper/pipeline_static.cpp +++ b/src/cpp/src/whisper/pipeline_static.cpp @@ -990,7 +990,11 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys , m_sampler(m_tokenizer) { ov::Core core = utils::singleton_core(); - auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, properties); + // Remove "STATIC_PIPELINE" as we don't need to pass it further + auto model_properties = properties; + utils::pop_option(model_properties, "STATIC_PIPELINE"); + + auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, model_properties); reshape_to_static_encoder(encoder_model, m_feature_extractor.feature_size); auto last_hidden_state_shape = get_encoder_hidden_state_shape(encoder_model); @@ -998,10 +1002,10 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys std::shared_ptr decoder_with_past_model; if (std::filesystem::exists(models_path / "openvino_decoder_with_past_model.xml") ) { - decoder_model = core.read_model(models_path / "openvino_decoder_model.xml", {}, properties); - decoder_with_past_model = core.read_model(models_path / "openvino_decoder_with_past_model.xml", {}, properties); + decoder_model = core.read_model(models_path / "openvino_decoder_model.xml", {}, model_properties); + decoder_with_past_model = core.read_model(models_path / "openvino_decoder_with_past_model.xml", {}, model_properties); } else { - auto model = core.read_model(models_path / "openvino_decoder_model.xml", {}, properties); + auto model = core.read_model(models_path / "openvino_decoder_model.xml", {}, model_properties); ov::pass::StatefulToStateless().run_on_model(model); decoder_model = prepare_decoder_model(model); @@ -1030,15 +1034,15 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys preprocess_decoder(decoder_with_past_model); ov::CompiledModel compiled_model; - compiled_model = core.compile_model(encoder_model, "NPU", properties); + compiled_model = core.compile_model(encoder_model, "NPU", model_properties); ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper encoder model"); m_models.encoder = compiled_model.create_infer_request(); - compiled_model = core.compile_model(decoder_with_past_model, "NPU", properties); + compiled_model = core.compile_model(decoder_with_past_model, "NPU", model_properties); ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper decoder with past model"); m_models.decoder_with_past = compiled_model.create_infer_request(); - compiled_model = core.compile_model(decoder_model, "NPU", properties); + compiled_model = core.compile_model(decoder_model, "NPU", model_properties); 
ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper decoder model"); m_models.decoder = compiled_model.create_infer_request(); diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 785e389b8e..76252d4d39 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -212,8 +212,8 @@ ov::Tensor encode(ov::InferRequest& request, raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); // reset input tensor - auto m_is_npu = true; - uint8_t batch_size = m_is_npu ? 1 : 0; + auto devices = request.get_compiled_model().get_property(ov::execution_devices); + uint8_t batch_size = (devices[0] == "NPU") ? 1 : 0; request.set_tensor("input_features", ov::Tensor(ov::element::f32, {batch_size, feature_size, nb_max_frames})); return request.get_tensor("last_hidden_state"); From 2c31ff4c00ea8c0bf461646f136dd93597aabb97 Mon Sep 17 00:00:00 2001 From: Ekaterina Shiryaeva Date: Mon, 11 Aug 2025 11:54:31 +0100 Subject: [PATCH 04/15] Fix whisper decoder config --- src/cpp/src/utils.cpp | 5 +++++ src/cpp/src/whisper/pipeline.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 121c61d6c6..460abf28a4 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -101,6 +101,8 @@ void update_npu_config_whisper(ov::AnyMap& config, const ov::genai::utils::KVDesc& kv_desc) { update_config(config, {"NPU_USE_NPUW", "YES"}); update_config(config, {"NPUW_ONLINE_PIPELINE", "NONE"}); + update_config(config, {"NPUW_FUNCALL_FOR_ALL", "NO"}); + update_config(config, {"NPUW_FOLD", "NO"}); update_config(config, {"NPUW_LLM", "YES"}); update_config(config, {"NPUW_LLM_BATCH_DIM", kv_pos.batch}); @@ -108,6 +110,9 @@ void update_npu_config_whisper(ov::AnyMap& config, update_config(config, {"NPUW_LLM_MAX_PROMPT_LEN", kv_desc.max_prompt_len}); update_config(config, {"NPUW_LLM_MIN_RESPONSE_LEN", kv_desc.min_response_len}); + + // To disable chunking + update_config(config, {"NPUW_LLM_PREFILL_HINT", "STATIC"}); } inline bool is_paged_attention_available() { diff --git a/src/cpp/src/whisper/pipeline.cpp b/src/cpp/src/whisper/pipeline.cpp index 0a15a6d03c..ac8bdf99ca 100644 --- a/src/cpp/src/whisper/pipeline.cpp +++ b/src/cpp/src/whisper/pipeline.cpp @@ -77,7 +77,7 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, properties); // NB: only batch_size == 1 is supported now for NPU reshape_to_static_encoder(encoder_model, m_feature_extractor.feature_size); - compiled_model = core.compile_model(encoder_model, "NPU"); + compiled_model = core.compile_model(encoder_model, "NPU", properties); } else { compiled_model = core.compile_model(models_path / "openvino_encoder_model.xml", device, properties); } From 9faf09a2220ed352868a8921608fbefd0fb4ecc3 Mon Sep 17 00:00:00 2001 From: Ekaterina Shiryaeva Date: Sun, 12 Oct 2025 20:24:19 +0100 Subject: [PATCH 05/15] Added new NPUW_WHISPER option --- src/cpp/src/utils.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 460abf28a4..fe809b83af 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -104,6 +104,7 @@ void update_npu_config_whisper(ov::AnyMap& config, update_config(config, {"NPUW_FUNCALL_FOR_ALL", "NO"}); update_config(config, {"NPUW_FOLD", "NO"}); update_config(config, {"NPUW_LLM", "YES"}); + update_config(config, {"NPUW_WHISPER", "YES"}); 
update_config(config, {"NPUW_LLM_BATCH_DIM", kv_pos.batch}); update_config(config, {"NPUW_LLM_SEQ_LEN_DIM", kv_pos.seq_len}); From 935e63d0183c67e07ad02f3f0711396fe1bb4441 Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Fri, 17 Oct 2025 22:05:26 +0200 Subject: [PATCH 06/15] Fixed missed `cache_position` input in StaticWhisperPipeline and WhisperStatefulImpl, fixed boolean attention_mask handling in StaticWhisperPipeline --- src/cpp/src/whisper/models/decoder.cpp | 3 +- .../src/whisper/models/statefull_decoder.cpp | 14 ++- .../src/whisper/models/statefull_decoder.hpp | 3 +- src/cpp/src/whisper/pipeline.cpp | 20 ++-- src/cpp/src/whisper/pipeline_static.cpp | 93 ++++++++++++++----- src/cpp/src/whisper/whisper.cpp | 3 +- src/cpp/src/whisper/whisper_utils.cpp | 10 +- src/cpp/src/whisper/whisper_utils.hpp | 4 +- 8 files changed, 114 insertions(+), 36 deletions(-) diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp index 26010c9d09..83b4b729ce 100644 --- a/src/cpp/src/whisper/models/decoder.cpp +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -18,7 +18,8 @@ std::shared_ptr WhisperDecoder::from_path(const std::filesystem: if (has_decoder_with_past) { if (device == "NPU") { - OPENVINO_THROW("Please use STATIC_PIPELINE config option for NPU for 3-model whisper pipeline."); + OPENVINO_THROW("For NPU, 3-model whisper pipeline works only with STATIC_PIPELINE : YES configuration " + "(which is default for NPU)."); } return std::make_shared(models_path, device, properties); } diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index 38d1561139..a7f3313025 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -1,13 +1,15 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "statefull_decoder.hpp" #include "utils.hpp" +#include "whisper/whisper_utils.hpp" namespace { void reshape_hidden_states_to_static(std::shared_ptr model, const ov::PartialShape& lhstates_shape) { ov::PartialShape new_shape = model->input("encoder_hidden_states").get_partial_shape(); + OPENVINO_ASSERT(new_shape.size() > 1 && lhstates_shape.size() > 1); new_shape[1] = lhstates_shape[1]; std::map name_to_shape{{"encoder_hidden_states", new_shape}}; model->reshape(name_to_shape); @@ -24,6 +26,8 @@ WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& mo auto model = core.read_model(models_path / "openvino_decoder_model.xml", {}, properties); + m_has_cache_position = ov::genai::utils::input_exists(model, "cache_position"); + ov::CompiledModel compiled_model; if (device == "NPU") { auto kv_pos = ov::genai::utils::get_kv_axes_pos(model); @@ -50,7 +54,9 @@ void WhisperStatefullDecoder::start_async(const Tensor& encoder_hidden_state, _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size, m_request); - _set_cache_position_tensor(seq_len); + if (m_has_cache_position) { + _set_cache_position_tensor(seq_len); + } m_request.set_tensor("input_ids", input_ids); m_request.set_tensor("beam_idx", beam_idx); @@ -79,7 +85,9 @@ Tensor WhisperStatefullDecoder::wait() { void WhisperStatefullDecoder::reset_state() { m_request.reset_state(); - m_request.set_tensor("cache_position", create_host_tensor(ov::element::i64, {0})); + if (m_has_cache_position) { + m_request.set_tensor("cache_position", create_host_tensor(ov::element::i64, {0})); + } Shape 
encoder_hidden_states_shape{m_request.get_tensor("encoder_hidden_states").get_shape()}; encoder_hidden_states_shape[0] = 0; diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp index 13bfc5792e..b0fbff2131 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.hpp +++ b/src/cpp/src/whisper/models/statefull_decoder.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once @@ -28,5 +28,6 @@ class WhisperStatefullDecoder : public WhisperDecoder { private: ov::InferRequest m_request; + bool m_has_cache_position = true; }; } // namespace ov::genai diff --git a/src/cpp/src/whisper/pipeline.cpp b/src/cpp/src/whisper/pipeline.cpp index ac8bdf99ca..a3c18fe0ce 100644 --- a/src/cpp/src/whisper/pipeline.cpp +++ b/src/cpp/src/whisper/pipeline.cpp @@ -42,7 +42,9 @@ ov::InferRequest init_model(ov::CompiledModel& compiled) { } } -void reshape_to_static_encoder(std::shared_ptr model, const size_t feature_size) { +void reshape_to_static_encoder(std::shared_ptr model, + const size_t batch_size, + const size_t feature_size) { std::map new_shapes; for (auto input : model->inputs()) { const auto& input_name = input.get_any_name(); @@ -51,10 +53,10 @@ void reshape_to_static_encoder(std::shared_ptr model, const size_t fe const auto& partial_shape = input.get_partial_shape(); OPENVINO_ASSERT(partial_shape.size() >= 3); new_shape = partial_shape; - new_shape[0] = 1; // batch_dim + new_shape[0] = batch_size; // batch_dim new_shape[1] = feature_size; + new_shapes.emplace(input_name, new_shape); } - new_shapes.emplace(input_name, new_shape); } model->reshape(new_shapes); } @@ -76,7 +78,7 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi if (device == "NPU") { auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, properties); // NB: only batch_size == 1 is supported now for NPU - reshape_to_static_encoder(encoder_model, m_feature_extractor.feature_size); + reshape_to_static_encoder(encoder_model, 1, m_feature_extractor.feature_size); compiled_model = core.compile_model(encoder_model, "NPU", properties); } else { compiled_model = core.compile_model(models_path / "openvino_encoder_model.xml", device, properties); @@ -178,8 +180,14 @@ ov::genai::WhisperPipeline::WhisperPipeline(const std::filesystem::path& models_ const std::string& device, const ov::AnyMap& properties) { auto start_time = std::chrono::steady_clock::now(); - if (device == "NPU" && properties.count("STATIC_PIPELINE")) { - m_impl = std::make_unique(models_path, properties); + if (device == "NPU") { + auto properties_copy = properties; + const bool use_static_pipeline = utils::pop_or_default(properties_copy, "STATIC_PIPELINE", true); + if (!use_static_pipeline) { + m_impl = std::make_unique(models_path, device, properties_copy); + } else { + m_impl = std::make_unique(models_path, properties_copy); + } } else { m_impl = std::make_unique(models_path, device, properties); } diff --git a/src/cpp/src/whisper/pipeline_static.cpp b/src/cpp/src/whisper/pipeline_static.cpp index 6193f47d1a..8dfd83cd30 100644 --- a/src/cpp/src/whisper/pipeline_static.cpp +++ b/src/cpp/src/whisper/pipeline_static.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "whisper/pipeline_static.hpp" @@ -18,6 +18,7 @@ #include 
"openvino/core/preprocess/pre_post_process.hpp" #include "openvino/pass/pattern/matcher.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/optional.hpp" #include "openvino/pass/graph_rewrite.hpp" #include "openvino/pass/manager.hpp" #include "openvino/op/range.hpp" @@ -407,7 +408,8 @@ void add_attention_mask_input(std::shared_ptr model) { auto unsqueeze1 = wrap_type({range, any_input()}); auto unsqueeze2 = wrap_type({unsqueeze1, any_input()}); auto unsqueeze3 = wrap_type({unsqueeze2, any_input()}); - auto lessequal = wrap_type({unsqueeze3, any_input()}); + auto opt_convert = optional({unsqueeze3->output(0)}); + auto lessequal = wrap_type({opt_convert, any_input()}); register_matcher(std::make_shared(lessequal, this->get_type_info().name), [model](Matcher& m) { auto node = m.get_match_root(); @@ -470,13 +472,6 @@ void add_attention_mask_input(std::shared_ptr model, bool transform_c attention_mask->get_output_tensor(0).set_names({"attention_mask"}); model->add_parameters({attention_mask}); - auto slice = self_attn_nodes[0]->input(kAttnMaskPort).get_source_output().get_node(); - auto cvt = std::make_shared(attention_mask->output(0), ov::element::f32); - auto add = std::make_shared(slice->output(0), cvt->output(0)); - - auto trps = std::make_shared(cvt->output(0), v0::Constant::create(ov::element::i32, ov::Shape{2}, std::vector{1, 0})); - auto mtpl = std::make_shared(trps->output(0), add->output(0)); - auto cst_ninf = std::make_shared( ov::element::f32, ov::Shape{1}, @@ -493,6 +488,18 @@ void add_attention_mask_input(std::shared_ptr model, bool transform_c std::vector{0} ); + auto slice = self_attn_nodes[0]->input(kAttnMaskPort).get_source_output().get_node_shared_ptr(); + std::shared_ptr slice_f32; + if (slice->get_element_type() == ov::element::boolean) { + slice_f32 = std::make_shared(slice->output(0), cst_0->output(0), cst_ninf->output(0)); + } else { + slice_f32 = slice; + } + auto cvt = std::make_shared(attention_mask->output(0), ov::element::f32); + auto add = std::make_shared(slice_f32->output(0), cvt->output(0)); + auto trps = std::make_shared(cvt->output(0), v0::Constant::create(ov::element::i32, ov::Shape{2}, std::vector{1, 0})); + auto mtpl = std::make_shared(trps->output(0), add->output(0)); + auto equal = std::make_shared(mtpl->output(0), cst_1->output(0)); auto select = std::make_shared( equal->output(0), cst_0->output(0), cst_ninf->output(0) @@ -548,6 +555,46 @@ void add_attention_mask_input(std::shared_ptr model, bool transform_c pm.run_passes(model); } +void add_cache_position_input(std::shared_ptr model) { + using namespace ov::pass::pattern; + using namespace ov::op; + class CachePositionInput : public ov::pass::MatcherPass { + public: + OPENVINO_MATCHER_PASS_RTTI("CachePositionInput"); + + CachePositionInput(std::shared_ptr model) { + auto gather = wrap_type({any_input(), any_input(), any_input()}); + auto add = wrap_type({gather, any_input()}); + auto range = wrap_type({gather, add, any_input()}); + auto unsqueeze = wrap_type({range, any_input()}); + auto tile = wrap_type({unsqueeze, any_input()}); + + register_matcher(std::make_shared(tile, this->get_type_info().name), + [model, unsqueeze](Matcher& m) { + auto& node_to_output = m.get_pattern_value_map(); + auto unsqueeze_node = node_to_output.at(unsqueeze).get_node_shared_ptr(); + auto matched_unsqueeze = std::static_pointer_cast(unsqueeze_node); + + auto cache_position = std::make_shared(ov::element::i64, ov::Shape{1}); + 
cache_position->get_output_tensor(0).set_names({"cache_position"}); + cache_position->set_friendly_name("cache_position"); + model->add_parameters({cache_position}); + // If cache_position input is missed in the model, it means that position is calculated + // by the model itself using fp32 range constructed from the shapes of inputs. + // So operations below this range expect fp32 input. + auto cache_position_f32 = std::make_shared(cache_position, ov::element::f32); + + matched_unsqueeze->input(0).replace_source_output(cache_position_f32->output(0)); + return false; + }); + } + }; + + ov::pass::Manager pm; + pm.register_pass(model); + pm.run_passes(model); + model->validate_nodes_and_infer_types(); +} ov::PartialShape get_encoder_hidden_state_shape(const std::shared_ptr& encoder) { return encoder->output("last_hidden_state").get_partial_shape(); @@ -956,8 +1003,10 @@ std::shared_ptr prepare_decoder_model(std::shared_ptr& mod remove_input_kv_tensors(decoder_model); // 3) Expose all states that requires initialization on the first run as outputs expose_runtime_states_as_outputs(decoder_model); - // 4) Remove cache_position input - remove_cache_position(decoder_model); + // 4) Remove cache_position input if it exists + if (ov::genai::utils::input_exists(decoder_model, "cache_position")) { + remove_cache_position(decoder_model); + } // 5) Normalize output names - should be done in stateful_to_stateless_transformation normalize_output_key_value_names(decoder_model); @@ -972,6 +1021,10 @@ std::shared_ptr prepare_decoder_with_past_model(std::shared_ptrreshape({{"input_ids", ov::PartialShape({-1, 1})}}); decoder_with_past_model->set_friendly_name("Model6"); @@ -990,11 +1043,7 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys , m_sampler(m_tokenizer) { ov::Core core = utils::singleton_core(); - // Remove "STATIC_PIPELINE" as we don't need to pass it further - auto model_properties = properties; - utils::pop_option(model_properties, "STATIC_PIPELINE"); - - auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, model_properties); + auto encoder_model = core.read_model(models_path / "openvino_encoder_model.xml", {}, properties); reshape_to_static_encoder(encoder_model, m_feature_extractor.feature_size); auto last_hidden_state_shape = get_encoder_hidden_state_shape(encoder_model); @@ -1002,10 +1051,10 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys std::shared_ptr decoder_with_past_model; if (std::filesystem::exists(models_path / "openvino_decoder_with_past_model.xml") ) { - decoder_model = core.read_model(models_path / "openvino_decoder_model.xml", {}, model_properties); - decoder_with_past_model = core.read_model(models_path / "openvino_decoder_with_past_model.xml", {}, model_properties); + decoder_model = core.read_model(models_path / "openvino_decoder_model.xml", {}, properties); + decoder_with_past_model = core.read_model(models_path / "openvino_decoder_with_past_model.xml", {}, properties); } else { - auto model = core.read_model(models_path / "openvino_decoder_model.xml", {}, model_properties); + auto model = core.read_model(models_path / "openvino_decoder_model.xml", {}, properties); ov::pass::StatefulToStateless().run_on_model(model); decoder_model = prepare_decoder_model(model); @@ -1034,15 +1083,15 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys preprocess_decoder(decoder_with_past_model); ov::CompiledModel compiled_model; - compiled_model = 
core.compile_model(encoder_model, "NPU", model_properties); + compiled_model = core.compile_model(encoder_model, "NPU", properties); ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper encoder model"); m_models.encoder = compiled_model.create_infer_request(); - compiled_model = core.compile_model(decoder_with_past_model, "NPU", model_properties); + compiled_model = core.compile_model(decoder_with_past_model, "NPU", properties); ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper decoder with past model"); m_models.decoder_with_past = compiled_model.create_infer_request(); - compiled_model = core.compile_model(decoder_model, "NPU", model_properties); + compiled_model = core.compile_model(decoder_model, "NPU", properties); ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper decoder model"); m_models.decoder = compiled_model.create_infer_request(); diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 76252d4d39..c5ec745c20 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -213,7 +213,8 @@ ov::Tensor encode(ov::InferRequest& request, // reset input tensor auto devices = request.get_compiled_model().get_property(ov::execution_devices); - uint8_t batch_size = (devices[0] == "NPU") ? 1 : 0; + OPENVINO_ASSERT(devices.size() > 0, "No execution devices found!"); + size_t batch_size = (devices[0] == "NPU") ? 1 : 0; request.set_tensor("input_features", ov::Tensor(ov::element::f32, {batch_size, feature_size, nb_max_frames})); return request.get_tensor("last_hidden_state"); diff --git a/src/cpp/src/whisper/whisper_utils.cpp b/src/cpp/src/whisper/whisper_utils.cpp index 3f925c6ead..42cf6fa5fd 100644 --- a/src/cpp/src/whisper/whisper_utils.cpp +++ b/src/cpp/src/whisper/whisper_utils.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "whisper/whisper_utils.hpp" @@ -57,6 +57,14 @@ int64_t argmax(const ov::Tensor& logits, const size_t batch_idx) { return out_token; } +bool input_exists(const std::shared_ptr& model, const std::string& name) { + auto inputs = model->inputs(); + auto it = std::find_if(inputs.begin(), inputs.end(), [&](const auto& port) { + return port.get_names().count(name) != 0; + }); + return it != inputs.end(); +} + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/whisper/whisper_utils.hpp b/src/cpp/src/whisper/whisper_utils.hpp index 8fd0a080c6..ba66d18e0f 100644 --- a/src/cpp/src/whisper/whisper_utils.hpp +++ b/src/cpp/src/whisper/whisper_utils.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once @@ -19,6 +19,8 @@ void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics, int64_t argmax(const ov::Tensor& logits, const size_t batch_idx); +bool input_exists(const std::shared_ptr& model, const std::string& name); + } // namespace utils } // namespace genai } // namespace ov From 9556acd9694d7916a91643949cd1847190d1dbbf Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Mon, 20 Oct 2025 12:05:44 +0200 Subject: [PATCH 07/15] Test new requirements.txt --- tests/python_tests/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index e24bf2e160..28735702e6 100644 --- 
a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,10 +1,10 @@ --extra-index-url https://download.pytorch.org/whl/cpu diffusers==0.35.2 -optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@04db016571d1a19c14918553365ee4c05c8b4697 +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@3130e907fb7960653039d138493cbb075e128f6a numpy==1.26.4; platform_system == "Darwin" and platform_machine == "x86_64" safetensors==0.6.2; platform_system == "Darwin" and platform_machine == "x86_64" pytest==8.4.2 -transformers==4.53.3 +transformers==4.55.4 hf_transfer==0.1.9 gguf==0.17.1 From 14db9c7d8ab78efbbfe5fa97b784c83bc4b16ce1 Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Mon, 20 Oct 2025 19:08:17 +0200 Subject: [PATCH 08/15] Fix for StaticWhisperPipeline to work with transformers_4.55.4 ieline_static.cpp --- src/cpp/src/whisper/pipeline_static.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/cpp/src/whisper/pipeline_static.cpp b/src/cpp/src/whisper/pipeline_static.cpp index 8dfd83cd30..d53f5b2166 100644 --- a/src/cpp/src/whisper/pipeline_static.cpp +++ b/src/cpp/src/whisper/pipeline_static.cpp @@ -579,12 +579,14 @@ void add_cache_position_input(std::shared_ptr model) { cache_position->get_output_tensor(0).set_names({"cache_position"}); cache_position->set_friendly_name("cache_position"); model->add_parameters({cache_position}); - // If cache_position input is missed in the model, it means that position is calculated - // by the model itself using fp32 range constructed from the shapes of inputs. - // So operations below this range expect fp32 input. - auto cache_position_f32 = std::make_shared(cache_position, ov::element::f32); + std::shared_ptr cache_pos_unsqueeze_arg; + if (unsqueeze_node->input(0).get_element_type() == ov::element::f32) { + auto cache_pos_unsqueeze_arg = std::make_shared(cache_position, ov::element::f32); + } else { + cache_pos_unsqueeze_arg = cache_position; + } - matched_unsqueeze->input(0).replace_source_output(cache_position_f32->output(0)); + matched_unsqueeze->input(0).replace_source_output(cache_pos_unsqueeze_arg->output(0)); return false; }); } From 5536a6eee45edc0efb95944717416e6c6127ba09 Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Wed, 22 Oct 2025 20:02:06 +0100 Subject: [PATCH 09/15] Fixed review comments --- src/cpp/src/whisper/pipeline_static.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/whisper/pipeline_static.cpp b/src/cpp/src/whisper/pipeline_static.cpp index d53f5b2166..bc373abea6 100644 --- a/src/cpp/src/whisper/pipeline_static.cpp +++ b/src/cpp/src/whisper/pipeline_static.cpp @@ -581,7 +581,7 @@ void add_cache_position_input(std::shared_ptr model) { model->add_parameters({cache_position}); std::shared_ptr cache_pos_unsqueeze_arg; if (unsqueeze_node->input(0).get_element_type() == ov::element::f32) { - auto cache_pos_unsqueeze_arg = std::make_shared(cache_position, ov::element::f32); + cache_pos_unsqueeze_arg = std::make_shared(cache_position, ov::element::f32); } else { cache_pos_unsqueeze_arg = cache_position; } From 68e16f6e9f18b67f2947ec5a2b511e92c178286c Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 22 Oct 2025 21:12:15 +0200 Subject: [PATCH 10/15] Skip tests --- tests/python_tests/test_whisper_pipeline.py | 45 ++------------------- 1 file changed, 3 insertions(+), 42 deletions(-) diff --git a/tests/python_tests/test_whisper_pipeline.py 
b/tests/python_tests/test_whisper_pipeline.py index 655f527852..68caee5068 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -55,13 +55,11 @@ def get_whisper_models_list(tiny_only=False): # used whisper models are relatively small # cache them in memory to speedup tests @functools.lru_cache() -def read_whisper_model(params, stateful=True): +def read_whisper_model(params): model_id, path = params - if not stateful: - path = pathlib.Path(f"{path}_with_past") if not (path / "openvino_encoder_model.xml").exists(): - save_model(model_id=model_id, tmp_path=path, stateful=stateful) + save_model(model_id=model_id, tmp_path=path) opt_model = retry_request(lambda: OVModelForSpeechSeq2Seq.from_pretrained( path, @@ -93,7 +91,7 @@ def read_whisper_model(params, stateful=True): ) -def save_model(model_id: str, tmp_path: pathlib.Path, stateful=True): +def save_model(model_id: str, tmp_path: pathlib.Path): tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)) ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer( tokenizer, @@ -111,7 +109,6 @@ def save_model(model_id: str, tmp_path: pathlib.Path, stateful=True): model_id, export=True, trust_remote_code=True, - stateful=stateful, compile=False, device="CPU", load_in_8bit=False, @@ -226,9 +223,6 @@ def run_pipeline_with_ref( streamer: typing.Callable[[str], bool] | None = None, ): _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path)) - _, _, _, genai_with_past_pipe = read_whisper_model( - (model_id, tmp_path), stateful=False - ) if type(sample) is np.ndarray and len(sample.shape) == 1: sample = np.expand_dims(sample, 0) @@ -239,12 +233,6 @@ def run_pipeline_with_ref( compare_results(hf_result, genai_result) - genai_with_past_result = run_genai( - genai_with_past_pipe, _sample, generation_config, streamer - ) - - compare_results(hf_result, genai_with_past_result) - def compare_results(hf_result, genai_result): assert genai_result.texts[0] == hf_result["text"] @@ -510,33 +498,6 @@ def test_longform_audio(model_descr, sample_from_dataset): assert "".join(streamer_result) == hf_result["text"] -@pytest.mark.parametrize("model_descr", get_whisper_models_list()) -@pytest.mark.parametrize("sample_from_dataset", [*get_fixture_params_for_n_whisper_dataset_samples(n=2, long_form=True)], indirect=True) -@pytest.mark.precommit -@pytest.mark.xfail(condition=(sys.platform == "darwin"), reason="Ticket - 173169") -def test_longform_audio_with_past(model_descr, sample_from_dataset): - _, _, hf_pipe, genai_pipe = read_whisper_model(model_descr, stateful=True) - - streamer_result = [] - - genai_result = run_genai( - genai_pipe, - sample_from_dataset, - config=ov_genai.WhisperGenerationConfig(return_timestamps=True), - streamer=lambda x: streamer_result.append(x), - ) - - hf_result = run_huggingface( - hf_pipe, - sample_from_dataset, - config=ov_genai.WhisperGenerationConfig(return_timestamps=True), - ) - - compare_results(hf_result, genai_result) - - assert "".join(streamer_result) == hf_result["text"] - - @pytest.mark.parametrize("model_descr", get_whisper_models_list()) @pytest.mark.precommit @pytest.mark.xfail(condition=(sys.platform == "darwin"), reason="Ticket - 173169") From 7e29a70a4eeb775b3e8db0dac68b71e97ee96b29 Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Thu, 23 Oct 2025 12:49:31 +0100 Subject: [PATCH 11/15] Added handling of `cache_position` for stateless Whisper pipeline --- src/cpp/src/utils.cpp | 8 ++++++++ 
src/cpp/src/utils.hpp | 5 +++++ src/cpp/src/whisper/models/statefull_decoder.cpp | 3 +-- src/cpp/src/whisper/models/with_past_decoder.cpp | 11 ++++++++--- src/cpp/src/whisper/models/with_past_decoder.hpp | 1 + src/cpp/src/whisper/pipeline_static.cpp | 10 +++++----- src/cpp/src/whisper/whisper_utils.cpp | 8 -------- src/cpp/src/whisper/whisper_utils.hpp | 2 -- 8 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 13fbbd116a..c26a1830cf 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -840,6 +840,14 @@ void export_model(ov::CompiledModel& compiled_model, const std::filesystem::path out.close(); } +bool has_input(const std::shared_ptr& model, const std::string& name) { + auto inputs = model->inputs(); + auto it = std::find_if(inputs.begin(), inputs.end(), [&](const auto& port) { + return port.get_names().count(name) != 0; + }); + return it != inputs.end(); +} + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 9cba89e05d..14106ef8f7 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -309,6 +309,11 @@ ov::CompiledModel import_model(const std::filesystem::path& blob_path, */ void export_model(ov::CompiledModel& compiled_model, const std::filesystem::path& blob_path); +/** + * @brief Checks if the model has an input with the specified name. + */ +bool has_input(const std::shared_ptr& model, const std::string& name); + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index a7f3313025..3c1a8c4e20 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -4,7 +4,6 @@ #include "statefull_decoder.hpp" #include "utils.hpp" -#include "whisper/whisper_utils.hpp" namespace { void reshape_hidden_states_to_static(std::shared_ptr model, const ov::PartialShape& lhstates_shape) { @@ -26,7 +25,7 @@ WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& mo auto model = core.read_model(models_path / "openvino_decoder_model.xml", {}, properties); - m_has_cache_position = ov::genai::utils::input_exists(model, "cache_position"); + m_has_cache_position = utils::has_input(model, "cache_position"); ov::CompiledModel compiled_model; if (device == "NPU") { diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index 559c96d952..adf72454b5 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -86,8 +86,11 @@ WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& mode "To obtain stateful decoder model use latest `optimum-intel` package:\n" "pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git@main\n" "optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny whisper-tiny"); + ov::Core core = utils::singleton_core(); + m_has_cache_position = utils::has_input(core.read_model(models_path / "openvino_decoder_model.xml"), "cache_position"); + auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); m_request_decoder = compiled_model.create_infer_request(); @@ -110,9 +113,11 @@ void WhisperWithPastDecoder::start_async(const Tensor& 
encoder_hidden_state, request.set_tensor("input_ids", input_ids); if (!is_initial_step) { - ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); - cache_position_tensor.set_shape({1}); - cache_position_tensor.data()[0] = m_cache_position; + if (m_has_cache_position) { + ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); + cache_position_tensor.set_shape({1}); + cache_position_tensor.data()[0] = m_cache_position; + } } _set_past_key_value(beam_idx); diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp index b268903802..47375ffdf2 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.hpp +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -26,6 +26,7 @@ class WhisperWithPastDecoder : public WhisperDecoder { size_t m_cache_position = 0; bool m_initial_past_key_value_set = false; bool m_past_key_value_linked = false; + bool m_has_cache_position = true; void _set_past_key_value(const Tensor& beam_idx); }; diff --git a/src/cpp/src/whisper/pipeline_static.cpp b/src/cpp/src/whisper/pipeline_static.cpp index bc373abea6..bc04f45bbe 100644 --- a/src/cpp/src/whisper/pipeline_static.cpp +++ b/src/cpp/src/whisper/pipeline_static.cpp @@ -1006,7 +1006,7 @@ std::shared_ptr prepare_decoder_model(std::shared_ptr& mod // 3) Expose all states that requires initialization on the first run as outputs expose_runtime_states_as_outputs(decoder_model); // 4) Remove cache_position input if it exists - if (ov::genai::utils::input_exists(decoder_model, "cache_position")) { + if (ov::genai::utils::has_input(decoder_model, "cache_position")) { remove_cache_position(decoder_model); } // 5) Normalize output names - should be done in stateful_to_stateless_transformation @@ -1023,10 +1023,6 @@ std::shared_ptr prepare_decoder_with_past_model(std::shared_ptrreshape({{"input_ids", ov::PartialShape({-1, 1})}}); decoder_with_past_model->set_friendly_name("Model6"); @@ -1066,6 +1062,10 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys if (!decoder_model || !decoder_with_past_model) OPENVINO_THROW("Decoder/decoder_with_past model is not valid !"); + if (!ov::genai::utils::has_input(decoder_with_past_model, "cache_position")) { + add_cache_position_input(decoder_with_past_model); + } + add_attention_mask_input(decoder_model, true /* transform_cross_attn */, last_hidden_state_shape[1].get_length()); // NB: Note, there is no need to transform cross attention for decoder_with_past_model // as it accepts only single token and there can't be any padding. 
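The same guard pattern is reused wherever `cache_position` may be absent, since models exported with newer optimum-intel/transformers may not expose that input. A minimal sketch of the pattern (not part of the patch; `model`, `request` and `next_position` are assumed to exist in the caller, as with m_cache_position in WhisperWithPastDecoder):

    // Only drive cache_position explicitly when the exported model actually has the input.
    const bool has_cache_position = ov::genai::utils::has_input(model, "cache_position");
    if (has_cache_position) {
        ov::Tensor cache_position(ov::element::i64, {1});
        cache_position.data<int64_t>()[0] = next_position;  // current decode position
        request.set_tensor("cache_position", cache_position);
    }

When the input is missing, the position is derived inside the model graph itself (or re-added by add_cache_position_input() for the static pipeline), so no tensor needs to be set.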
diff --git a/src/cpp/src/whisper/whisper_utils.cpp b/src/cpp/src/whisper/whisper_utils.cpp index 42cf6fa5fd..60714059d6 100644 --- a/src/cpp/src/whisper/whisper_utils.cpp +++ b/src/cpp/src/whisper/whisper_utils.cpp @@ -57,14 +57,6 @@ int64_t argmax(const ov::Tensor& logits, const size_t batch_idx) { return out_token; } -bool input_exists(const std::shared_ptr& model, const std::string& name) { - auto inputs = model->inputs(); - auto it = std::find_if(inputs.begin(), inputs.end(), [&](const auto& port) { - return port.get_names().count(name) != 0; - }); - return it != inputs.end(); -} - } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/whisper/whisper_utils.hpp b/src/cpp/src/whisper/whisper_utils.hpp index ba66d18e0f..d7205aa5bd 100644 --- a/src/cpp/src/whisper/whisper_utils.hpp +++ b/src/cpp/src/whisper/whisper_utils.hpp @@ -19,8 +19,6 @@ void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics, int64_t argmax(const ov::Tensor& logits, const size_t batch_idx); -bool input_exists(const std::shared_ptr& model, const std::string& name); - } // namespace utils } // namespace genai } // namespace ov From 6614e94b7f32086cf67e09f206ef0a336b7b73d0 Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Thu, 23 Oct 2025 13:11:52 +0100 Subject: [PATCH 12/15] Fixed model name to check `cache_position` input --- src/cpp/src/whisper/models/with_past_decoder.cpp | 4 ++-- src/cpp/src/whisper/whisper_utils.cpp | 2 +- src/cpp/src/whisper/whisper_utils.hpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index adf72454b5..2b1ec783a7 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -89,12 +89,12 @@ WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& mode ov::Core core = utils::singleton_core(); - m_has_cache_position = utils::has_input(core.read_model(models_path / "openvino_decoder_model.xml"), "cache_position"); - auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); m_request_decoder = compiled_model.create_infer_request(); + m_has_cache_position = + utils::has_input(core.read_model(models_path / "openvino_decoder_with_past_model.xml"), "cache_position"); compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, properties); utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model"); m_request_decoder_with_past = compiled_model.create_infer_request(); diff --git a/src/cpp/src/whisper/whisper_utils.cpp b/src/cpp/src/whisper/whisper_utils.cpp index 60714059d6..3f925c6ead 100644 --- a/src/cpp/src/whisper/whisper_utils.cpp +++ b/src/cpp/src/whisper/whisper_utils.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024-2025 Intel Corporation +// Copyright (C) 2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "whisper/whisper_utils.hpp" diff --git a/src/cpp/src/whisper/whisper_utils.hpp b/src/cpp/src/whisper/whisper_utils.hpp index d7205aa5bd..8fd0a080c6 100644 --- a/src/cpp/src/whisper/whisper_utils.hpp +++ b/src/cpp/src/whisper/whisper_utils.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024-2025 Intel Corporation +// Copyright (C) 2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once From 04c89a218d860e481f707033bd56302ee72b2339 
Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Thu, 23 Oct 2025 13:14:49 +0100 Subject: [PATCH 13/15] Polishing --- src/cpp/src/whisper/models/with_past_decoder.cpp | 4 ++-- src/cpp/src/whisper/models/with_past_decoder.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index 2b1ec783a7..f3c1f3fab5 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -93,7 +93,7 @@ WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& mode utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); m_request_decoder = compiled_model.create_infer_request(); - m_has_cache_position = + m_past_decoder_has_cache_position = utils::has_input(core.read_model(models_path / "openvino_decoder_with_past_model.xml"), "cache_position"); compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, properties); utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model"); @@ -113,7 +113,7 @@ void WhisperWithPastDecoder::start_async(const Tensor& encoder_hidden_state, request.set_tensor("input_ids", input_ids); if (!is_initial_step) { - if (m_has_cache_position) { + if (m_past_decoder_has_cache_position) { ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); cache_position_tensor.set_shape({1}); cache_position_tensor.data()[0] = m_cache_position; diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp index 47375ffdf2..89b804393a 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.hpp +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -26,7 +26,7 @@ class WhisperWithPastDecoder : public WhisperDecoder { size_t m_cache_position = 0; bool m_initial_past_key_value_set = false; bool m_past_key_value_linked = false; - bool m_has_cache_position = true; + bool m_past_decoder_has_cache_position = true; void _set_past_key_value(const Tensor& beam_idx); }; From a303604659c05f997c651abc346f4eb051b8c11a Mon Sep 17 00:00:00 2001 From: Anastasiya Pronina Date: Thu, 23 Oct 2025 13:22:19 +0100 Subject: [PATCH 14/15] Fixed review comments --- .../src/whisper/models/with_past_decoder.cpp | 10 ++--- src/cpp/src/whisper/pipeline_static.cpp | 2 +- tests/python_tests/test_whisper_pipeline.py | 45 +++++++++++++++++-- 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index f3c1f3fab5..11c25f2118 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -112,12 +112,10 @@ void WhisperWithPastDecoder::start_async(const Tensor& encoder_hidden_state, _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size, request); request.set_tensor("input_ids", input_ids); - if (!is_initial_step) { - if (m_past_decoder_has_cache_position) { - ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); - cache_position_tensor.set_shape({1}); - cache_position_tensor.data()[0] = m_cache_position; - } + if (!is_initial_step && m_past_decoder_has_cache_position) { + ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); + cache_position_tensor.set_shape({1}); + cache_position_tensor.data()[0] = m_cache_position; } _set_past_key_value(beam_idx); diff --git 
index bc04f45bbe..293e00091a 100644
--- a/src/cpp/src/whisper/pipeline_static.cpp
+++ b/src/cpp/src/whisper/pipeline_static.cpp
@@ -580,7 +580,7 @@ void add_cache_position_input(std::shared_ptr<ov::Model> model) {
     cache_position->set_friendly_name("cache_position");
     model->add_parameters({cache_position});
     std::shared_ptr<ov::Node> cache_pos_unsqueeze_arg;
-    if (unsqueeze_node->input(0).get_element_type() == ov::element::f32) {
+    if (matched_unsqueeze->input(0).get_element_type() == ov::element::f32) {
         cache_pos_unsqueeze_arg = std::make_shared<ov::op::v0::Convert>(cache_position, ov::element::f32);
     } else {
         cache_pos_unsqueeze_arg = cache_position;
diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py
index 68caee5068..655f527852 100644
--- a/tests/python_tests/test_whisper_pipeline.py
+++ b/tests/python_tests/test_whisper_pipeline.py
@@ -55,11 +55,13 @@ def get_whisper_models_list(tiny_only=False):
 # used whisper models are relatively small
 # cache them in memory to speedup tests
 @functools.lru_cache()
-def read_whisper_model(params):
+def read_whisper_model(params, stateful=True):
     model_id, path = params
+    if not stateful:
+        path = pathlib.Path(f"{path}_with_past")
 
     if not (path / "openvino_encoder_model.xml").exists():
-        save_model(model_id=model_id, tmp_path=path)
+        save_model(model_id=model_id, tmp_path=path, stateful=stateful)
 
     opt_model = retry_request(lambda: OVModelForSpeechSeq2Seq.from_pretrained(
         path,
@@ -91,7 +93,7 @@ def read_whisper_model(params):
     )
 
 
-def save_model(model_id: str, tmp_path: pathlib.Path):
+def save_model(model_id: str, tmp_path: pathlib.Path, stateful=True):
     tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_id, trust_remote_code=True))
     ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(
         tokenizer,
@@ -109,6 +111,7 @@ def save_model(model_id: str, tmp_path: pathlib.Path):
         model_id,
         export=True,
         trust_remote_code=True,
+        stateful=stateful,
         compile=False,
         device="CPU",
         load_in_8bit=False,
@@ -223,6 +226,9 @@ def run_pipeline_with_ref(
     streamer: typing.Callable[[str], bool] | None = None,
 ):
     _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path))
+    _, _, _, genai_with_past_pipe = read_whisper_model(
+        (model_id, tmp_path), stateful=False
+    )
 
     if type(sample) is np.ndarray and len(sample.shape) == 1:
         sample = np.expand_dims(sample, 0)
@@ -233,6 +239,12 @@ def run_pipeline_with_ref(
 
         compare_results(hf_result, genai_result)
 
+        genai_with_past_result = run_genai(
+            genai_with_past_pipe, _sample, generation_config, streamer
+        )
+
+        compare_results(hf_result, genai_with_past_result)
+
 
 def compare_results(hf_result, genai_result):
     assert genai_result.texts[0] == hf_result["text"]
@@ -498,6 +510,33 @@ def test_longform_audio(model_descr, sample_from_dataset):
     assert "".join(streamer_result) == hf_result["text"]
 
 
+@pytest.mark.parametrize("model_descr", get_whisper_models_list())
+@pytest.mark.parametrize("sample_from_dataset", [*get_fixture_params_for_n_whisper_dataset_samples(n=2, long_form=True)], indirect=True)
+@pytest.mark.precommit
+@pytest.mark.xfail(condition=(sys.platform == "darwin"), reason="Ticket - 173169")
+def test_longform_audio_with_past(model_descr, sample_from_dataset):
+    _, _, hf_pipe, genai_pipe = read_whisper_model(model_descr, stateful=False)
+
+    streamer_result = []
+
+    genai_result = run_genai(
+        genai_pipe,
+        sample_from_dataset,
+        config=ov_genai.WhisperGenerationConfig(return_timestamps=True),
+        streamer=lambda x: streamer_result.append(x),
+    )
+
+    hf_result = run_huggingface(
+        hf_pipe,
+        sample_from_dataset,
+        config=ov_genai.WhisperGenerationConfig(return_timestamps=True),
+    )
+
+    compare_results(hf_result, genai_result)
+
+    assert "".join(streamer_result) == hf_result["text"]
+
+
 @pytest.mark.parametrize("model_descr", get_whisper_models_list())
 @pytest.mark.precommit
 @pytest.mark.xfail(condition=(sys.platform == "darwin"), reason="Ticket - 173169")

From 816938a9df94e27f7e86a34db19e6ddb64caa1b6 Mon Sep 17 00:00:00 2001
From: "Anastasiya(Asya) Pronina"
Date: Fri, 24 Oct 2025 18:32:34 +0200
Subject: [PATCH 15/15] Reverted update of requirements.txt

---
 tests/python_tests/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index 28735702e6..e24bf2e160 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -1,10 +1,10 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 diffusers==0.35.2
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@3130e907fb7960653039d138493cbb075e128f6a
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@04db016571d1a19c14918553365ee4c05c8b4697
 numpy==1.26.4; platform_system == "Darwin" and platform_machine == "x86_64"
 safetensors==0.6.2; platform_system == "Darwin" and platform_machine == "x86_64"
 pytest==8.4.2
-transformers==4.55.4
+transformers==4.53.3
 hf_transfer==0.1.9
 gguf==0.17.1
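
Illustrative usage sketch (not part of the patch series): the snippet below mirrors how the tests above drive the pipeline from Python. WhisperPipeline, WhisperGenerationConfig(return_timestamps=True) and the result.texts field come from the patched tests; the model directory name, the "CPU" device string and the librosa-based audio loading are assumptions made only for this example.

# Sketch only: model path, device and audio loading are assumptions,
# not something prescribed by the patches.
import librosa                       # assumed helper for reading the audio file
import openvino_genai as ov_genai

models_path = "whisper-tiny-ov"      # assumed directory with the exported OpenVINO Whisper model
raw_speech, _ = librosa.load("sample.wav", sr=16000)   # Whisper expects 16 kHz mono input

pipe = ov_genai.WhisperPipeline(models_path, "CPU")    # another device string, e.g. "NPU", can be passed here
config = ov_genai.WhisperGenerationConfig(return_timestamps=True)

result = pipe.generate(raw_speech.tolist(), config)    # raw speech as a plain list of float samples
print(result.texts[0])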