Skip to content

Commit 7e29a70

Browse files
committed
Added handling of cache_position for stateless Whisper pipeline
1 parent 6b728f5 commit 7e29a70

File tree

8 files changed

+28
-20
lines changed

8 files changed

+28
-20
lines changed

src/cpp/src/utils.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -840,6 +840,14 @@ void export_model(ov::CompiledModel& compiled_model, const std::filesystem::path
840840
out.close();
841841
}
842842

843+
bool has_input(const std::shared_ptr<ov::Model>& model, const std::string& name) {
844+
auto inputs = model->inputs();
845+
auto it = std::find_if(inputs.begin(), inputs.end(), [&](const auto& port) {
846+
return port.get_names().count(name) != 0;
847+
});
848+
return it != inputs.end();
849+
}
850+
843851
} // namespace utils
844852
} // namespace genai
845853
} // namespace ov

src/cpp/src/utils.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,11 @@ ov::CompiledModel import_model(const std::filesystem::path& blob_path,
309309
*/
310310
void export_model(ov::CompiledModel& compiled_model, const std::filesystem::path& blob_path);
311311

312+
/**
313+
* @brief Checks if the model has an input with the specified name.
314+
*/
315+
bool has_input(const std::shared_ptr<Model>& model, const std::string& name);
316+
312317
} // namespace utils
313318
} // namespace genai
314319
} // namespace ov

src/cpp/src/whisper/models/statefull_decoder.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
#include "statefull_decoder.hpp"
55

66
#include "utils.hpp"
7-
#include "whisper/whisper_utils.hpp"
87

98
namespace {
109
void reshape_hidden_states_to_static(std::shared_ptr<ov::Model> model, const ov::PartialShape& lhstates_shape) {
@@ -26,7 +25,7 @@ WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& mo
2625

2726
auto model = core.read_model(models_path / "openvino_decoder_model.xml", {}, properties);
2827

29-
m_has_cache_position = ov::genai::utils::input_exists(model, "cache_position");
28+
m_has_cache_position = utils::has_input(model, "cache_position");
3029

3130
ov::CompiledModel compiled_model;
3231
if (device == "NPU") {

src/cpp/src/whisper/models/with_past_decoder.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,11 @@ WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& mode
8686
"To obtain stateful decoder model use latest `optimum-intel` package:\n"
8787
"pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git@main\n"
8888
"optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny whisper-tiny");
89+
8990
ov::Core core = utils::singleton_core();
9091

92+
m_has_cache_position = utils::has_input(core.read_model(models_path / "openvino_decoder_model.xml"), "cache_position");
93+
9194
auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties);
9295
utils::print_compiled_model_properties(compiled_model, "whisper decoder model");
9396
m_request_decoder = compiled_model.create_infer_request();
@@ -110,9 +113,11 @@ void WhisperWithPastDecoder::start_async(const Tensor& encoder_hidden_state,
110113
request.set_tensor("input_ids", input_ids);
111114

112115
if (!is_initial_step) {
113-
ov::Tensor cache_position_tensor = request.get_tensor("cache_position");
114-
cache_position_tensor.set_shape({1});
115-
cache_position_tensor.data<int64_t>()[0] = m_cache_position;
116+
if (m_has_cache_position) {
117+
ov::Tensor cache_position_tensor = request.get_tensor("cache_position");
118+
cache_position_tensor.set_shape({1});
119+
cache_position_tensor.data<int64_t>()[0] = m_cache_position;
120+
}
116121
}
117122

118123
_set_past_key_value(beam_idx);

src/cpp/src/whisper/models/with_past_decoder.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ class WhisperWithPastDecoder : public WhisperDecoder {
2626
size_t m_cache_position = 0;
2727
bool m_initial_past_key_value_set = false;
2828
bool m_past_key_value_linked = false;
29+
bool m_has_cache_position = true;
2930

3031
void _set_past_key_value(const Tensor& beam_idx);
3132
};

src/cpp/src/whisper/pipeline_static.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1006,7 +1006,7 @@ std::shared_ptr<ov::Model> prepare_decoder_model(std::shared_ptr<ov::Model>& mod
10061006
// 3) Expose all states that requires initialization on the first run as outputs
10071007
expose_runtime_states_as_outputs(decoder_model);
10081008
// 4) Remove cache_position input if it exists
1009-
if (ov::genai::utils::input_exists(decoder_model, "cache_position")) {
1009+
if (ov::genai::utils::has_input(decoder_model, "cache_position")) {
10101010
remove_cache_position(decoder_model);
10111011
}
10121012
// 5) Normalize output names - should be done in stateful_to_stateless_transformation
@@ -1023,10 +1023,6 @@ std::shared_ptr<ov::Model> prepare_decoder_with_past_model(std::shared_ptr<ov::M
10231023
normalize_output_key_value_names(decoder_with_past_model);
10241024
expose_runtime_states_as_inputs(decoder_with_past_model);
10251025

1026-
if (!ov::genai::utils::input_exists(decoder_with_past_model, "cache_position")) {
1027-
add_cache_position_input(decoder_with_past_model);
1028-
}
1029-
10301026
decoder_with_past_model->reshape({{"input_ids", ov::PartialShape({-1, 1})}});
10311027
decoder_with_past_model->set_friendly_name("Model6");
10321028

@@ -1066,6 +1062,10 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys
10661062
if (!decoder_model || !decoder_with_past_model)
10671063
OPENVINO_THROW("Decoder/decoder_with_past model is not valid !");
10681064

1065+
if (!ov::genai::utils::has_input(decoder_with_past_model, "cache_position")) {
1066+
add_cache_position_input(decoder_with_past_model);
1067+
}
1068+
10691069
add_attention_mask_input(decoder_model, true /* transform_cross_attn */, last_hidden_state_shape[1].get_length());
10701070
// NB: Note, there is no need to transform cross attention for decoder_with_past_model
10711071
// as it accepts only single token and there can't be any padding.

src/cpp/src/whisper/whisper_utils.cpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -57,14 +57,6 @@ int64_t argmax(const ov::Tensor& logits, const size_t batch_idx) {
5757
return out_token;
5858
}
5959

60-
bool input_exists(const std::shared_ptr<ov::Model>& model, const std::string& name) {
61-
auto inputs = model->inputs();
62-
auto it = std::find_if(inputs.begin(), inputs.end(), [&](const auto& port) {
63-
return port.get_names().count(name) != 0;
64-
});
65-
return it != inputs.end();
66-
}
67-
6860
} // namespace utils
6961
} // namespace genai
7062
} // namespace ov

src/cpp/src/whisper/whisper_utils.hpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@ void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
1919

2020
int64_t argmax(const ov::Tensor& logits, const size_t batch_idx);
2121

22-
bool input_exists(const std::shared_ptr<ov::Model>& model, const std::string& name);
23-
2422
} // namespace utils
2523
} // namespace genai
2624
} // namespace ov

0 commit comments

Comments (0)