@@ -74,7 +74,6 @@ std::optional<uint32_t> pop_int_and_cast(ov::AnyMap& config, const std::string&
7474}
7575
7676void update_npu_config (ov::AnyMap& config,
77- const std::shared_ptr<ov::Model>& model,
7877 const ov::genai::utils::KVAxesPosition& kv_pos,
7978 const ov::genai::utils::KVDesc& kv_desc) {
8079 update_config (config, {" NPU_USE_NPUW" , " YES" });
@@ -97,6 +96,26 @@ void update_npu_config(ov::AnyMap& config,
9796 rename_key (config, " ++SHARED_HEAD_CONFIG" , " ++NPUW_LLM_SHARED_HEAD_CONFIG" );
9897}
9998
99+ void update_npu_config_whisper (ov::AnyMap& config,
100+ const ov::genai::utils::KVAxesPosition& kv_pos,
101+ const ov::genai::utils::KVDesc& kv_desc) {
102+ update_config (config, {" NPU_USE_NPUW" , " YES" });
103+ update_config (config, {" NPUW_ONLINE_PIPELINE" , " NONE" });
104+ update_config (config, {" NPUW_FUNCALL_FOR_ALL" , " NO" });
105+ update_config (config, {" NPUW_FOLD" , " NO" });
106+ update_config (config, {" NPUW_LLM" , " YES" });
107+ update_config (config, {" NPUW_WHISPER" , " YES" });
108+
109+ update_config (config, {" NPUW_LLM_BATCH_DIM" , kv_pos.batch });
110+ update_config (config, {" NPUW_LLM_SEQ_LEN_DIM" , kv_pos.seq_len });
111+
112+ update_config (config, {" NPUW_LLM_MAX_PROMPT_LEN" , kv_desc.max_prompt_len });
113+ update_config (config, {" NPUW_LLM_MIN_RESPONSE_LEN" , kv_desc.min_response_len });
114+
115+ // To disable chunking
116+ update_config (config, {" NPUW_LLM_PREFILL_HINT" , " STATIC" });
117+ }
118+
100119inline bool is_paged_attention_available () {
101120#if defined(OPENVINO_ARCH_X86_64) || defined(OPENVINO_ARCH_ARM64)
102121 return true ;
@@ -554,7 +573,8 @@ void print_scheduler_config_info(const SchedulerConfig &scheduler_config) {
554573std::pair<ov::CompiledModel, KVDesc>
555574compile_decoder_for_npu (const std::shared_ptr<ov::Model>& model,
556575 const ov::AnyMap& config,
557- const KVAxesPosition& kv_pos) {
576+ const KVAxesPosition& kv_pos,
577+ const bool is_whisper) {
558578 ov::CompiledModel compiled;
559579 ov::AnyMap properties = config;
560580 KVDesc kv_desc;
@@ -575,9 +595,16 @@ compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
575595 kv_desc.max_prompt_len = compiled.get_property (" NPUW_LLM_MAX_PROMPT_LEN" ).as <uint32_t >();
576596 kv_desc.min_response_len = compiled.get_property (" NPUW_LLM_MIN_RESPONSE_LEN" ).as <uint32_t >();
577597 } else {
578- kv_desc.max_prompt_len = pop_int_and_cast (properties, " MAX_PROMPT_LEN" ).value_or (1024u );
579- kv_desc.min_response_len = pop_int_and_cast (properties, " MIN_RESPONSE_LEN" ).value_or (128u );
580- update_npu_config (properties, model, kv_pos, kv_desc);
598+ if (is_whisper) {
599+ kv_desc.max_prompt_len = pop_int_and_cast (properties, " MAX_PROMPT_LEN" ).value_or (4u );
600+ // kvcache size for Whisper = 448u (MAX_PROMPT_LEN + MIN_RESPONSE_LEN)
601+ kv_desc.min_response_len = pop_int_and_cast (properties, " MIN_RESPONSE_LEN" ).value_or (444u );
602+ update_npu_config_whisper (properties, kv_pos, kv_desc);
603+ } else {
604+ kv_desc.max_prompt_len = pop_int_and_cast (properties, " MAX_PROMPT_LEN" ).value_or (1024u );
605+ kv_desc.min_response_len = pop_int_and_cast (properties, " MIN_RESPONSE_LEN" ).value_or (128u );
606+ update_npu_config (properties, kv_pos, kv_desc);
607+ }
581608 compiled = ov::genai::utils::singleton_core ().compile_model (model, " NPU" , properties);
582609 // Also export compiled model if required
583610 if (export_blob) {
@@ -813,6 +840,14 @@ void export_model(ov::CompiledModel& compiled_model, const std::filesystem::path
813840 out.close ();
814841}
815842
843+ bool has_input (const std::shared_ptr<ov::Model>& model, const std::string& name) {
844+ auto inputs = model->inputs ();
845+ auto it = std::find_if (inputs.begin (), inputs.end (), [&](const auto & port) {
846+ return port.get_names ().count (name) != 0 ;
847+ });
848+ return it != inputs.end ();
849+ }
850+
816851} // namespace utils
817852} // namespace genai
818853} // namespace ov
0 commit comments