openvinotoolkit
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
Lines changed: 0 additions & 10 deletions
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
Lines changed: 4 additions & 4 deletions
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
Lines changed: 83 additions & 85 deletions
diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
Lines changed: 14 additions & 14 deletions
@@ -755,16 +755,6 @@ static constexpr ov::Property<ov::AnyMap> shared_lm_head_config{"NPUW_LLM_SHARED
  * NOTE: !! Write-only !!
  */
 static constexpr ov::Property<ov::AnyMap> additional_shared_lm_head_config{"++NPUW_LLM_SHARED_HEAD_CONFIG"};
-
-class INPUWCompiledModelFactory;
-/**
- * @brief
- * Type: std::shared_ptr<INPUWCompiledModelFactory>
- * Factory for creation of ov::npuw::ICompiledModels instances to use by ov::npuw::LLMCompiledModel.
- *
- * NOTE: Enabled in develop build only.
- */
-static constexpr ov::Property<std::shared_ptr<INPUWCompiledModelFactory>>  npuw_compiled_model_factory{"NPUW_LLM_NPUWMODEL_FACTORY_PTR"};
 }  // namespace llm
 
 namespace whisper {
 
@@ -685,7 +685,7 @@ bool ov::npuw::CompiledModel::should_use_quantized_host_gather(const std::shared
     return false;
 }
 
-void ov::npuw::CompiledModelDesc::serialize(std::ostream& stream,
+void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream,
                                             const ov::npuw::s11n::WeightsContext& ctx) const {
     using namespace ov::npuw::s11n;
 
@@ -824,7 +824,7 @@ void ov::npuw::CompiledModelDesc::serialize(std::ostream& stream,
     LOG_DEBUG("DONE.");
 }
 
-void ov::npuw::CompiledModelDesc::deserialize(
+void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(
     std::istream& stream,
     const ov::npuw::s11n::WeightsContext& ctx,
     const ov::npuw::s11n::SubmodelDeserializeCtx& submodel_ctx) {
@@ -1647,8 +1647,8 @@ std::string ov::npuw::CompiledModel::funcall_mem_device(const std::size_t idx) c
     return *comp_model_desc.device_it;
 }
 
-std::vector<ov::npuw::CompiledModelDesc> ov::npuw::CompiledModel::get_compiled_submodels() const {
-    return m_compiled_submodels;
+std::size_t ov::npuw::CompiledModel::num_compiled_submodels() const {
+    return m_compiled_submodels.size();
 }
 
 void ov::npuw::CompiledModel::remove_long_output_names(const std::shared_ptr<ov::Model>& model) {
 
@@ -41,89 +41,6 @@ struct execution_stats {
     std::size_t ops{};
 };
 
-// Compiled Submodel descriptor 
-struct CompiledModelDesc {
-    std::vector<std::string>::const_iterator device_it;
-    std::set<std::string> devices_to_avoid;
-    std::shared_ptr<ov::Model> model;
-    ov::SoPtr<ov::ICompiledModel> compiled_model;
-
-    std::optional<std::size_t> replaced_by;
-
-    Subgraph::Gather host_gather;
-    Subgraph::QuantUnpackGather quant_unpack_gather;
-    std::optional<ov::npuw::compiled::Spatial> spatial;
-    std::optional<ov::npuw::compiled::Attention> attention;
-    std::optional<ov::npuw::compiled::PyramidAttention> pyramid_attention;
-    std::optional<ov::npuw::compiled::HostFlashAttention> host_flash_attention;
-    std::optional<ov::npuw::compiled::MoEExperts> moe_experts;
-    std::optional<ov::npuw::compiled::MoEDownstream> moe_experts_downstream;
-
-    // Infer requests for pyramid attention models (if pyramid_attention is present)
-    std::vector<ov::SoPtr<ov::IAsyncInferRequest>> pyramid_infer_requests;
-
-    // Pipeline infer requests for pyramid attention models (if pyramid_attention is present and pipelining is
-    // enabled)
-    std::vector<ov::SoPtr<ov::IAsyncInferRequest>> pyramid_pipeline_requests;
-
-    // HFA tile model indices for infer request vectors
-    enum HFATileIdx : size_t {
-        REGULAR_TILE = 0,  // Regular tile model (intermediate tiles)
-        FINAL_TILE = 1,    // Final tile model (last tile with division and transpose)
-        COUNT = 2          // Total number of HFA tile models
-    };
-
-    // Infer requests for host flash attention tile models (if host_flash_attention is present)
-    // [REGULAR_TILE]: regular tile model, [FINAL_TILE]: final tile model
-    std::vector<ov::SoPtr<ov::IAsyncInferRequest>> hfa_infer_requests;
-
-    // Pipeline infer requests for host flash attention tile models (if host_flash_attention is present and
-    // pipelining is enabled)
-    std::vector<ov::SoPtr<ov::IAsyncInferRequest>> hfa_pipeline_requests;
-
-    // Infer requests for MoE expert models with different chunk sizes (if moe_experts is present)
-    // Map: chunk_size -> infer_request
-    std::map<size_t, ov::SoPtr<ov::IAsyncInferRequest>> moe_infer_requests;
-
-    // FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
-    // w.r.t. function calls
-    std::size_t param_base = 0;
-
-    struct Closure {
-        std::vector<ov::Tensor> closure;
-        std::vector<int64_t> closure_uid;  // Note: value -1 is considered uninitialized
-        std::vector<bool> is_remote;
-    };
-
-    // Need to wrap closure, since finalize_weights_bank() will
-    // asynchronously evaluate weights and put them in closure.
-    // Other functions of CompiledModel as well as InferRequest and
-    // other entities need to wait for the closure to be populated first
-    // (meaning to wait for async weights processing to end).
-    ov::npuw::util::Delayed<Closure> closure;
-
-    // NB: closure and lazy_closure are of the same size - to preserve proper indexing.
-    //     closure is responsible for host-side tensors (DCOFF, Gather, etc) while
-    //     lazy_closure is used for weights sharing and allocating device memory.
-    std::vector<weights::LazyTensor> lazy_closure;
-    std::vector<ov::Tensor> scales;
-    std::vector<ov::Tensor> zerops;
-
-    bool forced_to_fcall = false;
-
-    // FIXME: Take it out of structure
-    ov::SoPtr<ov::ICompiledModel> ref_compiled_model;
-    bool switched_to_ref = false;
-
-    // Metrics
-    execution_stats stat;
-
-    void serialize(std::ostream& stream, const ov::npuw::s11n::WeightsContext& ctx) const;
-    void deserialize(std::istream& stream,
-                        const ov::npuw::s11n::WeightsContext& ctx,
-                        const ov::npuw::s11n::SubmodelDeserializeCtx& submodel_ctx);
-};
-
 class ICompiledModel : public ov::ICompiledModel {
 public:
     ICompiledModel(const std::shared_ptr<ov::Model>& model, const std::shared_ptr<const ov::IPlugin>& plugin)
@@ -137,7 +54,7 @@ class ICompiledModel : public ov::ICompiledModel {
 
     virtual std::string submodel_device(const std::size_t idx) const = 0;
 
-    virtual std::vector<CompiledModelDesc> get_compiled_submodels() const = 0;
+    virtual std::size_t num_compiled_submodels() const = 0;
 
     virtual void serialize(std::ostream& stream, const ov::npuw::s11n::CompiledContext& ctx) const = 0;
 
@@ -253,7 +170,7 @@ class CompiledModel : public ov::npuw::ICompiledModel {
     std::string global_mem_device() const;
     std::string funcall_mem_device(const std::size_t idx) const;
 
-    std::vector<CompiledModelDesc> get_compiled_submodels() const override;
+    std::size_t num_compiled_submodels() const override;
 
     std::shared_ptr<::intel_npu::OptionsDesc> m_options_desc;
     ::intel_npu::Config m_cfg;
@@ -292,6 +209,87 @@ class CompiledModel : public ov::npuw::ICompiledModel {
 
     void init_profiling();
 
+    struct CompiledModelDesc {
+        DevList::const_iterator device_it;
+        std::set<std::string> devices_to_avoid;
+        std::shared_ptr<ov::Model> model;
+        ov::SoPtr<ov::ICompiledModel> compiled_model;
+
+        std::optional<std::size_t> replaced_by;
+
+        Subgraph::Gather host_gather;
+        Subgraph::QuantUnpackGather quant_unpack_gather;
+        std::optional<ov::npuw::compiled::Spatial> spatial;
+        std::optional<ov::npuw::compiled::Attention> attention;
+        std::optional<ov::npuw::compiled::PyramidAttention> pyramid_attention;
+        std::optional<ov::npuw::compiled::HostFlashAttention> host_flash_attention;
+        std::optional<ov::npuw::compiled::MoEExperts> moe_experts;
+        std::optional<ov::npuw::compiled::MoEDownstream> moe_experts_downstream;
+
+        // Infer requests for pyramid attention models (if pyramid_attention is present)
+        std::vector<ov::SoPtr<ov::IAsyncInferRequest>> pyramid_infer_requests;
+
+        // Pipeline infer requests for pyramid attention models (if pyramid_attention is present and pipelining is
+        // enabled)
+        std::vector<ov::SoPtr<ov::IAsyncInferRequest>> pyramid_pipeline_requests;
+
+        // HFA tile model indices for infer request vectors
+        enum HFATileIdx : size_t {
+            REGULAR_TILE = 0,  // Regular tile model (intermediate tiles)
+            FINAL_TILE = 1,    // Final tile model (last tile with division and transpose)
+            COUNT = 2          // Total number of HFA tile models
+        };
+
+        // Infer requests for host flash attention tile models (if host_flash_attention is present)
+        // [REGULAR_TILE]: regular tile model, [FINAL_TILE]: final tile model
+        std::vector<ov::SoPtr<ov::IAsyncInferRequest>> hfa_infer_requests;
+
+        // Pipeline infer requests for host flash attention tile models (if host_flash_attention is present and
+        // pipelining is enabled)
+        std::vector<ov::SoPtr<ov::IAsyncInferRequest>> hfa_pipeline_requests;
+
+        // Infer requests for MoE expert models with different chunk sizes (if moe_experts is present)
+        // Map: chunk_size -> infer_request
+        std::map<size_t, ov::SoPtr<ov::IAsyncInferRequest>> moe_infer_requests;
+
+        // FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
+        // w.r.t. function calls
+        std::size_t param_base = 0;
+
+        struct Closure {
+            std::vector<ov::Tensor> closure;
+            std::vector<int64_t> closure_uid;  // Note: value -1 is considered uninitialized
+            std::vector<bool> is_remote;
+        };
+
+        // Need to wrap closure, since finalize_weights_bank() will
+        // asynchronously evaluate weights and put them in closure.
+        // Other functions of CompiledModel as well as InferRequest and
+        // other entities need to wait for the closure to be populated first
+        // (meaning to wait for async weights processing to end).
+        ov::npuw::util::Delayed<Closure> closure;
+
+        // NB: closure and lazy_closure are of the same size - to preserve proper indexing.
+        //     closure is responsible for host-side tensors (DCOFF, Gather, etc) while
+        //     lazy_closure is used for weights sharing and allocating device memory.
+        std::vector<weights::LazyTensor> lazy_closure;
+        std::vector<ov::Tensor> scales;
+        std::vector<ov::Tensor> zerops;
+
+        bool forced_to_fcall = false;
+
+        // FIXME: Take it out of structure
+        ov::SoPtr<ov::ICompiledModel> ref_compiled_model;
+        bool switched_to_ref = false;
+
+        // Metrics
+        execution_stats stat;
+
+        void serialize(std::ostream& stream, const ov::npuw::s11n::WeightsContext& ctx) const;
+        void deserialize(std::istream& stream,
+                         const ov::npuw::s11n::WeightsContext& ctx,
+                         const ov::npuw::s11n::SubmodelDeserializeCtx& submodel_ctx);
+    };
     std::vector<CompiledModelDesc> m_compiled_submodels;
 
     std::function<bool(const ov::SoPtr<ov::ITensor>&, const ov::SoPtr<ov::ITensor>&)> m_acc_check;
 
@@ -1205,18 +1205,18 @@ void ov::npuw::JustInferRequest::setup_hfa_infer_requests(std::size_t real_idx,
     }
 
     // Allocate storage for infer requests: [REGULAR_TILE] and [FINAL_TILE]
-    submodel_desc.hfa_infer_requests.resize(CompiledModelDesc::HFATileIdx::COUNT);
+    submodel_desc.hfa_infer_requests.resize(CompiledModel::CompiledModelDesc::HFATileIdx::COUNT);
     if (is_piped) {
-        submodel_desc.hfa_pipeline_requests.resize(CompiledModelDesc::HFATileIdx::COUNT);
+        submodel_desc.hfa_pipeline_requests.resize(CompiledModel::CompiledModelDesc::HFATileIdx::COUNT);
     }
 
     // Create infer request for regular tile model
     try {
         LOG_INFO("Creating infer request for HFA regular tile model...");
-        submodel_desc.hfa_infer_requests[CompiledModelDesc::HFATileIdx::REGULAR_TILE] =
+        submodel_desc.hfa_infer_requests[CompiledModel::CompiledModelDesc::HFATileIdx::REGULAR_TILE] =
             hfa._compiled_tile_model->create_infer_request();
         if (is_piped) {
-            submodel_desc.hfa_pipeline_requests[CompiledModelDesc::HFATileIdx::REGULAR_TILE] =
+            submodel_desc.hfa_pipeline_requests[CompiledModel::CompiledModelDesc::HFATileIdx::REGULAR_TILE] =
                 hfa._compiled_tile_model->create_infer_request();
         }
     } catch (const std::exception& ex) {
@@ -1232,10 +1232,10 @@ void ov::npuw::JustInferRequest::setup_hfa_infer_requests(std::size_t real_idx,
     // For final tile model, reuse the main compiled_model's infer request
     // because compiled_model points to _compiled_final_tile_model for HFA
     LOG_INFO("Reusing " << (is_recreate ? "recreated " : "") << "main infer request for HFA final tile model");
-    submodel_desc.hfa_infer_requests[CompiledModelDesc::HFATileIdx::FINAL_TILE] =
+    submodel_desc.hfa_infer_requests[CompiledModel::CompiledModelDesc::HFATileIdx::FINAL_TILE] =
         m_subrequests[real_idx];
     if (is_piped) {
-        submodel_desc.hfa_pipeline_requests[CompiledModelDesc::HFATileIdx::FINAL_TILE] =
+        submodel_desc.hfa_pipeline_requests[CompiledModel::CompiledModelDesc::HFATileIdx::FINAL_TILE] =
             m_funcall_pipeline[real_idx].subrequest;
     }
 
@@ -1248,14 +1248,14 @@ void ov::npuw::JustInferRequest::setup_hfa_infer_requests(std::size_t real_idx,
 
         // Directly share tensor from main infer request to regular tile request
         auto main_tensor = m_subrequests[real_idx]->get_tensor(final_tile_input);
-        submodel_desc.hfa_infer_requests[CompiledModelDesc::HFATileIdx::REGULAR_TILE]->set_tensor(
+        submodel_desc.hfa_infer_requests[CompiledModel::CompiledModelDesc::HFATileIdx::REGULAR_TILE]->set_tensor(
             tile_input,
             main_tensor);
 
         // Repeat for pipeline infer request if pipelined
         if (is_piped) {
             auto pipeline_tensor = m_funcall_pipeline[real_idx].subrequest->get_tensor(final_tile_input);
-            submodel_desc.hfa_pipeline_requests[CompiledModelDesc::HFATileIdx::REGULAR_TILE]->set_tensor(
+            submodel_desc.hfa_pipeline_requests[CompiledModel::CompiledModelDesc::HFATileIdx::REGULAR_TILE]->set_tensor(
                 tile_input,
                 pipeline_tensor);
         }
@@ -1297,13 +1297,13 @@ void ov::npuw::JustInferRequest::setup_hfa_infer_requests(std::size_t real_idx,
 
         // Get state tensors from regular tile request
         auto state_acc =
-            submodel_desc.hfa_infer_requests[CompiledModelDesc::HFATileIdx::REGULAR_TILE]->get_tensor(
+            submodel_desc.hfa_infer_requests[CompiledModel::CompiledModelDesc::HFATileIdx::REGULAR_TILE]->get_tensor(
                 hfa._compiled_tile_model->inputs()[tile_in.acc]);
         auto state_max =
-            submodel_desc.hfa_infer_requests[CompiledModelDesc::HFATileIdx::REGULAR_TILE]->get_tensor(
+            submodel_desc.hfa_infer_requests[CompiledModel::CompiledModelDesc::HFATileIdx::REGULAR_TILE]->get_tensor(
                 hfa._compiled_tile_model->inputs()[tile_in.max]);
         auto state_sum =
-            submodel_desc.hfa_infer_requests[CompiledModelDesc::HFATileIdx::REGULAR_TILE]->get_tensor(
+            submodel_desc.hfa_infer_requests[CompiledModel::CompiledModelDesc::HFATileIdx::REGULAR_TILE]->get_tensor(
                 hfa._compiled_tile_model->inputs()[tile_in.d]);
 
         // Initialize state tensors with zeros/minus infinity
@@ -1529,7 +1529,7 @@ void ov::npuw::JustInferRequest::run_hfa_tiled_inference(std::size_t real_idx, s
     auto& hfa_desc = comp_model_desc.host_flash_attention.value();
 
     NPUW_ASSERT(hfa_desc.is_valid() && "HFA configuration must be valid");
-    NPUW_ASSERT(comp_model_desc.hfa_infer_requests.size() == CompiledModelDesc::HFATileIdx::COUNT &&
+    NPUW_ASSERT(comp_model_desc.hfa_infer_requests.size() == CompiledModel::CompiledModelDesc::HFATileIdx::COUNT &&
                 "HFA infer requests must be created");
 
     // Calculate tile configuration
@@ -1565,9 +1565,9 @@ void ov::npuw::JustInferRequest::run_hfa_tiled_inference(std::size_t real_idx, s
 
     // Get tile infer requests
     auto& regular_tile_request =
-        comp_model_desc.hfa_infer_requests[CompiledModelDesc::HFATileIdx::REGULAR_TILE];
+        comp_model_desc.hfa_infer_requests[CompiledModel::CompiledModelDesc::HFATileIdx::REGULAR_TILE];
     auto& final_tile_request =
-        comp_model_desc.hfa_infer_requests[CompiledModelDesc::HFATileIdx::FINAL_TILE];
+        comp_model_desc.hfa_infer_requests[CompiledModel::CompiledModelDesc::HFATileIdx::FINAL_TILE];
 
     // Use pre-cached indices (populated during compilation)
     const auto& tile_in = sdpa_info._tile_input_indices;
Original file line number	Diff line number	Diff line change
`@@ -685,7 +685,7 @@ bool ov::npuw::CompiledModel::should_use_quantized_host_gather(const std::shared`
`685`	`685`	`return false;`
`686`	`686`	`}`
`687`	`687`
`688`		`-void ov::npuw::CompiledModelDesc::serialize(std::ostream& stream,`
	`688`	`+void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream,`
`689`	`689`	`const ov::npuw::s11n::WeightsContext& ctx) const {`
`690`	`690`	`using namespace ov::npuw::s11n;`
`691`	`691`
`@@ -824,7 +824,7 @@ void ov::npuw::CompiledModelDesc::serialize(std::ostream& stream,`
`824`	`824`	`LOG_DEBUG("DONE.");`
`825`	`825`	`}`
`826`	`826`
`827`		`-void ov::npuw::CompiledModelDesc::deserialize(`
	`827`	`+void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(`
`828`	`828`	`std::istream& stream,`
`829`	`829`	`const ov::npuw::s11n::WeightsContext& ctx,`
`830`	`830`	`const ov::npuw::s11n::SubmodelDeserializeCtx& submodel_ctx) {`
`@@ -1647,8 +1647,8 @@ std::string ov::npuw::CompiledModel::funcall_mem_device(const std::size_t idx) c`
`1647`	`1647`	`return *comp_model_desc.device_it;`
`1648`	`1648`	`}`
`1649`	`1649`
`1650`		`-std::vector<ov::npuw::CompiledModelDesc> ov::npuw::CompiledModel::get_compiled_submodels() const {`
`1651`		`- return m_compiled_submodels;`
	`1650`	`+std::size_t ov::npuw::CompiledModel::num_compiled_submodels() const {`
	`1651`	`+ return m_compiled_submodels.size();`
`1652`	`1652`	`}`
`1653`	`1653`
`1654`	`1654`	`void ov::npuw::CompiledModel::remove_long_output_names(const std::shared_ptr<ov::Model>& model) {`