@@ -41,89 +41,6 @@ struct execution_stats {
4141 std::size_t ops{};
4242};
4343
44- // Compiled Submodel descriptor
45- struct CompiledModelDesc {
46- std::vector<std::string>::const_iterator device_it;
47- std::set<std::string> devices_to_avoid;
48- std::shared_ptr<ov::Model> model;
49- ov::SoPtr<ov::ICompiledModel> compiled_model;
50-
51- std::optional<std::size_t > replaced_by;
52-
53- Subgraph::Gather host_gather;
54- Subgraph::QuantUnpackGather quant_unpack_gather;
55- std::optional<ov::npuw::compiled::Spatial> spatial;
56- std::optional<ov::npuw::compiled::Attention> attention;
57- std::optional<ov::npuw::compiled::PyramidAttention> pyramid_attention;
58- std::optional<ov::npuw::compiled::HostFlashAttention> host_flash_attention;
59- std::optional<ov::npuw::compiled::MoEExperts> moe_experts;
60- std::optional<ov::npuw::compiled::MoEDownstream> moe_experts_downstream;
61-
62- // Infer requests for pyramid attention models (if pyramid_attention is present)
63- std::vector<ov::SoPtr<ov::IAsyncInferRequest>> pyramid_infer_requests;
64-
65- // Pipeline infer requests for pyramid attention models (if pyramid_attention is present and pipelining is
66- // enabled)
67- std::vector<ov::SoPtr<ov::IAsyncInferRequest>> pyramid_pipeline_requests;
68-
69- // HFA tile model indices for infer request vectors
70- enum HFATileIdx : size_t {
71- REGULAR_TILE = 0 , // Regular tile model (intermediate tiles)
72- FINAL_TILE = 1 , // Final tile model (last tile with division and transpose)
73- COUNT = 2 // Total number of HFA tile models
74- };
75-
76- // Infer requests for host flash attention tile models (if host_flash_attention is present)
77- // [REGULAR_TILE]: regular tile model, [FINAL_TILE]: final tile model
78- std::vector<ov::SoPtr<ov::IAsyncInferRequest>> hfa_infer_requests;
79-
80- // Pipeline infer requests for host flash attention tile models (if host_flash_attention is present and
81- // pipelining is enabled)
82- std::vector<ov::SoPtr<ov::IAsyncInferRequest>> hfa_pipeline_requests;
83-
84- // Infer requests for MoE expert models with different chunk sizes (if moe_experts is present)
85- // Map: chunk_size -> infer_request
86- std::map<size_t , ov::SoPtr<ov::IAsyncInferRequest>> moe_infer_requests;
87-
88- // FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
89- // w.r.t. function calls
90- std::size_t param_base = 0 ;
91-
92- struct Closure {
93- std::vector<ov::Tensor> closure;
94- std::vector<int64_t > closure_uid; // Note: value -1 is considered uninitialized
95- std::vector<bool > is_remote;
96- };
97-
98- // Need to wrap closure, since finalize_weights_bank() will
99- // asynchronously evaluate weights and put them in closure.
100- // Other functions of CompiledModel as well as InferRequest and
101- // other entities need to wait for the closure to be populated first
102- // (meaning to wait for async weights processing to end).
103- ov::npuw::util::Delayed<Closure> closure;
104-
105- // NB: closure and lazy_closure are of the same size - to preserve proper indexing.
106- // closure is responsible for host-side tensors (DCOFF, Gather, etc) while
107- // lazy_closure is used for weights sharing and allocating device memory.
108- std::vector<weights::LazyTensor> lazy_closure;
109- std::vector<ov::Tensor> scales;
110- std::vector<ov::Tensor> zerops;
111-
112- bool forced_to_fcall = false ;
113-
114- // FIXME: Take it out of structure
115- ov::SoPtr<ov::ICompiledModel> ref_compiled_model;
116- bool switched_to_ref = false ;
117-
118- // Metrics
119- execution_stats stat;
120-
121- void serialize (std::ostream& stream, const ov::npuw::s11n::WeightsContext& ctx) const ;
122- void deserialize (std::istream& stream,
123- const ov::npuw::s11n::WeightsContext& ctx,
124- const ov::npuw::s11n::SubmodelDeserializeCtx& submodel_ctx);
125- };
126-
12744class ICompiledModel : public ov ::ICompiledModel {
12845public:
12946 ICompiledModel (const std::shared_ptr<ov::Model>& model, const std::shared_ptr<const ov::IPlugin>& plugin)
@@ -137,7 +54,7 @@ class ICompiledModel : public ov::ICompiledModel {
13754
13855 virtual std::string submodel_device (const std::size_t idx) const = 0;
13956
140- virtual std::vector<CompiledModelDesc> get_compiled_submodels () const = 0;
57+ virtual std::size_t num_compiled_submodels () const = 0;
14158
14259 virtual void serialize (std::ostream& stream, const ov::npuw::s11n::CompiledContext& ctx) const = 0;
14360
@@ -253,7 +170,7 @@ class CompiledModel : public ov::npuw::ICompiledModel {
253170 std::string global_mem_device () const ;
254171 std::string funcall_mem_device (const std::size_t idx) const ;
255172
256- std::vector<CompiledModelDesc> get_compiled_submodels () const override ;
173+ std::size_t num_compiled_submodels () const override ;
257174
258175 std::shared_ptr<::intel_npu::OptionsDesc> m_options_desc;
259176 ::intel_npu::Config m_cfg;
@@ -292,6 +209,87 @@ class CompiledModel : public ov::npuw::ICompiledModel {
292209
293210 void init_profiling ();
294211
212+ struct CompiledModelDesc { // Compiled submodel descriptor; element type of m_compiled_submodels
213+ DevList::const_iterator device_it; // NOTE(review): DevList is declared outside this hunk; presumably a position in the candidate device list - confirm
214+ std::set<std::string> devices_to_avoid; // NOTE(review): presumably device names to skip when choosing a device - confirm
215+ std::shared_ptr<ov::Model> model;
216+ ov::SoPtr<ov::ICompiledModel> compiled_model;
217+
218+ std::optional<std::size_t > replaced_by; // NOTE(review): presumably the index of another submodel that stands in for this one; empty otherwise - confirm
219+
220+ Subgraph::Gather host_gather;
221+ Subgraph::QuantUnpackGather quant_unpack_gather;
222+ std::optional<ov::npuw::compiled::Spatial> spatial;
223+ std::optional<ov::npuw::compiled::Attention> attention;
224+ std::optional<ov::npuw::compiled::PyramidAttention> pyramid_attention;
225+ std::optional<ov::npuw::compiled::HostFlashAttention> host_flash_attention;
226+ std::optional<ov::npuw::compiled::MoEExperts> moe_experts;
227+ std::optional<ov::npuw::compiled::MoEDownstream> moe_experts_downstream;
228+
229+ // Infer requests for pyramid attention models (if pyramid_attention is present)
230+ std::vector<ov::SoPtr<ov::IAsyncInferRequest>> pyramid_infer_requests;
231+
232+ // Pipeline infer requests for pyramid attention models (if pyramid_attention is present and pipelining is
233+ // enabled)
234+ std::vector<ov::SoPtr<ov::IAsyncInferRequest>> pyramid_pipeline_requests;
235+
236+ // HFA tile model indices for infer request vectors
237+ enum HFATileIdx : size_t { // Unscoped enum: enumerators convert implicitly to size_t when used as indices
238+ REGULAR_TILE = 0 , // Regular tile model (intermediate tiles)
239+ FINAL_TILE = 1 , // Final tile model (last tile with division and transpose)
240+ COUNT = 2 // Total number of HFA tile models
241+ };
242+
243+ // Infer requests for host flash attention tile models (if host_flash_attention is present)
244+ // [REGULAR_TILE]: regular tile model, [FINAL_TILE]: final tile model
245+ std::vector<ov::SoPtr<ov::IAsyncInferRequest>> hfa_infer_requests;
246+
247+ // Pipeline infer requests for host flash attention tile models (if host_flash_attention is present and
248+ // pipelining is enabled)
249+ std::vector<ov::SoPtr<ov::IAsyncInferRequest>> hfa_pipeline_requests;
250+
251+ // Infer requests for MoE expert models with different chunk sizes (if moe_experts is present)
252+ // Map: chunk_size -> infer_request
253+ std::map<size_t , ov::SoPtr<ov::IAsyncInferRequest>> moe_infer_requests;
254+
255+ // FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
256+ // w.r.t. function calls
257+ std::size_t param_base = 0 ;
258+
259+ struct Closure { // Payload wrapped by the Delayed<Closure> member declared below
260+ std::vector<ov::Tensor> closure;
261+ std::vector<int64_t > closure_uid; // Note: value -1 is considered uninitialized
262+ std::vector<bool > is_remote; // NOTE(review): presumably one flag per entry of `closure` - confirm
263+ };
264+
265+ // Need to wrap closure, since finalize_weights_bank() will
266+ // asynchronously evaluate weights and put them in closure.
267+ // Other functions of CompiledModel as well as InferRequest and
268+ // other entities need to wait for the closure to be populated first
269+ // (meaning to wait for async weights processing to end).
270+ ov::npuw::util::Delayed<Closure> closure;
271+
272+ // NB: closure and lazy_closure are of the same size - to preserve proper indexing.
273+ // closure is responsible for host-side tensors (DCOFF, Gather, etc) while
274+ // lazy_closure is used for weights sharing and allocating device memory.
275+ std::vector<weights::LazyTensor> lazy_closure;
276+ std::vector<ov::Tensor> scales; // NOTE(review): presumably quantization scales - confirm
277+ std::vector<ov::Tensor> zerops; // NOTE(review): presumably quantization zero points - confirm
278+
279+ bool forced_to_fcall = false ;
280+
281+ // FIXME: Take it out of structure
282+ ov::SoPtr<ov::ICompiledModel> ref_compiled_model;
283+ bool switched_to_ref = false ; // NOTE(review): presumably set once execution falls back to ref_compiled_model - confirm
284+
285+ // Metrics
286+ execution_stats stat;
287+
288+ void serialize (std::ostream& stream, const ov::npuw::s11n::WeightsContext& ctx) const ; // Declaration only; the definition is not in this hunk
289+ void deserialize (std::istream& stream, // Declaration only; takes an extra per-submodel context compared to serialize()
290+ const ov::npuw::s11n::WeightsContext& ctx,
291+ const ov::npuw::s11n::SubmodelDeserializeCtx& submodel_ctx);
292+ };
295293 std::vector<CompiledModelDesc> m_compiled_submodels;
296294
297295 std::function<bool (const ov::SoPtr<ov::ITensor>&, const ov::SoPtr<ov::ITensor>&)> m_acc_check;
0 commit comments