Skip to content

Commit 3ecdfba

Browse files
committed
Switched from property to explicit instantiation of LLMCompiledModel in unit tests
1 parent ffc7fdd commit 3ecdfba

File tree

13 files changed

+453
-315
lines changed

13 files changed

+453
-315
lines changed

src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -755,16 +755,6 @@ static constexpr ov::Property<ov::AnyMap> shared_lm_head_config{"NPUW_LLM_SHARED
755755
* NOTE: !! Write-only !!
756756
*/
757757
static constexpr ov::Property<ov::AnyMap> additional_shared_lm_head_config{"++NPUW_LLM_SHARED_HEAD_CONFIG"};
758-
759-
class INPUWCompiledModelFactory;
760-
/**
761-
* @brief
762-
* Type: std::shared_ptr<INPUWCompiledModelFactory>
763-
* Factory for creation of ov::npuw::ICompiledModels instances to use by ov::npuw::LLMCompiledModel.
764-
*
765-
* NOTE: Enabled in develop build only.
766-
*/
767-
static constexpr ov::Property<std::shared_ptr<INPUWCompiledModelFactory>> npuw_compiled_model_factory{"NPUW_LLM_NPUWMODEL_FACTORY_PTR"};
768758
} // namespace llm
769759

770760
namespace whisper {

src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -685,7 +685,7 @@ bool ov::npuw::CompiledModel::should_use_quantized_host_gather(const std::shared
685685
return false;
686686
}
687687

688-
void ov::npuw::CompiledModelDesc::serialize(std::ostream& stream,
688+
void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream,
689689
const ov::npuw::s11n::WeightsContext& ctx) const {
690690
using namespace ov::npuw::s11n;
691691

@@ -824,7 +824,7 @@ void ov::npuw::CompiledModelDesc::serialize(std::ostream& stream,
824824
LOG_DEBUG("DONE.");
825825
}
826826

827-
void ov::npuw::CompiledModelDesc::deserialize(
827+
void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(
828828
std::istream& stream,
829829
const ov::npuw::s11n::WeightsContext& ctx,
830830
const ov::npuw::s11n::SubmodelDeserializeCtx& submodel_ctx) {
@@ -1647,8 +1647,8 @@ std::string ov::npuw::CompiledModel::funcall_mem_device(const std::size_t idx) c
16471647
return *comp_model_desc.device_it;
16481648
}
16491649

1650-
std::vector<ov::npuw::CompiledModelDesc> ov::npuw::CompiledModel::get_compiled_submodels() const {
1651-
return m_compiled_submodels;
1650+
std::size_t ov::npuw::CompiledModel::num_compiled_submodels() const {
1651+
return m_compiled_submodels.size();
16521652
}
16531653

16541654
void ov::npuw::CompiledModel::remove_long_output_names(const std::shared_ptr<ov::Model>& model) {

src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp

Lines changed: 83 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -41,89 +41,6 @@ struct execution_stats {
4141
std::size_t ops{};
4242
};
4343

44-
// Compiled Submodel descriptor
45-
struct CompiledModelDesc {
46-
std::vector<std::string>::const_iterator device_it;
47-
std::set<std::string> devices_to_avoid;
48-
std::shared_ptr<ov::Model> model;
49-
ov::SoPtr<ov::ICompiledModel> compiled_model;
50-
51-
std::optional<std::size_t> replaced_by;
52-
53-
Subgraph::Gather host_gather;
54-
Subgraph::QuantUnpackGather quant_unpack_gather;
55-
std::optional<ov::npuw::compiled::Spatial> spatial;
56-
std::optional<ov::npuw::compiled::Attention> attention;
57-
std::optional<ov::npuw::compiled::PyramidAttention> pyramid_attention;
58-
std::optional<ov::npuw::compiled::HostFlashAttention> host_flash_attention;
59-
std::optional<ov::npuw::compiled::MoEExperts> moe_experts;
60-
std::optional<ov::npuw::compiled::MoEDownstream> moe_experts_downstream;
61-
62-
// Infer requests for pyramid attention models (if pyramid_attention is present)
63-
std::vector<ov::SoPtr<ov::IAsyncInferRequest>> pyramid_infer_requests;
64-
65-
// Pipeline infer requests for pyramid attention models (if pyramid_attention is present and pipelining is
66-
// enabled)
67-
std::vector<ov::SoPtr<ov::IAsyncInferRequest>> pyramid_pipeline_requests;
68-
69-
// HFA tile model indices for infer request vectors
70-
enum HFATileIdx : size_t {
71-
REGULAR_TILE = 0, // Regular tile model (intermediate tiles)
72-
FINAL_TILE = 1, // Final tile model (last tile with division and transpose)
73-
COUNT = 2 // Total number of HFA tile models
74-
};
75-
76-
// Infer requests for host flash attention tile models (if host_flash_attention is present)
77-
// [REGULAR_TILE]: regular tile model, [FINAL_TILE]: final tile model
78-
std::vector<ov::SoPtr<ov::IAsyncInferRequest>> hfa_infer_requests;
79-
80-
// Pipeline infer requests for host flash attention tile models (if host_flash_attention is present and
81-
// pipelining is enabled)
82-
std::vector<ov::SoPtr<ov::IAsyncInferRequest>> hfa_pipeline_requests;
83-
84-
// Infer requests for MoE expert models with different chunk sizes (if moe_experts is present)
85-
// Map: chunk_size -> infer_request
86-
std::map<size_t, ov::SoPtr<ov::IAsyncInferRequest>> moe_infer_requests;
87-
88-
// FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
89-
// w.r.t. function calls
90-
std::size_t param_base = 0;
91-
92-
struct Closure {
93-
std::vector<ov::Tensor> closure;
94-
std::vector<int64_t> closure_uid; // Note: value -1 is considered uninitialized
95-
std::vector<bool> is_remote;
96-
};
97-
98-
// Need to wrap closure, since finalize_weights_bank() will
99-
// asynchronously evaluate weights and put them in closure.
100-
// Other functions of CompiledModel as well as InferRequest and
101-
// other entities need to wait for the closure to be populated first
102-
// (meaning to wait for async weights processing to end).
103-
ov::npuw::util::Delayed<Closure> closure;
104-
105-
// NB: closure and lazy_closure are of the same size - to preserve proper indexing.
106-
// closure is responsible for host-side tensors (DCOFF, Gather, etc) while
107-
// lazy_closure is used for weights sharing and allocating device memory.
108-
std::vector<weights::LazyTensor> lazy_closure;
109-
std::vector<ov::Tensor> scales;
110-
std::vector<ov::Tensor> zerops;
111-
112-
bool forced_to_fcall = false;
113-
114-
// FIXME: Take it out of structure
115-
ov::SoPtr<ov::ICompiledModel> ref_compiled_model;
116-
bool switched_to_ref = false;
117-
118-
// Metrics
119-
execution_stats stat;
120-
121-
void serialize(std::ostream& stream, const ov::npuw::s11n::WeightsContext& ctx) const;
122-
void deserialize(std::istream& stream,
123-
const ov::npuw::s11n::WeightsContext& ctx,
124-
const ov::npuw::s11n::SubmodelDeserializeCtx& submodel_ctx);
125-
};
126-
12744
class ICompiledModel : public ov::ICompiledModel {
12845
public:
12946
ICompiledModel(const std::shared_ptr<ov::Model>& model, const std::shared_ptr<const ov::IPlugin>& plugin)
@@ -137,7 +54,7 @@ class ICompiledModel : public ov::ICompiledModel {
13754

13855
virtual std::string submodel_device(const std::size_t idx) const = 0;
13956

140-
virtual std::vector<CompiledModelDesc> get_compiled_submodels() const = 0;
57+
virtual std::size_t num_compiled_submodels() const = 0;
14158

14259
virtual void serialize(std::ostream& stream, const ov::npuw::s11n::CompiledContext& ctx) const = 0;
14360

@@ -253,7 +170,7 @@ class CompiledModel : public ov::npuw::ICompiledModel {
253170
std::string global_mem_device() const;
254171
std::string funcall_mem_device(const std::size_t idx) const;
255172

256-
std::vector<CompiledModelDesc> get_compiled_submodels() const override;
173+
std::size_t num_compiled_submodels() const override;
257174

258175
std::shared_ptr<::intel_npu::OptionsDesc> m_options_desc;
259176
::intel_npu::Config m_cfg;
@@ -292,6 +209,87 @@ class CompiledModel : public ov::npuw::ICompiledModel {
292209

293210
void init_profiling();
294211

212+
struct CompiledModelDesc {
213+
DevList::const_iterator device_it;
214+
std::set<std::string> devices_to_avoid;
215+
std::shared_ptr<ov::Model> model;
216+
ov::SoPtr<ov::ICompiledModel> compiled_model;
217+
218+
std::optional<std::size_t> replaced_by;
219+
220+
Subgraph::Gather host_gather;
221+
Subgraph::QuantUnpackGather quant_unpack_gather;
222+
std::optional<ov::npuw::compiled::Spatial> spatial;
223+
std::optional<ov::npuw::compiled::Attention> attention;
224+
std::optional<ov::npuw::compiled::PyramidAttention> pyramid_attention;
225+
std::optional<ov::npuw::compiled::HostFlashAttention> host_flash_attention;
226+
std::optional<ov::npuw::compiled::MoEExperts> moe_experts;
227+
std::optional<ov::npuw::compiled::MoEDownstream> moe_experts_downstream;
228+
229+
// Infer requests for pyramid attention models (if pyramid_attention is present)
230+
std::vector<ov::SoPtr<ov::IAsyncInferRequest>> pyramid_infer_requests;
231+
232+
// Pipeline infer requests for pyramid attention models (if pyramid_attention is present and pipelining is
233+
// enabled)
234+
std::vector<ov::SoPtr<ov::IAsyncInferRequest>> pyramid_pipeline_requests;
235+
236+
// HFA tile model indices for infer request vectors
237+
enum HFATileIdx : size_t {
238+
REGULAR_TILE = 0, // Regular tile model (intermediate tiles)
239+
FINAL_TILE = 1, // Final tile model (last tile with division and transpose)
240+
COUNT = 2 // Total number of HFA tile models
241+
};
242+
243+
// Infer requests for host flash attention tile models (if host_flash_attention is present)
244+
// [REGULAR_TILE]: regular tile model, [FINAL_TILE]: final tile model
245+
std::vector<ov::SoPtr<ov::IAsyncInferRequest>> hfa_infer_requests;
246+
247+
// Pipeline infer requests for host flash attention tile models (if host_flash_attention is present and
248+
// pipelining is enabled)
249+
std::vector<ov::SoPtr<ov::IAsyncInferRequest>> hfa_pipeline_requests;
250+
251+
// Infer requests for MoE expert models with different chunk sizes (if moe_experts is present)
252+
// Map: chunk_size -> infer_request
253+
std::map<size_t, ov::SoPtr<ov::IAsyncInferRequest>> moe_infer_requests;
254+
255+
// FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
256+
// w.r.t. function calls
257+
std::size_t param_base = 0;
258+
259+
struct Closure {
260+
std::vector<ov::Tensor> closure;
261+
std::vector<int64_t> closure_uid; // Note: value -1 is considered uninitialized
262+
std::vector<bool> is_remote;
263+
};
264+
265+
// Need to wrap closure, since finalize_weights_bank() will
266+
// asynchronously evaluate weights and put them in closure.
267+
// Other functions of CompiledModel as well as InferRequest and
268+
// other entities need to wait for the closure to be populated first
269+
// (meaning to wait for async weights processing to end).
270+
ov::npuw::util::Delayed<Closure> closure;
271+
272+
// NB: closure and lazy_closure are of the same size - to preserve proper indexing.
273+
// closure is responsible for host-side tensors (DCOFF, Gather, etc) while
274+
// lazy_closure is used for weights sharing and allocating device memory.
275+
std::vector<weights::LazyTensor> lazy_closure;
276+
std::vector<ov::Tensor> scales;
277+
std::vector<ov::Tensor> zerops;
278+
279+
bool forced_to_fcall = false;
280+
281+
// FIXME: Take it out of structure
282+
ov::SoPtr<ov::ICompiledModel> ref_compiled_model;
283+
bool switched_to_ref = false;
284+
285+
// Metrics
286+
execution_stats stat;
287+
288+
void serialize(std::ostream& stream, const ov::npuw::s11n::WeightsContext& ctx) const;
289+
void deserialize(std::istream& stream,
290+
const ov::npuw::s11n::WeightsContext& ctx,
291+
const ov::npuw::s11n::SubmodelDeserializeCtx& submodel_ctx);
292+
};
295293
std::vector<CompiledModelDesc> m_compiled_submodels;
296294

297295
std::function<bool(const ov::SoPtr<ov::ITensor>&, const ov::SoPtr<ov::ITensor>&)> m_acc_check;

src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1205,18 +1205,18 @@ void ov::npuw::JustInferRequest::setup_hfa_infer_requests(std::size_t real_idx,
12051205
}
12061206

12071207
// Allocate storage for infer requests: [REGULAR_TILE] and [FINAL_TILE]
1208-
submodel_desc.hfa_infer_requests.resize(CompiledModelDesc::HFATileIdx::COUNT);
1208+
submodel_desc.hfa_infer_requests.resize(CompiledModel::CompiledModelDesc::HFATileIdx::COUNT);
12091209
if (is_piped) {
1210-
submodel_desc.hfa_pipeline_requests.resize(CompiledModelDesc::HFATileIdx::COUNT);
1210+
submodel_desc.hfa_pipeline_requests.resize(CompiledModel::CompiledModelDesc::HFATileIdx::COUNT);
12111211
}
12121212

12131213
// Create infer request for regular tile model
12141214
try {
12151215
LOG_INFO("Creating infer request for HFA regular tile model...");
1216-
submodel_desc.hfa_infer_requests[CompiledModelDesc::HFATileIdx::REGULAR_TILE] =
1216+
submodel_desc.hfa_infer_requests[CompiledModel::CompiledModelDesc::HFATileIdx::REGULAR_TILE] =
12171217
hfa._compiled_tile_model->create_infer_request();
12181218
if (is_piped) {
1219-
submodel_desc.hfa_pipeline_requests[CompiledModelDesc::HFATileIdx::REGULAR_TILE] =
1219+
submodel_desc.hfa_pipeline_requests[CompiledModel::CompiledModelDesc::HFATileIdx::REGULAR_TILE] =
12201220
hfa._compiled_tile_model->create_infer_request();
12211221
}
12221222
} catch (const std::exception& ex) {
@@ -1232,10 +1232,10 @@ void ov::npuw::JustInferRequest::setup_hfa_infer_requests(std::size_t real_idx,
12321232
// For final tile model, reuse the main compiled_model's infer request
12331233
// because compiled_model points to _compiled_final_tile_model for HFA
12341234
LOG_INFO("Reusing " << (is_recreate ? "recreated " : "") << "main infer request for HFA final tile model");
1235-
submodel_desc.hfa_infer_requests[CompiledModelDesc::HFATileIdx::FINAL_TILE] =
1235+
submodel_desc.hfa_infer_requests[CompiledModel::CompiledModelDesc::HFATileIdx::FINAL_TILE] =
12361236
m_subrequests[real_idx];
12371237
if (is_piped) {
1238-
submodel_desc.hfa_pipeline_requests[CompiledModelDesc::HFATileIdx::FINAL_TILE] =
1238+
submodel_desc.hfa_pipeline_requests[CompiledModel::CompiledModelDesc::HFATileIdx::FINAL_TILE] =
12391239
m_funcall_pipeline[real_idx].subrequest;
12401240
}
12411241

@@ -1248,14 +1248,14 @@ void ov::npuw::JustInferRequest::setup_hfa_infer_requests(std::size_t real_idx,
12481248

12491249
// Directly share tensor from main infer request to regular tile request
12501250
auto main_tensor = m_subrequests[real_idx]->get_tensor(final_tile_input);
1251-
submodel_desc.hfa_infer_requests[CompiledModelDesc::HFATileIdx::REGULAR_TILE]->set_tensor(
1251+
submodel_desc.hfa_infer_requests[CompiledModel::CompiledModelDesc::HFATileIdx::REGULAR_TILE]->set_tensor(
12521252
tile_input,
12531253
main_tensor);
12541254

12551255
// Repeat for pipeline infer request if pipelined
12561256
if (is_piped) {
12571257
auto pipeline_tensor = m_funcall_pipeline[real_idx].subrequest->get_tensor(final_tile_input);
1258-
submodel_desc.hfa_pipeline_requests[CompiledModelDesc::HFATileIdx::REGULAR_TILE]->set_tensor(
1258+
submodel_desc.hfa_pipeline_requests[CompiledModel::CompiledModelDesc::HFATileIdx::REGULAR_TILE]->set_tensor(
12591259
tile_input,
12601260
pipeline_tensor);
12611261
}
@@ -1297,13 +1297,13 @@ void ov::npuw::JustInferRequest::setup_hfa_infer_requests(std::size_t real_idx,
12971297

12981298
// Get state tensors from regular tile request
12991299
auto state_acc =
1300-
submodel_desc.hfa_infer_requests[CompiledModelDesc::HFATileIdx::REGULAR_TILE]->get_tensor(
1300+
submodel_desc.hfa_infer_requests[CompiledModel::CompiledModelDesc::HFATileIdx::REGULAR_TILE]->get_tensor(
13011301
hfa._compiled_tile_model->inputs()[tile_in.acc]);
13021302
auto state_max =
1303-
submodel_desc.hfa_infer_requests[CompiledModelDesc::HFATileIdx::REGULAR_TILE]->get_tensor(
1303+
submodel_desc.hfa_infer_requests[CompiledModel::CompiledModelDesc::HFATileIdx::REGULAR_TILE]->get_tensor(
13041304
hfa._compiled_tile_model->inputs()[tile_in.max]);
13051305
auto state_sum =
1306-
submodel_desc.hfa_infer_requests[CompiledModelDesc::HFATileIdx::REGULAR_TILE]->get_tensor(
1306+
submodel_desc.hfa_infer_requests[CompiledModel::CompiledModelDesc::HFATileIdx::REGULAR_TILE]->get_tensor(
13071307
hfa._compiled_tile_model->inputs()[tile_in.d]);
13081308

13091309
// Initialize state tensors with zeros/minus infinity
@@ -1529,7 +1529,7 @@ void ov::npuw::JustInferRequest::run_hfa_tiled_inference(std::size_t real_idx, s
15291529
auto& hfa_desc = comp_model_desc.host_flash_attention.value();
15301530

15311531
NPUW_ASSERT(hfa_desc.is_valid() && "HFA configuration must be valid");
1532-
NPUW_ASSERT(comp_model_desc.hfa_infer_requests.size() == CompiledModelDesc::HFATileIdx::COUNT &&
1532+
NPUW_ASSERT(comp_model_desc.hfa_infer_requests.size() == CompiledModel::CompiledModelDesc::HFATileIdx::COUNT &&
15331533
"HFA infer requests must be created");
15341534

15351535
// Calculate tile configuration
@@ -1565,9 +1565,9 @@ void ov::npuw::JustInferRequest::run_hfa_tiled_inference(std::size_t real_idx, s
15651565

15661566
// Get tile infer requests
15671567
auto& regular_tile_request =
1568-
comp_model_desc.hfa_infer_requests[CompiledModelDesc::HFATileIdx::REGULAR_TILE];
1568+
comp_model_desc.hfa_infer_requests[CompiledModel::CompiledModelDesc::HFATileIdx::REGULAR_TILE];
15691569
auto& final_tile_request =
1570-
comp_model_desc.hfa_infer_requests[CompiledModelDesc::HFATileIdx::FINAL_TILE];
1570+
comp_model_desc.hfa_infer_requests[CompiledModel::CompiledModelDesc::HFATileIdx::FINAL_TILE];
15711571

15721572
// Use pre-cached indices (populated during compilation)
15731573
const auto& tile_in = sdpa_info._tile_input_indices;

0 commit comments

Comments (0)