Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 21 additions & 7 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -868,7 +868,7 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(
read(stream, model_str);
std::stringstream ss(model_str);
pyramid_attention->_compiled_models[i] =
submodel_ctx.plugin->get_core()->import_model(ss, submodel_ctx.device);
submodel_ctx.plugin->get_core()->import_model(ss, submodel_ctx.device, submodel_ctx.import_config);
}

// Reuse the already compiled model for the last pyramid attention model
Expand Down Expand Up @@ -896,7 +896,8 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(
std::string model_str;
read(stream, model_str);
std::stringstream ss(model_str);
auto compiled_model = submodel_ctx.plugin->get_core()->import_model(ss, submodel_ctx.device);
auto compiled_model =
submodel_ctx.plugin->get_core()->import_model(ss, submodel_ctx.device, submodel_ctx.import_config);
moe_experts->_compiled_models[chunk_size] = compiled_model;
LOG_DEBUG("Imported MoE compiled model for chunk_size=" << chunk_size);
}
Expand All @@ -915,7 +916,8 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(
std::string model_str;
read(stream, model_str);
std::stringstream ss(model_str);
auto compiled_model = submodel_ctx.plugin->get_core()->import_model(ss, submodel_ctx.device);
auto compiled_model =
submodel_ctx.plugin->get_core()->import_model(ss, submodel_ctx.device, submodel_ctx.import_config);
moe_experts_downstream->_compiled_model = compiled_model;
LOG_DEBUG("Imported MoE downstream compiled model");
}
Expand All @@ -931,7 +933,7 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(
read(stream, model_str);
std::stringstream ss(model_str);
host_flash_attention->_compiled_tile_model =
submodel_ctx.plugin->get_core()->import_model(ss, submodel_ctx.device);
submodel_ctx.plugin->get_core()->import_model(ss, submodel_ctx.device, submodel_ctx.import_config);
LOG_DEBUG("Imported compiled tile model for host flash attention");
}

Expand Down Expand Up @@ -1429,22 +1431,34 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(

bool has_compiled_model = false;
read(stream, has_compiled_model);

// Build import config for NPU device
ov::AnyMap import_config;
const auto& device = compiled->m_dev_list[device_idx];
if (ov::npuw::util::starts_with(device, "NPU")) {
// Pass NPU_RUN_INFERENCES_SEQUENTIALLY if NPUW_UNFOLD_IREQS is enabled
if (compiled->m_cfg.get<::intel_npu::NPUW_UNFOLD_IREQS>()) {
import_config["NPU_RUN_INFERENCES_SEQUENTIALLY"] = "YES";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I think the source of the issue is: we have two initialization places for this device config. For example, in compile_model() there is already one more option:
EXCLUSIVE_ASYNC_REQUESTS, which is not deserialized — I am not sure whether it is relevant to the current issue.

I would propose serializing the device config that was used to compile the model, and avoiding ifs on the deserialization path. For that we would need to keep track of pairs: {ov::compiled_model, extra-options}, since not all options matter. I'm not sure why the serialized model cannot be imported correctly, or why we would ever need the config again.
So please create a follow-up task for addressing that.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is quite shady that we need to maintain the compilation config on import; I actually find it counter-intuitive.
If the model was compiled with a certain option, its compiled blob should preserve that option.
However, there's a distinction between compile-time and run-time options (despite them all being passed to .compile()), so we have this problem.

What's done in this PR is, I think, okay-ish — unless we come up with a better, generic solution.

}
}

if (has_compiled_model) {
// Import model from the plugin
// FIXME: workaround for import/export model since import model seems to reset the file pointer
std::string buf;
read(stream, buf);
std::stringstream buffer(buf);
compiled->m_compiled_submodels[i].compiled_model =
plugin->get_core()->import_model(buffer, compiled->m_dev_list[device_idx]);
plugin->get_core()->import_model(buffer, device, import_config);
}
compiled->m_compiled_submodels[i].device_it = compiled->m_dev_list.begin() + device_idx;

// Create unified deserialization context for submodels with dynamic mechanisms
// (Pyramid Attention, Host Flash Attention, etc.)
ov::npuw::s11n::SubmodelDeserializeCtx submodel_ctx(plugin,
compiled->m_dev_list[device_idx],
compiled->m_compiled_submodels[i].compiled_model);
device,
compiled->m_compiled_submodels[i].compiled_model,
import_config);
compiled->m_compiled_submodels[i].deserialize(stream, compiled->m_import_weights_ctx, submodel_ctx);
}

Expand Down
7 changes: 5 additions & 2 deletions src/plugins/intel_npu/src/plugin/npuw/serialization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,14 +161,17 @@ struct WeightsContext {
struct SubmodelDeserializeCtx {
SubmodelDeserializeCtx(const std::shared_ptr<const ov::IPlugin>& _plugin,
const std::string& _device,
const ov::SoPtr<ov::ICompiledModel>& _compiled_model)
const ov::SoPtr<ov::ICompiledModel>& _compiled_model,
const std::map<std::string, Any>& _import_config = {})
: plugin(_plugin),
device(_device),
compiled_model(_compiled_model) {}
compiled_model(_compiled_model),
import_config(_import_config) {}

std::shared_ptr<const ov::IPlugin> plugin;
std::string device;
const ov::SoPtr<ov::ICompiledModel>& compiled_model;
std::map<std::string, Any> import_config;
};

BF16Cache get_bf16_consts(const std::shared_ptr<ov::Model>& model);
Expand Down
Loading