Skip to content

Commit a7eb69b

Browse files
[NPU] Allow WS with no init schedules to succeed (#34731)
### Details: When weightless compilation is requested, the compiler can — depending on the input model — theoretically find no weights that need to be processed through an init schedule. In this case a regular "blob with weights" is returned instead, and the plugin must be able to work with it successfully.

### Tickets: [EISW-206596](https://jira.devtools.intel.com/browse/EISW-206596)

### AI Assistance: AI assistance used: no
1 parent e0ab225 commit a7eb69b

File tree

2 files changed

+98
-1
lines changed

2 files changed

+98
-1
lines changed

src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,12 @@ std::shared_ptr<IGraph> PluginCompilerAdapter::compileWS(std::shared_ptr<ov::Mod
138138

139139
std::shared_ptr<NetworkDescription> mainNetworkDescription = initMainNetworkDescriptions.back();
140140
initMainNetworkDescriptions.pop_back();
141-
OPENVINO_ASSERT(initMainNetworkDescriptions.size() > 0, "No init schedules have been returned by the compiler");
141+
if (initMainNetworkDescriptions.empty()) {
142+
_logger.warning("NPU compiler did not produce any init schedules. "
143+
"This likely means that the compiled model blob has weights inside even "
144+
"though weightless compilation was requested.");
145+
}
146+
142147
std::vector<std::shared_ptr<NetworkDescription>> initNetworkDescriptions =
143148
std::move(initMainNetworkDescriptions);
144149
tensorMain = std::move(mainNetworkDescription->compiledNetworkTensor);

src/plugins/intel_npu/tests/functional/behavior/weights_separation.hpp

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,40 @@ class WeightsSeparationTests : public ov::test::behavior::OVPluginTestBase,
122122
return model;
123123
}
124124

125+
// This is a special model that has weightless constants that are guaranteed
126+
// to be skipped by weights schedule. This tests cases where compiler
127+
// produces "blob with weights" when "weightless blob" is requested: in
128+
// theory, this may happen, and must not cause any errors.
129+
std::shared_ptr<ov::Model> createTestModelWeightlessWithDummyConstants() {
130+
constexpr auto precision = element::f32;
131+
132+
const auto reshapeWeights =
133+
std::make_shared<op::v0::Constant>(element::i64, Shape{3}, std::vector<int64_t>{1, 2, 3});
134+
135+
const auto input1 = std::make_shared<op::v0::Parameter>(precision, Shape{6});
136+
const auto input2 = std::make_shared<op::v0::Parameter>(precision, Shape{1, 2, 3});
137+
const auto reshapedInput1 = std::make_shared<op::v1::Reshape>(input1, reshapeWeights, /*special_zero=*/false);
138+
auto add = std::make_shared<op::v1::Add>(reshapedInput1, input2);
139+
140+
reshapeWeights->set_friendly_name("weights");
141+
input1->set_friendly_name("input1");
142+
input2->set_friendly_name("input2");
143+
reshapedInput1->set_friendly_name("reshapedInput1");
144+
add->set_friendly_name("add");
145+
146+
// Note: Reshape weights with weightless cache attribute satisfy the
147+
// basic requirement to create weights schedule. However, since this is
148+
// a static reshape, these weights would "disappear" during compilation,
149+
// causing the compiler to put nothing into the weights schedule.
150+
reshapeWeights->get_rt_info()[ov::WeightlessCacheAttribute::get_type_info_static()] =
151+
ov::WeightlessCacheAttribute(reshapeWeights->get_byte_size(), 0, reshapeWeights->get_element_type());
152+
153+
auto model =
154+
std::make_shared<Model>(OutputVector{add}, ParameterVector{input1, input2}, "Dummy weightless model");
155+
ov::util::set_tensors_names(AUTO, *model, {}, {{0, {"add"}}});
156+
return model;
157+
}
158+
125159
/**
126160
* @brief This model was fine-tuned in order to compile fast and yield a light init schedule.
127161
*/
@@ -163,6 +197,9 @@ class WeightsSeparationTests : public ov::test::behavior::OVPluginTestBase,
163197
OV_ASSERT_NO_THROW(utils::compare(expected, output));
164198
}
165199

200+
// This is a "template" of a test used in multiple configurations
201+
void runCorrectInferenceResultIfCannotCompileAsWeightless();
202+
166203
protected:
167204
std::shared_ptr<ov::Core> core = utils::PluginCache::get().core();
168205
ov::AnyMap configuration;
@@ -487,6 +524,41 @@ TEST_P(WeightsSeparationTests, WeightlessBlobIsSmaller) {
487524
ASSERT_TRUE(weightfullBlobStream.str().size() > weightlessBlobStream.str().size());
488525
}
489526

527+
void WeightsSeparationTests::runCorrectInferenceResultIfCannotCompileAsWeightless() {
528+
model = createTestModelWeightlessWithDummyConstants();
529+
530+
model_path = ov::util::path_join({utils::getCurrentWorkingDir(), utils::generateTestFilePrefix()}).string();
531+
ov::serialize(model, model_path + ".xml", model_path + ".bin");
532+
533+
// compilation should succeed
534+
configuration.insert(ov::intel_npu::weightless_blob(true));
535+
OV_ASSERT_NO_THROW(compiled_model = core->compile_model(model, target_device, configuration));
536+
ASSERT_TRUE(compiled_model);
537+
538+
std::stringstream export_stream;
539+
compiled_model.export_model(export_stream);
540+
541+
configuration.insert(ov::weights_path(model_path + ".bin"));
542+
OV_ASSERT_NO_THROW(compiled_model = core->import_model(export_stream, target_device, configuration));
543+
ASSERT_TRUE(compiled_model);
544+
545+
// inference should also succeed
546+
const ov::Tensor input1 =
547+
utils::create_tensor(element::f32, Shape{6}, std::vector<float>{40.0f, 40.0f, 40.0f, 40.0f, 40.0f, 40.0f});
548+
const ov::Tensor input2 =
549+
utils::create_tensor(element::f32, Shape{1, 2, 3}, std::vector<float>{0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f});
550+
OV_ASSERT_NO_THROW(inference_request = compiled_model.create_infer_request());
551+
OV_ASSERT_NO_THROW(inference_request.set_tensor("input1", input1));
552+
OV_ASSERT_NO_THROW(inference_request.set_tensor("input2", input2));
553+
OV_ASSERT_NO_THROW(inference_request.infer());
554+
555+
const ov::Tensor expected = utils::create_tensor(element::f32,
556+
Shape{1, 2, 3},
557+
std::vector<float>{40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f});
558+
const ov::Tensor output = inference_request.get_tensor("add");
559+
OV_ASSERT_NO_THROW(utils::compare(expected, output));
560+
}
561+
490562
using WeightsSeparationOneShotTests = WeightsSeparationTests;
491563

492564
/**
@@ -504,6 +576,17 @@ TEST_P(WeightsSeparationOneShotTests, CorrectInferenceResultNoImportOneShot) {
504576
create_infer_request_and_check_result();
505577
}
506578

579+
/**
580+
* @brief compile -> import the result, ov::weights_path provided -> create inference request -> run one inference and
581+
* check the result
582+
* compilation and inference must work even if the weightless model is not actually weightless
583+
* (compiler may not be able to find any "suitable" weights)
584+
*/
585+
TEST_P(WeightsSeparationOneShotTests, CorrectInferenceResultIfCannotCompileAsWeightlessOneShot) {
586+
configuration.insert(ov::intel_npu::separate_weights_version(ov::intel_npu::WSVersion::ONE_SHOT));
587+
runCorrectInferenceResultIfCannotCompileAsWeightless();
588+
}
589+
507590
using WeightsSeparationIterativeTests = WeightsSeparationTests;
508591

509592
/**
@@ -521,6 +604,15 @@ TEST_P(WeightsSeparationIterativeTests, CorrectInferenceResultNoImportIterative)
521604
create_infer_request_and_check_result();
522605
}
523606

607+
/**
608+
* @brief Compiles a special model in WeightsSeparation ITERATIVE mode to ensure
609+
* compilation succeeds. This is similar to the ONE_SHOT version test.
610+
*/
611+
TEST_P(WeightsSeparationIterativeTests, CorrectInferenceResultIfCannotCompileAsWeightlessIterative) {
612+
configuration.insert(ov::intel_npu::separate_weights_version(ov::intel_npu::WSVersion::ITERATIVE));
613+
runCorrectInferenceResultIfCannotCompileAsWeightless();
614+
}
615+
524616
} // namespace behavior
525617
} // namespace test
526618
} // namespace ov

0 commit comments

Comments
 (0)