Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/plugins/intel_npu/src/backend/include/zero_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ class ZeroDevice : public IDevice {
std::map<ov::element::Type, float> getGops() const override;
ov::device::Type getDeviceType() const override;

std::shared_ptr<SyncInferRequest> createInferRequest(const std::shared_ptr<const ICompiledModel>& compiledModel,
const Config& config) override;
std::shared_ptr<InferRequest> createInferRequest(const std::shared_ptr<const ICompiledModel>& compiledModel,
const Config& config) override;

void updateInfo(const ov::AnyMap& properties) override;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,27 +19,20 @@ class ZeroDynamicInferRequest final : public ZeroInferRequest {
void infer_async() override;

protected:
void construct_pipeline() override;

/**
* @brief Allocates a tensor on host and stores the reference inside multiple attributes.
* @param index The index which the allocated tensor shall use.
* @param isInput Determines the containers in which the newly allocated tensors will be stored.
* @param batchSize If provided, the value of the shape on the 0th axis is overridden with this value.
* @return Pointer towards the allocated tensor
*/
std::shared_ptr<ZeroTensor> allocate_tensor(const size_t index,
const bool isInput,
const std::optional<std::size_t> batchSize = std::nullopt) const;

void update_command_list_for_tensor(SyncInferRequest::FoundPort& foundPort,
const ov::SoPtr<ov::ITensor>& tensor) override;

void update_command_list_for_tensors(SyncInferRequest::FoundPort& foundPort,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors,
std::optional<size_t> batchSizeCandidate = std::nullopt) override;
void predict_shapes(std::vector<IDynamicGraph::MemRefType>& outputProps);
void create_pipeline_impl() override;

std::shared_ptr<ZeroTensor> allocate_tensor(
const size_t index,
const bool isInput,
const std::optional<std::size_t>& batchSize = std::nullopt) const override;

void sync_zero_tensor_with_graph(const ZeroInferRequest::FoundPort& foundPort,
const ov::SoPtr<ov::ITensor>& tensor) override;
void sync_zero_tensors_with_graph(const ZeroInferRequest::FoundPort& foundPort,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors,
const std::optional<size_t>& batchSize = std::nullopt) override;

void predict_shapes(std::vector<IDynamicGraph::MemRefType>& outputProps);
void check_tensor_and_predicted_shapes(const std::vector<IDynamicGraph::MemRefType>& outputProps);

void update_tensor(const std::vector<IDynamicGraph::MemRefType>& outputProps);
Expand Down
134 changes: 108 additions & 26 deletions src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,12 @@

#pragma once

#include <ze_api.h>
#include <ze_graph_ext.h>

#include "intel_npu/common/icompiled_model.hpp"
#include "intel_npu/common/igraph.hpp"
#include "intel_npu/common/npu.hpp"
#include "intel_npu/common/sync_infer_request.hpp"
#include "intel_npu/network_metadata.hpp"
#include "intel_npu/utils/logger/logger.hpp"
#include "intel_npu/utils/zero/zero_remote_tensor.hpp"
#include "intel_npu/utils/zero/zero_tensor.hpp"
#include "intel_npu/utils/zero/zero_wrappers.hpp"
#include "zero_pipeline.hpp"

namespace intel_npu {
Expand All @@ -28,31 +25,57 @@ std::optional<size_t> determine_dynamic_batch_size(const IODescriptor& desc,

void* get_tensor_data_ptr(const std::shared_ptr<ov::ITensor>& tensor);

class ZeroInferRequest : public SyncInferRequest {
class ZeroInferRequest : public InferRequest {
public:
explicit ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>& initStructs,
const std::shared_ptr<const ICompiledModel>& compiledModel,
const Config& config);

ov::SoPtr<ov::ITensor> get_tensor(const ov::Output<const ov::Node>& port) const override;
std::vector<ov::SoPtr<ov::ITensor>> get_tensors(const ov::Output<const ov::Node>& port) const override;
void set_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) override;
void set_tensors(const ov::Output<const ov::Node>& port,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;

void infer() override;
void infer_async() override;

virtual void infer_async() override;
void get_result() override;

const std::vector<ov::Output<const ov::Node>>& get_inputs() const override;
const std::vector<ov::Output<const ov::Node>>& get_outputs() const override;

const std::shared_ptr<const ov::ICompiledModel>& get_compiled_model() const override;

std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;

protected:
std::vector<ov::ProfilingInfo> get_profiling_info() const override;
/**
* @see ov::ISyncInferRequest
*/
// Result of resolving an ov::Output<const ov::Node> port against this request's
// input/output lists (see find_port below). Mirrors the equivalent helper in
// ov::ISyncInferRequest.
struct FoundPort {
// Index of the matched port within the inputs or outputs vector;
// meaningful only when found() is true.
size_t idx;
// Which list the port was found in; NOT_FOUND (== 0) means the lookup failed.
enum class Type { NOT_FOUND = 0, INPUT, OUTPUT } type;

bool found() const {
return type != Type::NOT_FOUND;
}
bool is_input() const {
return type == Type::INPUT;
}
// NOTE(review): returns true for NOT_FOUND as well, since it only negates
// is_input(); callers presumably check found() first — confirm before relying
// on it for an unresolved port.
bool is_output() const {
return !is_input();
}
};

void check_network_precision(const ov::element::Type_t precision) const override;
void create_pipeline();
virtual void construct_pipeline();
/**
* @brief Finds input or output port
* @return structure which contains index of Input/Output or report that port wasn't found
* @see ov::ISyncInferRequest
*/
ZeroInferRequest::FoundPort find_port(const ov::Output<const ov::Node>& port) const;

std::shared_ptr<ZeroTensor>& get_level_zero_input(size_t index, size_t tensorNo = 0) const;
std::vector<std::shared_ptr<ZeroTensor>>& get_level_zero_inputs(size_t index) const;
void setup_pipeline();
virtual void create_pipeline_impl();

/**
* @brief Allocates a tensor on host and stores the reference inside multiple attributes.
Expand All @@ -61,39 +84,98 @@ class ZeroInferRequest : public SyncInferRequest {
* @param batchSize If provided, the value of the shape on the 0th axis is overridden with this value.
* @return Pointer towards the allocated tensor
*/
std::shared_ptr<ZeroTensor> allocate_tensor(const size_t index,
const bool isInput,
const std::optional<std::size_t>& batchSize = std::nullopt) const;
virtual std::shared_ptr<ZeroTensor> allocate_tensor(
const size_t index,
const bool isInput,
const std::optional<std::size_t>& batchSize = std::nullopt) const;

void initialize_states();
void add_state(const IODescriptor& descriptor, size_t tensorIndex) const;

void update_pipeline_if_memory_changed();
void update_states_if_memory_changed();

virtual void update_command_list_for_tensor(SyncInferRequest::FoundPort& foundPort,
const ov::SoPtr<ov::ITensor>& tensor);

virtual void update_command_list_for_tensors(SyncInferRequest::FoundPort& foundPort,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors,
std::optional<size_t> batchSizeCandidate = std::nullopt);
virtual void sync_zero_tensor_with_graph(const ZeroInferRequest::FoundPort& foundPort,
const ov::SoPtr<ov::ITensor>& tensor);
virtual void sync_zero_tensors_with_graph(const ZeroInferRequest::FoundPort& foundPort,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors,
const std::optional<size_t>& batchSize = std::nullopt);

virtual void prepare_inputs();
virtual void prepare_outputs();

/**
* @brief Basic checks for input/output tensor
*
* @param port Input/Output port
* @param tensor Input/Output tensor
*/
void check_tensor(const ov::Output<const ov::Node>& port,
const ov::SoPtr<ov::ITensor>& tensor,
const bool supportStrides) const;

/**
* @brief Basic checks for input tensors
*
* @param port Input port
* @param tensors Input tensors
*/
void check_batched_tensors(const ov::Output<const ov::Node>& port,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors,
const bool supportStrides) const;

bool is_batched_input(size_t idx) const;

/**
* @brief Check that all tensors are valid. Throws an exception if it's not.
*/
void check_tensors() const override;

ov::SoPtr<ov::ITensor>& get_user_input(size_t index) const;
std::vector<ov::SoPtr<ov::ITensor>>& get_user_inputs(size_t index) const;

std::shared_ptr<ZeroTensor>& get_level_zero_input(size_t index, size_t tensorNo = 0) const;
std::vector<std::shared_ptr<ZeroTensor>>& get_level_zero_inputs(size_t index) const;

void check_network_precision(const ov::element::Type_t precision) const;
std::vector<ov::ProfilingInfo> get_profiling_info() const override;

const std::shared_ptr<ZeroInitStructsHolder> _initStructs;

// This is intel_npu::ICompiledModel pointer, but need to use OV base class because
// ov::IInferRequest::get_compiled_model returns a reference to shared_ptr!
std::shared_ptr<const ov::ICompiledModel> _compiledModel;

const std::shared_ptr<IGraph> _graph;
NetworkMetadata _metadata;
const Config _config;
Logger _logger;

// In case set_tensors is called, we receive a vector with N tensors otherwise only 1 tensor is needed
mutable std::vector<std::vector<ov::SoPtr<ov::ITensor>>> _userInputTensors;
mutable std::vector<ov::SoPtr<ov::ITensor>> _userOutputTensors;

mutable std::vector<ov::SoPtr<ov::IVariableState>> _variableStates;

// A copy of each tensor is needed to maintain the original L0 memory allocation in case the user provides another
// memory area for the tensor.
mutable std::vector<std::vector<std::shared_ptr<ZeroTensor>>> _levelZeroInputTensors;
mutable std::vector<std::shared_ptr<ZeroTensor>> _levelZeroOutputTensors;

std::unique_ptr<Pipeline> _pipeline;

bool _pipelineIsCreated = false;
bool _dynamicBatchValueChanged = false;

Logger _logger;

/**
* @see ov::ISyncInferRequest
*/
mutable std::unordered_map<size_t, ZeroInferRequest::FoundPort> _cachedPorts;

/**
* @see ov::ISyncInferRequest
*/
mutable std::mutex _cacheMutex;
};

} // namespace intel_npu
5 changes: 2 additions & 3 deletions src/plugins/intel_npu/src/backend/src/zero_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,8 @@ ov::device::Type ZeroDevice::getDeviceType() const {
return ov::device::Type::INTEGRATED;
}

std::shared_ptr<SyncInferRequest> ZeroDevice::createInferRequest(
const std::shared_ptr<const ICompiledModel>& compiledModel,
const Config& config) {
std::shared_ptr<InferRequest> ZeroDevice::createInferRequest(const std::shared_ptr<const ICompiledModel>& compiledModel,
const Config& config) {
if (dynamic_cast<IDynamicGraph*>(compiledModel->get_graph().get())) {
return std::make_shared<ZeroDynamicInferRequest>(_initStructs, compiledModel, config);
}
Expand Down
Loading
Loading