Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/plugins/intel_npu/src/backend/include/zero_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ class ZeroDevice : public IDevice {
std::map<ov::element::Type, float> getGops() const override;
ov::device::Type getDeviceType() const override;

std::shared_ptr<SyncInferRequest> createInferRequest(const std::shared_ptr<const ICompiledModel>& compiledModel,
const Config& config) override;
std::shared_ptr<InferRequest> createInferRequest(const std::shared_ptr<const ICompiledModel>& compiledModel,
const Config& config) override;

void updateInfo(const ov::AnyMap& properties) override;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,27 +19,20 @@ class ZeroDynamicInferRequest final : public ZeroInferRequest {
void infer_async() override;

protected:
void construct_pipeline() override;

/**
* @brief Allocates a tensor on host and stores the reference inside multiple attributes.
* @param index The index which the allocated tensor shall use.
* @param isInput Determines the containers in which the newly allocated tensors will be stored.
* @param batchSize If provided, the value of the shape on the 0th axis is overridden with this value.
* @return Pointer towards the allocated tensor
*/
std::shared_ptr<ZeroTensor> allocate_tensor(const size_t index,
const bool isInput,
const std::optional<std::size_t> batchSize = std::nullopt) const;

void update_command_list_for_tensor(SyncInferRequest::FoundPort& foundPort,
const ov::SoPtr<ov::ITensor>& tensor) override;

void update_command_list_for_tensors(SyncInferRequest::FoundPort& foundPort,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors,
std::optional<size_t> batchSizeCandidate = std::nullopt) override;
void predict_shapes(std::vector<IDynamicGraph::MemRefType>& outputProps);
void create_pipeline_impl() override;

std::shared_ptr<ZeroTensor> allocate_tensor(
const size_t index,
const bool isInput,
const std::optional<std::size_t>& batchSize = std::nullopt) const override;

void sync_zero_tensor_with_graph(const ZeroInferRequest::FoundPort& foundPort,
const ov::SoPtr<ov::ITensor>& tensor) override;
void sync_zero_tensors_with_graph(const ZeroInferRequest::FoundPort& foundPort,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors,
const std::optional<size_t>& batchSize = std::nullopt) override;

void predict_shapes(std::vector<IDynamicGraph::MemRefType>& outputProps);
void check_tensor_and_predicted_shapes(const std::vector<IDynamicGraph::MemRefType>& outputProps);

void update_tensor(const std::vector<IDynamicGraph::MemRefType>& outputProps);
Expand Down
134 changes: 108 additions & 26 deletions src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,12 @@

#pragma once

#include <ze_api.h>
#include <ze_graph_ext.h>

#include "intel_npu/common/icompiled_model.hpp"
#include "intel_npu/common/igraph.hpp"
#include "intel_npu/common/npu.hpp"
#include "intel_npu/common/sync_infer_request.hpp"
#include "intel_npu/network_metadata.hpp"
#include "intel_npu/utils/logger/logger.hpp"
#include "intel_npu/utils/zero/zero_remote_tensor.hpp"
#include "intel_npu/utils/zero/zero_tensor.hpp"
#include "intel_npu/utils/zero/zero_wrappers.hpp"
#include "zero_pipeline.hpp"

namespace intel_npu {
Expand All @@ -28,31 +25,57 @@ std::optional<size_t> determine_dynamic_batch_size(const IODescriptor& desc,

void* get_tensor_data_ptr(const std::shared_ptr<ov::ITensor>& tensor);

class ZeroInferRequest : public SyncInferRequest {
class ZeroInferRequest : public InferRequest {
public:
explicit ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>& initStructs,
const std::shared_ptr<const ICompiledModel>& compiledModel,
const Config& config);

ov::SoPtr<ov::ITensor> get_tensor(const ov::Output<const ov::Node>& port) const override;
std::vector<ov::SoPtr<ov::ITensor>> get_tensors(const ov::Output<const ov::Node>& port) const override;
void set_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) override;
void set_tensors(const ov::Output<const ov::Node>& port,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;

void infer() override;
void infer_async() override;

virtual void infer_async() override;
void get_result() override;

const std::vector<ov::Output<const ov::Node>>& get_inputs() const override;
const std::vector<ov::Output<const ov::Node>>& get_outputs() const override;

const std::shared_ptr<const ov::ICompiledModel>& get_compiled_model() const override;

std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;

protected:
std::vector<ov::ProfilingInfo> get_profiling_info() const override;
/**
* @see ov::ISyncInferRequest
*/
// Result of resolving an ov::Output<const ov::Node> port against this request's
// input/output lists (see find_port below). Mirrors the equivalent helper in
// ov::ISyncInferRequest.
struct FoundPort {
// Index of the matched port within the inputs or outputs vector;
// meaningful only when found() is true.
size_t idx;
// Which list the port was found in; NOT_FOUND (== 0) means the lookup failed.
enum class Type { NOT_FOUND = 0, INPUT, OUTPUT } type;

bool found() const {
return type != Type::NOT_FOUND;
}
bool is_input() const {
return type == Type::INPUT;
}
// NOTE(review): returns true for NOT_FOUND as well, since it only negates
// is_input(); callers presumably check found() first — confirm before relying
// on it for an unresolved port.
bool is_output() const {
return !is_input();
}
};

void check_network_precision(const ov::element::Type_t precision) const override;
void create_pipeline();
virtual void construct_pipeline();
/**
* @brief Finds input or output port
* @return structure which contains index of Input/Output or report that port wasn't found
* @see ov::ISyncInferRequest
*/
ZeroInferRequest::FoundPort find_port(const ov::Output<const ov::Node>& port) const;

std::shared_ptr<ZeroTensor>& get_level_zero_input(size_t index, size_t tensorNo = 0) const;
std::vector<std::shared_ptr<ZeroTensor>>& get_level_zero_inputs(size_t index) const;
void setup_pipeline();
virtual void create_pipeline_impl();

/**
* @brief Allocates a tensor on host and stores the reference inside multiple attributes.
Expand All @@ -61,39 +84,98 @@ class ZeroInferRequest : public SyncInferRequest {
* @param batchSize If provided, the value of the shape on the 0th axis is overridden with this value.
* @return Pointer towards the allocated tensor
*/
std::shared_ptr<ZeroTensor> allocate_tensor(const size_t index,
const bool isInput,
const std::optional<std::size_t>& batchSize = std::nullopt) const;
virtual std::shared_ptr<ZeroTensor> allocate_tensor(
const size_t index,
const bool isInput,
const std::optional<std::size_t>& batchSize = std::nullopt) const;

void initialize_states();
void add_state(const IODescriptor& descriptor, size_t tensorIndex) const;

void update_pipeline_if_memory_changed();
void update_states_if_memory_changed();

virtual void update_command_list_for_tensor(SyncInferRequest::FoundPort& foundPort,
const ov::SoPtr<ov::ITensor>& tensor);

virtual void update_command_list_for_tensors(SyncInferRequest::FoundPort& foundPort,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors,
std::optional<size_t> batchSizeCandidate = std::nullopt);
virtual void sync_zero_tensor_with_graph(const ZeroInferRequest::FoundPort& foundPort,
const ov::SoPtr<ov::ITensor>& tensor);
virtual void sync_zero_tensors_with_graph(const ZeroInferRequest::FoundPort& foundPort,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors,
const std::optional<size_t>& batchSize = std::nullopt);

virtual void prepare_inputs();
virtual void prepare_outputs();

/**
* @brief Basic checks for input/output tensor
*
* @param port Input/Output port
* @param tensor Input/Output tensor
*/
void check_tensor(const ov::Output<const ov::Node>& port,
const ov::SoPtr<ov::ITensor>& tensor,
const bool supportStrides) const;

/**
* @brief Basic checks for input tensors
*
* @param port Input port
* @param tensors Input tensors
*/
void check_batched_tensors(const ov::Output<const ov::Node>& port,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors,
const bool supportStrides) const;

bool is_batched_input(size_t idx) const;

/**
* @brief Check that all tensors are valid. Throws an exception if it's not.
*/
void check_tensors() const override;

ov::SoPtr<ov::ITensor>& get_user_input(size_t index) const;
std::vector<ov::SoPtr<ov::ITensor>>& get_user_inputs(size_t index) const;

std::shared_ptr<ZeroTensor>& get_level_zero_input(size_t index, size_t tensorNo = 0) const;
std::vector<std::shared_ptr<ZeroTensor>>& get_level_zero_inputs(size_t index) const;

void check_network_precision(const ov::element::Type_t precision) const;
std::vector<ov::ProfilingInfo> get_profiling_info() const override;

const std::shared_ptr<ZeroInitStructsHolder> _initStructs;

// This is intel_npu::ICompiledModel pointer, but need to use OV base class because
// ov::IInferRequest::get_compiled_model returns a reference to shared_ptr!
std::shared_ptr<const ov::ICompiledModel> _compiledModel;

const std::shared_ptr<IGraph> _graph;
NetworkMetadata _metadata;
const Config _config;
Logger _logger;

// In case set_tensors is called, we receive a vector with N tensors otherwise only 1 tensor is needed
mutable std::vector<std::vector<ov::SoPtr<ov::ITensor>>> _userInputTensors;
mutable std::vector<ov::SoPtr<ov::ITensor>> _userOutputTensors;

mutable std::vector<ov::SoPtr<ov::IVariableState>> _variableStates;

// A copy of each tensor is needed to maintain the original L0 memory allocation in case the user provides another
// memory area for the tensor.
mutable std::vector<std::vector<std::shared_ptr<ZeroTensor>>> _levelZeroInputTensors;
mutable std::vector<std::shared_ptr<ZeroTensor>> _levelZeroOutputTensors;

std::unique_ptr<Pipeline> _pipeline;

bool _pipelineIsCreated = false;
bool _dynamicBatchValueChanged = false;

Logger _logger;

/**
* @see ov::ISyncInferRequest
*/
mutable std::unordered_map<size_t, ZeroInferRequest::FoundPort> _cachedPorts;

/**
* @see ov::ISyncInferRequest
*/
mutable std::mutex _cacheMutex;
};

} // namespace intel_npu
5 changes: 2 additions & 3 deletions src/plugins/intel_npu/src/backend/src/zero_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,8 @@ ov::device::Type ZeroDevice::getDeviceType() const {
return ov::device::Type::INTEGRATED;
}

std::shared_ptr<SyncInferRequest> ZeroDevice::createInferRequest(
const std::shared_ptr<const ICompiledModel>& compiledModel,
const Config& config) {
std::shared_ptr<InferRequest> ZeroDevice::createInferRequest(const std::shared_ptr<const ICompiledModel>& compiledModel,
const Config& config) {
if (dynamic_cast<IDynamicGraph*>(compiledModel->get_graph().get())) {
return std::make_shared<ZeroDynamicInferRequest>(_initStructs, compiledModel, config);
}
Expand Down
Loading
Loading