
Commit

Create zero tensor class
Signed-off-by: Bogdan Pereanu <[email protected]>
pereanub committed Dec 16, 2024
1 parent 5cebf60 commit b1afd2c
Showing 10 changed files with 301 additions and 48 deletions.
src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
@@ -62,6 +62,13 @@ class ZeroInferRequest final : public SyncInferRequest {
std::shared_ptr<ov::ITensor>& get_level_zero_input(size_t index, size_t tensorNo = 0) const;
std::vector<std::shared_ptr<ov::ITensor>>& get_level_zero_inputs(size_t index) const;

std::shared_ptr<ov::ITensor> allocate_tensor(
const IODescriptor& descriptor,
const size_t index,
const bool isInput,
const ov::Allocator& allocator = {},
const std::optional<std::size_t> batchSize = std::nullopt) const override;

const std::shared_ptr<ZeroInitStructsHolder> _initStructs;
const std::shared_ptr<IGraph> _graph;
const Config _config;
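The override above accepts an optional allocator and an optional batch size; when a batch size is given, the implementation (shown later in this diff) writes it into the batch axis of the compiler-reported maximum shape. A minimal standalone sketch of that pattern — `resolve_shape` and its shapes are hypothetical, not part of the commit:

```cpp
#include <cstddef>
#include <optional>
#include <vector>

constexpr size_t BATCH_AXIS = 0;  // the implementation below also indexes the batch axis as 0

// Hypothetical stand-in for the shape fix-up in allocate_tensor: start from the
// compiler's maximum shape and override the batch axis only when a batch size is given.
std::vector<size_t> resolve_shape(std::vector<size_t> maxShape,
                                  std::optional<size_t> batchSize = std::nullopt) {
    if (batchSize.has_value()) {
        maxShape[BATCH_AXIS] = *batchSize;
    }
    return maxShape;
}

// resolve_shape({8, 3, 224, 224})    -> {8, 3, 224, 224}
// resolve_shape({8, 3, 224, 224}, 2) -> {2, 3, 224, 224}
```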
src/plugins/intel_npu/src/backend/include/zero_remote_tensor.hpp
@@ -14,7 +14,7 @@

namespace intel_npu {

-class ZeroRemoteTensor : public RemoteTensor {
+class ZeroRemoteTensor final : public RemoteTensor {
public:
ZeroRemoteTensor(const std::shared_ptr<ov::IRemoteContext>& context,
const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
@@ -25,6 +25,13 @@ class ZeroRemoteTensor : public RemoteTensor {
ov::intel_npu::MemType mem_type = ov::intel_npu::MemType::L0_INTERNAL_BUF,
void* mem = nullptr);

/**
* @brief Set a new shape for the tensor
* @note Memory reallocation may occur if the new shape exceeds the current capacity
* @param shape The new shape
*/
void set_shape(ov::Shape shape) override;

~ZeroRemoteTensor() override;

private:
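For context, this override makes NPU remote tensors resizable through the same public API as ordinary tensors. A usage sketch, assuming an NPU device is available and using the public `ov::RemoteContext` API (shapes and names are illustrative):

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;

    // The default remote context for the NPU device; the plugin backs the
    // tensors it creates with ZeroRemoteTensor.
    ov::RemoteContext context = core.get_default_context("NPU");
    ov::RemoteTensor tensor = context.create_tensor(ov::element::f32, ov::Shape{1, 3, 224, 224});

    // Shrinking stays within the already-allocated capacity; growing beyond it
    // reallocates, which per the implementation below requires a driver with
    // mutable command list support.
    tensor.set_shape(ov::Shape{1, 3, 112, 112});
    return 0;
}
```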
56 changes: 56 additions & 0 deletions src/plugins/intel_npu/src/backend/include/zero_tensor.hpp
@@ -0,0 +1,56 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <memory>
#include <mutex>

#include "intel_npu/config/config.hpp"
#include "intel_npu/utils/zero/zero_init.hpp"
#include "openvino/runtime/common.hpp"
#include "openvino/runtime/itensor.hpp"
#include "openvino/runtime/so_ptr.hpp"

namespace intel_npu {

class ZeroTensor final : public ov::ITensor {
public:
ZeroTensor(const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
const ov::element::Type element_type,
const ov::Shape& shape,
const ov::Allocator& allocator);

void* data(const ov::element::Type& element_type) const override;

const ov::element::Type& get_element_type() const override;

const ov::Shape& get_shape() const override;

void set_shape(ov::Shape new_shape) override;

const ov::Strides& get_strides() const override;

~ZeroTensor();

private:
static void initialize_elements(void* data, const ov::element::Type& element_type, const ov::Shape& shape);
void update_strides() const;
size_t get_capacity() const;
size_t get_bytes_capacity() const;
void destroy_elements(size_t begin_ind, size_t end_ind);
void destroy_memory();

std::shared_ptr<ZeroInitStructsHolder> _init_structs;

ov::element::Type _element_type;
ov::Shape _shape;
ov::Shape _capacity;
mutable ov::Strides _strides;
mutable std::once_flag _strides_once;
ov::Allocator _allocator;
void* _ptr = nullptr;
};

} // namespace intel_npu
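The private `update_strides` helper computes dense row-major byte strides; its implementation appears later in this diff. A standalone sketch of the same computation, for illustration only:

```cpp
#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

// Dense row-major byte strides: the innermost stride is the element size and
// each outer stride is the inner stride times the inner dimension, exactly as
// ZeroTensor::update_strides computes them.
std::vector<size_t> dense_strides(const std::vector<size_t>& shape, size_t element_size) {
    std::vector<size_t> strides(shape.size());
    if (shape.empty())
        return strides;
    strides.back() = shape.back() == 0 ? 0 : element_size;
    std::transform(shape.crbegin(), shape.crend() - 1,
                   strides.rbegin(), strides.rbegin() + 1,
                   std::multiplies<size_t>());
    return strides;
}

// Example: dense_strides({2, 3, 4}, sizeof(float)) -> {48, 16, 4}
```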
58 changes: 52 additions & 6 deletions src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
@@ -12,8 +12,8 @@
#include "intel_npu/utils/zero/zero_api.hpp"
#include "openvino/op/util/op_types.hpp"
#include "openvino/runtime/intel_npu/remote_properties.hpp"
#include "openvino/runtime/make_tensor.hpp"
#include "zero_memory.hpp"
#include "zero_tensor.hpp"

using namespace intel_npu;

@@ -191,7 +191,8 @@ void ZeroInferRequest::create_pipeline() {
continue;
}

_logger.debug("ZeroInferRequest::create_pipeline - allocate new tensor");
_logger.debug("ZeroInferRequest::create_pipeline - allocate new input tensor %s",
_metadata.inputs.at(inputIndex).nodeFriendlyName.c_str());
get_level_zero_input(inputIndex) = allocate_tensor(_metadata.inputs.at(inputIndex),
inputIndex,
INPUT,
@@ -206,7 +207,8 @@ _metadata.outputs.at(outputIndex).nodeFriendlyName.c_str());
_metadata.outputs.at(outputIndex).nodeFriendlyName.c_str());
continue;
}
_logger.debug("ZeroInferRequest::create_pipeline - allocate new tensor");
_logger.debug("ZeroInferRequest::create_pipeline - allocate new output tensor %s",
_metadata.outputs.at(outputIndex).nodeFriendlyName.c_str());
_levelZeroOutputTensors.at(outputIndex) = allocate_tensor(_metadata.outputs.at(outputIndex),
outputIndex,
OUTPUT,
@@ -221,7 +223,7 @@ void ZeroInferRequest::create_pipeline() {
continue;
}

-        const IODescriptor inputDescriptor = _metadata.outputs.at(inputIndex);
+        const IODescriptor inputDescriptor = _metadata.inputs.at(inputIndex);
if (inputDescriptor.isShapeTensor || inputDescriptor.isStateInput) {
continue;
}
@@ -230,6 +232,7 @@
if (levelZeroRemoteTensor == nullptr) {
_originalMemoryIdInputLevelZeroTensor.at(inputIndex) =
get_memory_id(_initStructs->getContext(), get_level_zero_input(inputIndex)->data());

} else {
void* levelZeroBuffer =
extract_object(levelZeroRemoteTensor->get_properties(), ov::intel_npu::mem_handle);
@@ -509,13 +512,15 @@ ov::SoPtr<ov::ITensor> ZeroInferRequest::get_tensor(const ov::Output<const ov::N
return userTensors;
}

_logger.debug("ZeroInferRequest::get_tensor - tensor is not allocated, create the tensor");
auto& metadata = isInput ? _metadata.inputs.at(ioIndex) : _metadata.outputs.at(ioIndex);
_logger.debug("ZeroInferRequest::get_tensor - tensor is not allocated, create tensor %s",
metadata.nodeFriendlyName.c_str());

auto& levelZeroTensors = isInput ? get_level_zero_input(ioIndex) : _levelZeroOutputTensors.at(ioIndex);
auto& tensorCreatedLocally =
isInput ? _inputLevelZeroTensorCreatedLocally.at(ioIndex) : _outputLevelZeroTensorCreatedLocally.at(ioIndex);

-    levelZeroTensors = allocate_tensor(isInput ? _metadata.inputs.at(ioIndex) : _metadata.outputs.at(ioIndex),
+    levelZeroTensors = allocate_tensor(metadata,
ioIndex,
isInput,
isInput ? *_inputAllocator : *_outputAllocator,
@@ -847,6 +852,47 @@ std::vector<ov::ProfilingInfo> ZeroInferRequest::get_profiling_info() const {
}
}

std::shared_ptr<ov::ITensor> ZeroInferRequest::allocate_tensor(const IODescriptor& descriptor,
const size_t index,
const bool isInput,
const ov::Allocator& allocator,
const std::optional<std::size_t> batchSize) const {
check_network_precision(descriptor.precision);

std::shared_ptr<ov::ITensor> tensor;
ov::Shape allocatedTensorShape = descriptor.shapeFromCompiler.get_max_shape();

if (batchSize.has_value()) {
allocatedTensorShape[BATCH_AXIS] = *batchSize;
}

if (descriptor.isStateOutput) {
// Only one buffer is required for each (state input, state output) pair, acting as an input before running the
// inference and as an output after performing it. Thus both the "state input" and "state output" entries shall
// point to the same buffer.
OPENVINO_ASSERT(descriptor.relatedDescriptorIndex.has_value(),
"The link between state descriptors is missing, state name: ",
descriptor.nameFromCompiler);
tensor = get_user_input(*descriptor.relatedDescriptorIndex)._ptr;
} else {
tensor = std::make_shared<ZeroTensor>(_initStructs, descriptor.precision, allocatedTensorShape, allocator);
}

if (isInput) {
if (get_user_input(index) == nullptr) {
get_user_input(index) = tensor;
}

if (descriptor.isStateInput) {
_variableStates.push_back(std::make_shared<VariableState>(descriptor.nameFromCompiler, tensor));
}
} else if (_userOutputTensors.at(index) == nullptr) {
_userOutputTensors.at(index) = tensor;
}

return tensor;
}

std::vector<uint8_t> ZeroInferRequest::get_raw_profiling_data() const {
return _profilingQuery.getData<uint8_t>();
}
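The `isStateOutput` branch of `allocate_tensor` above depends on a single buffer serving both halves of a (state input, state output) pair. A toy illustration of that aliasing using the public `ov::Tensor` (the shape is made up):

```cpp
#include <cassert>
#include <memory>
#include <openvino/runtime/tensor.hpp>

int main() {
    // One buffer backs both state entries: the inference reads the previous
    // state from it and writes the updated state back into the same memory.
    auto state = std::make_shared<ov::Tensor>(ov::element::f32, ov::Shape{1, 128});

    std::shared_ptr<ov::Tensor> stateInput = state;   // consumed before inference
    std::shared_ptr<ov::Tensor> stateOutput = state;  // produced after inference

    assert(stateInput->data() == stateOutput->data());
    return 0;
}
```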
25 changes: 25 additions & 0 deletions src/plugins/intel_npu/src/backend/src/zero_remote_tensor.cpp
@@ -147,6 +147,31 @@ void ZeroRemoteTensor::allocate(const size_t bytes) {
update_strides();
}

void ZeroRemoteTensor::set_shape(ov::Shape new_shape) {
if (_shape == new_shape)
return;

_shape = std::move(new_shape);

if (ov::shape_size(_shape) > ov::shape_size(_capacity)) {
if (!_init_structs->getMutableCommandListVersion()) {
OPENVINO_THROW("Cannot set a larger shape with this driver version.");
}

if (!deallocate()) {
OPENVINO_THROW("Cannot deallocate tensor while an attempt to enlarge tensor area in set_shape.");
}

_capacity = _shape;

const auto byte_size = ov::element::get_memory_size(_element_type, shape_size(_shape));
allocate(byte_size);
}

_strides.clear();
update_strides();
}

bool ZeroRemoteTensor::is_allocated() const noexcept {
return _data != nullptr;
}
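Both `set_shape` overrides in this commit reallocate only when the requested element count exceeds the stored capacity, and size the new buffer in bytes. A worked example of that check, assuming f32 elements and illustrative shapes:

```cpp
#include <iostream>
#include <openvino/core/shape.hpp>
#include <openvino/core/type/element_iterator.hpp>

int main() {
    ov::Shape capacity{1, 3, 224, 224};   // shape backing the current allocation
    ov::Shape requested{1, 3, 448, 448};  // new shape passed to set_shape

    // Element count decides whether to reallocate; byte size drives the allocation.
    if (ov::shape_size(requested) > ov::shape_size(capacity)) {
        const auto bytes = ov::element::get_memory_size(ov::element::f32, ov::shape_size(requested));
        std::cout << "reallocating " << bytes << " bytes\n";  // 1*3*448*448 * 4 = 2408448
    }
    return 0;
}
```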
139 changes: 139 additions & 0 deletions src/plugins/intel_npu/src/backend/src/zero_tensor.cpp
@@ -0,0 +1,139 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "zero_tensor.hpp"

#include "openvino/core/type/element_iterator.hpp"
#include "openvino/runtime/properties.hpp"
#include "openvino/runtime/tensor.hpp"

namespace intel_npu {

ZeroTensor::ZeroTensor(const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
const ov::element::Type element_type,
const ov::Shape& shape,
const ov::Allocator& allocator)
: _init_structs(init_structs),
_element_type{element_type},
_shape{shape},
_capacity{_shape},
_strides{},
_strides_once{},
_allocator{allocator} {
OPENVINO_ASSERT(_element_type != ov::element::undefined && _element_type.is_static());
OPENVINO_ASSERT(allocator, "Allocator was not initialized");
const auto byte_size = ov::element::get_memory_size(_element_type, shape_size(_shape));
auto data = const_cast<ov::Allocator&>(_allocator).allocate(byte_size);
OPENVINO_ASSERT(byte_size == 0 || data != nullptr, "Failed to allocate memory");
initialize_elements(data, element_type, _shape);
_ptr = data;
}

void* ZeroTensor::data(const ov::element::Type& element_type) const {
if (element_type != ov::element::undefined && element_type != ov::element::dynamic &&
(element_type.bitwidth() != get_element_type().bitwidth() ||
element_type.is_real() != get_element_type().is_real() ||
(element_type == ov::element::string && get_element_type() != ov::element::string) ||
(element_type != ov::element::string && get_element_type() == ov::element::string))) {
OPENVINO_THROW("Tensor data with element type ",
get_element_type(),
", is not representable as pointer to ",
element_type);
}
return _ptr;
}

const ov::element::Type& ZeroTensor::get_element_type() const {
return _element_type;
}

const ov::Shape& ZeroTensor::get_shape() const {
return _shape;
}

void ZeroTensor::update_strides() const {
if (_element_type.bitwidth() < 8)
return;

auto& shape = get_shape();
if (_strides.empty() && !shape.empty()) {
_strides.resize(shape.size());
_strides.back() = shape.back() == 0 ? 0 : _element_type.size();
std::transform(shape.crbegin(),
shape.crend() - 1,
_strides.rbegin(),
_strides.rbegin() + 1,
std::multiplies<size_t>());
}
}

const ov::Strides& ZeroTensor::get_strides() const {
OPENVINO_ASSERT(_element_type.bitwidth() >= 8,
"Could not get strides for types with bitwidths less then 8 bit. Tensor type: ",
_element_type);
std::call_once(_strides_once, &ZeroTensor::update_strides, this);
return _strides;
}

void ZeroTensor::initialize_elements(void* data, const ov::element::Type& element_type, const ov::Shape& shape) {
if (element_type == ov::element::Type_t::string) {
auto num_elements = shape_size(shape);
auto string_ptr = static_cast<std::string*>(data);
std::uninitialized_fill_n(string_ptr, num_elements, std::string());
}
}

size_t ZeroTensor::get_capacity() const {
return shape_size(_capacity);
}

size_t ZeroTensor::get_bytes_capacity() const {
return ov::element::get_memory_size(get_element_type(), get_capacity());
}

void ZeroTensor::destroy_elements(size_t begin_ind, size_t end_ind) {
// Destroys objects in the range [begin_ind, end_ind); only non-trivial element
// types (std::string) require explicit destruction.
if (get_element_type() == ov::element::Type_t::string) {
auto strings = static_cast<std::string*>(_ptr);
for (size_t ind = begin_ind; ind < end_ind; ++ind) {
using std::string;
strings[ind].~string();
}
}
}

void ZeroTensor::destroy_memory() {
destroy_elements(0, get_capacity());
_allocator.deallocate(_ptr, get_bytes_capacity());
_ptr = nullptr;
}

void ZeroTensor::set_shape(ov::Shape new_shape) {
if (_shape == new_shape)
return;

_shape = std::move(new_shape);

if (get_size() > get_capacity()) {
if (!_init_structs->getMutableCommandListVersion()) {
OPENVINO_THROW("Cannot set a larger shape with this driver version.");
}

destroy_memory();

// allocate buffer and initialize objects from scratch
_capacity = _shape;
_ptr = _allocator.allocate(get_bytes_capacity());
initialize_elements(_ptr, _element_type, _shape);
}

_strides.clear();
update_strides();
}

ZeroTensor::~ZeroTensor() {
destroy_memory();
}

} // namespace intel_npu
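`ZeroTensor::set_shape` above follows the same contract as the public `ov::Tensor::set_shape`: shrinking stays within the existing capacity, while growing reallocates (and, for ZeroTensor, additionally requires mutable command list support in the driver). A quick illustration of the contract with the public class:

```cpp
#include <iostream>
#include <openvino/runtime/tensor.hpp>

int main() {
    ov::Tensor t(ov::element::f32, ov::Shape{2, 3, 4});
    void* before = t.data();

    t.set_shape(ov::Shape{2, 3, 2});  // shrink: typically reuses the buffer
    std::cout << (t.data() == before ? "buffer reused\n" : "buffer reallocated\n");

    t.set_shape(ov::Shape{4, 3, 4});  // grow past capacity: reallocates
    std::cout << "new byte size: " << t.get_byte_size() << "\n";  // 48 elements * 4 B = 192
    return 0;
}
```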