
Commit

Create zero tensor class
Signed-off-by: Bogdan Pereanu <[email protected]>
pereanub committed Dec 16, 2024
1 parent 5cebf60 commit b1afd2c
Showing 10 changed files with 301 additions and 48 deletions.
src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
@@ -62,6 +62,13 @@ class ZeroInferRequest final : public SyncInferRequest {
std::shared_ptr<ov::ITensor>& get_level_zero_input(size_t index, size_t tensorNo = 0) const;
std::vector<std::shared_ptr<ov::ITensor>>& get_level_zero_inputs(size_t index) const;

std::shared_ptr<ov::ITensor> allocate_tensor(
const IODescriptor& descriptor,
const size_t index,
const bool isInput,
const ov::Allocator& allocator = {},
const std::optional<std::size_t> batchSize = std::nullopt) const override;

const std::shared_ptr<ZeroInitStructsHolder> _initStructs;
const std::shared_ptr<IGraph> _graph;
const Config _config;
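The override above accepts an optional allocator and an optional batch size; when a batch size is given, the implementation (shown later in this diff) writes it into the batch axis of the compiler-reported maximum shape. A minimal standalone sketch of that pattern — `resolve_shape` and its shapes are hypothetical, not part of the commit:

```cpp
#include <cstddef>
#include <optional>
#include <vector>

constexpr size_t BATCH_AXIS = 0;  // the implementation below also indexes the batch axis as 0

// Hypothetical stand-in for the shape fix-up in allocate_tensor: start from the
// compiler's maximum shape and override the batch axis only when a batch size is given.
std::vector<size_t> resolve_shape(std::vector<size_t> maxShape,
                                  std::optional<size_t> batchSize = std::nullopt) {
    if (batchSize.has_value()) {
        maxShape[BATCH_AXIS] = *batchSize;
    }
    return maxShape;
}

// resolve_shape({8, 3, 224, 224})    -> {8, 3, 224, 224}
// resolve_shape({8, 3, 224, 224}, 2) -> {2, 3, 224, 224}
```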
src/plugins/intel_npu/src/backend/include/zero_remote_tensor.hpp
@@ -14,7 +14,7 @@

namespace intel_npu {

-class ZeroRemoteTensor : public RemoteTensor {
+class ZeroRemoteTensor final : public RemoteTensor {
public:
ZeroRemoteTensor(const std::shared_ptr<ov::IRemoteContext>& context,
const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
@@ -25,6 +25,13 @@ class ZeroRemoteTensor : public RemoteTensor {
ov::intel_npu::MemType mem_type = ov::intel_npu::MemType::L0_INTERNAL_BUF,
void* mem = nullptr);

/**
* @brief Set a new shape for the tensor
* @note Memory reallocation may occur if the new shape exceeds the current capacity
* @param shape The new shape
*/
void set_shape(ov::Shape shape) override;

~ZeroRemoteTensor() override;

private:
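For context, this override makes NPU remote tensors resizable through the same public API as ordinary tensors. A usage sketch, assuming an NPU device is available and using the public `ov::RemoteContext` API (shapes and names are illustrative):

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;

    // The default remote context for the NPU device; the plugin backs the
    // tensors it creates with ZeroRemoteTensor.
    ov::RemoteContext context = core.get_default_context("NPU");
    ov::RemoteTensor tensor = context.create_tensor(ov::element::f32, ov::Shape{1, 3, 224, 224});

    // Shrinking stays within the already-allocated capacity; growing beyond it
    // reallocates, which per the implementation below requires a driver with
    // mutable command list support.
    tensor.set_shape(ov::Shape{1, 3, 112, 112});
    return 0;
}
```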
56 changes: 56 additions & 0 deletions src/plugins/intel_npu/src/backend/include/zero_tensor.hpp
@@ -0,0 +1,56 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <memory>
#include <mutex>

#include "intel_npu/config/config.hpp"
#include "intel_npu/utils/zero/zero_init.hpp"
#include "openvino/runtime/common.hpp"
#include "openvino/runtime/itensor.hpp"
#include "openvino/runtime/so_ptr.hpp"

namespace intel_npu {

class ZeroTensor final : public ov::ITensor {
public:
ZeroTensor(const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
const ov::element::Type element_type,
const ov::Shape& shape,
const ov::Allocator& allocator);

void* data(const ov::element::Type& element_type) const override;

const ov::element::Type& get_element_type() const override;

const ov::Shape& get_shape() const override;

void set_shape(ov::Shape new_shape) override;

const ov::Strides& get_strides() const override;

~ZeroTensor();

private:
static void initialize_elements(void* data, const ov::element::Type& element_type, const ov::Shape& shape);
void update_strides() const;
size_t get_capacity() const;
size_t get_bytes_capacity() const;
void destroy_elements(size_t begin_ind, size_t end_ind);
void destroy_memory();

std::shared_ptr<ZeroInitStructsHolder> _init_structs;

ov::element::Type _element_type;
ov::Shape _shape;
ov::Shape _capacity;
mutable ov::Strides _strides;
mutable std::once_flag _strides_once;
ov::Allocator _allocator;
void* _ptr = nullptr;
};

} // namespace intel_npu
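The private `update_strides` helper computes dense row-major byte strides; its implementation appears later in this diff. A standalone sketch of the same computation, for illustration only:

```cpp
#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

// Dense row-major byte strides: the innermost stride is the element size and
// each outer stride is the inner stride times the inner dimension, exactly as
// ZeroTensor::update_strides computes them.
std::vector<size_t> dense_strides(const std::vector<size_t>& shape, size_t element_size) {
    std::vector<size_t> strides(shape.size());
    if (shape.empty())
        return strides;
    strides.back() = shape.back() == 0 ? 0 : element_size;
    std::transform(shape.crbegin(), shape.crend() - 1,
                   strides.rbegin(), strides.rbegin() + 1,
                   std::multiplies<size_t>());
    return strides;
}

// Example: dense_strides({2, 3, 4}, sizeof(float)) -> {48, 16, 4}
```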
58 changes: 52 additions & 6 deletions src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
@@ -12,8 +12,8 @@
#include "intel_npu/utils/zero/zero_api.hpp"
#include "openvino/op/util/op_types.hpp"
#include "openvino/runtime/intel_npu/remote_properties.hpp"
#include "openvino/runtime/make_tensor.hpp"
#include "zero_memory.hpp"
#include "zero_tensor.hpp"

using namespace intel_npu;

@@ -191,7 +191,8 @@ void ZeroInferRequest::create_pipeline() {
continue;
}

_logger.debug("ZeroInferRequest::create_pipeline - allocate new tensor");
_logger.debug("ZeroInferRequest::create_pipeline - allocate new input tensor %s",
_metadata.inputs.at(inputIndex).nodeFriendlyName.c_str());
get_level_zero_input(inputIndex) = allocate_tensor(_metadata.inputs.at(inputIndex),
inputIndex,
INPUT,
@@ -206,7 +207,8 @@ _metadata.outputs.at(outputIndex).nodeFriendlyName.c_str());
_metadata.outputs.at(outputIndex).nodeFriendlyName.c_str());
continue;
}
_logger.debug("ZeroInferRequest::create_pipeline - allocate new tensor");
_logger.debug("ZeroInferRequest::create_pipeline - allocate new output tensor %s",
_metadata.outputs.at(outputIndex).nodeFriendlyName.c_str());
_levelZeroOutputTensors.at(outputIndex) = allocate_tensor(_metadata.outputs.at(outputIndex),
outputIndex,
OUTPUT,
@@ -221,7 +223,7 @@ void ZeroInferRequest::create_pipeline() {
continue;
}

-        const IODescriptor inputDescriptor = _metadata.outputs.at(inputIndex);
+        const IODescriptor inputDescriptor = _metadata.inputs.at(inputIndex);
if (inputDescriptor.isShapeTensor || inputDescriptor.isStateInput) {
continue;
}
@@ -230,6 +232,7 @@
if (levelZeroRemoteTensor == nullptr) {
_originalMemoryIdInputLevelZeroTensor.at(inputIndex) =
get_memory_id(_initStructs->getContext(), get_level_zero_input(inputIndex)->data());

} else {
void* levelZeroBuffer =
extract_object(levelZeroRemoteTensor->get_properties(), ov::intel_npu::mem_handle);
@@ -509,13 +512,15 @@ ov::SoPtr<ov::ITensor> ZeroInferRequest::get_tensor(const ov::Output<const ov::N
return userTensors;
}

_logger.debug("ZeroInferRequest::get_tensor - tensor is not allocated, create the tensor");
auto& metadata = isInput ? _metadata.inputs.at(ioIndex) : _metadata.outputs.at(ioIndex);
_logger.debug("ZeroInferRequest::get_tensor - tensor is not allocated, create tensor %s",
metadata.nodeFriendlyName.c_str());

auto& levelZeroTensors = isInput ? get_level_zero_input(ioIndex) : _levelZeroOutputTensors.at(ioIndex);
auto& tensorCreatedLocally =
isInput ? _inputLevelZeroTensorCreatedLocally.at(ioIndex) : _outputLevelZeroTensorCreatedLocally.at(ioIndex);

-    levelZeroTensors = allocate_tensor(isInput ? _metadata.inputs.at(ioIndex) : _metadata.outputs.at(ioIndex),
+    levelZeroTensors = allocate_tensor(metadata,
ioIndex,
isInput,
isInput ? *_inputAllocator : *_outputAllocator,
@@ -847,6 +852,47 @@ std::vector<ov::ProfilingInfo> ZeroInferRequest::get_profiling_info() const {
}
}

std::shared_ptr<ov::ITensor> ZeroInferRequest::allocate_tensor(const IODescriptor& descriptor,
const size_t index,
const bool isInput,
const ov::Allocator& allocator,
const std::optional<std::size_t> batchSize) const {
check_network_precision(descriptor.precision);

std::shared_ptr<ov::ITensor> tensor;
ov::Shape allocatedTensorShape = descriptor.shapeFromCompiler.get_max_shape();

if (batchSize.has_value()) {
allocatedTensorShape[BATCH_AXIS] = *batchSize;
}

if (descriptor.isStateOutput) {
// Only one buffer is required for each (state input, state output) pair, acting as an input before running the
// inference and as an output after performing it. Thus both the "state input" and "state output" entries shall
// point to the same buffer.
OPENVINO_ASSERT(descriptor.relatedDescriptorIndex.has_value(),
"The link between state descriptors is missing, state name: ",
descriptor.nameFromCompiler);
tensor = get_user_input(*descriptor.relatedDescriptorIndex)._ptr;
} else {
tensor = std::make_shared<ZeroTensor>(_initStructs, descriptor.precision, allocatedTensorShape, allocator);
}

if (isInput) {
if (get_user_input(index) == nullptr) {
get_user_input(index) = tensor;
}

if (descriptor.isStateInput) {
_variableStates.push_back(std::make_shared<VariableState>(descriptor.nameFromCompiler, tensor));
}
} else if (_userOutputTensors.at(index) == nullptr) {
_userOutputTensors.at(index) = tensor;
}

return tensor;
}

std::vector<uint8_t> ZeroInferRequest::get_raw_profiling_data() const {
return _profilingQuery.getData<uint8_t>();
}
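The `isStateOutput` branch of `allocate_tensor` above depends on a single buffer serving both halves of a (state input, state output) pair. A toy illustration of that aliasing using the public `ov::Tensor` (the shape is made up):

```cpp
#include <cassert>
#include <memory>
#include <openvino/runtime/tensor.hpp>

int main() {
    // One buffer backs both state entries: the inference reads the previous
    // state from it and writes the updated state back into the same memory.
    auto state = std::make_shared<ov::Tensor>(ov::element::f32, ov::Shape{1, 128});

    std::shared_ptr<ov::Tensor> stateInput = state;   // consumed before inference
    std::shared_ptr<ov::Tensor> stateOutput = state;  // produced after inference

    assert(stateInput->data() == stateOutput->data());
    return 0;
}
```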
25 changes: 25 additions & 0 deletions src/plugins/intel_npu/src/backend/src/zero_remote_tensor.cpp
@@ -147,6 +147,31 @@ void ZeroRemoteTensor::allocate(const size_t bytes) {
update_strides();
}

void ZeroRemoteTensor::set_shape(ov::Shape new_shape) {
if (_shape == new_shape)
return;

_shape = std::move(new_shape);

if (ov::shape_size(_shape) > ov::shape_size(_capacity)) {
if (!_init_structs->getMutableCommandListVersion()) {
OPENVINO_THROW("Cannot set a larger shape with this driver version.");
}

if (!deallocate()) {
OPENVINO_THROW("Cannot deallocate tensor while an attempt to enlarge tensor area in set_shape.");
}

_capacity = _shape;

const auto byte_size = ov::element::get_memory_size(_element_type, shape_size(_shape));
allocate(byte_size);
}

_strides.clear();
update_strides();
}

bool ZeroRemoteTensor::is_allocated() const noexcept {
return _data != nullptr;
}
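Both `set_shape` overrides in this commit reallocate only when the requested element count exceeds the stored capacity, and size the new buffer in bytes. A worked example of that check, assuming f32 elements and illustrative shapes:

```cpp
#include <iostream>
#include <openvino/core/shape.hpp>
#include <openvino/core/type/element_iterator.hpp>

int main() {
    ov::Shape capacity{1, 3, 224, 224};   // shape backing the current allocation
    ov::Shape requested{1, 3, 448, 448};  // new shape passed to set_shape

    // Element count decides whether to reallocate; byte size drives the allocation.
    if (ov::shape_size(requested) > ov::shape_size(capacity)) {
        const auto bytes = ov::element::get_memory_size(ov::element::f32, ov::shape_size(requested));
        std::cout << "reallocating " << bytes << " bytes\n";  // 1*3*448*448 * 4 = 2408448
    }
    return 0;
}
```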
139 changes: 139 additions & 0 deletions src/plugins/intel_npu/src/backend/src/zero_tensor.cpp
@@ -0,0 +1,139 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "zero_tensor.hpp"

#include "openvino/core/type/element_iterator.hpp"
#include "openvino/runtime/properties.hpp"
#include "openvino/runtime/tensor.hpp"

namespace intel_npu {

ZeroTensor::ZeroTensor(const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
const ov::element::Type element_type,
const ov::Shape& shape,
const ov::Allocator& allocator)
: _init_structs(init_structs),
_element_type{element_type},
_shape{shape},
_capacity{_shape},
_strides{},
_strides_once{},
_allocator{allocator} {
OPENVINO_ASSERT(_element_type != ov::element::undefined && _element_type.is_static());
OPENVINO_ASSERT(allocator, "Allocator was not initialized");
const auto byte_size = ov::element::get_memory_size(_element_type, shape_size(_shape));
auto data = const_cast<ov::Allocator&>(_allocator).allocate(byte_size);
OPENVINO_ASSERT(byte_size == 0 || data != nullptr, "Failed to allocate memory");
initialize_elements(data, element_type, _shape);
_ptr = data;
}

void* ZeroTensor::data(const ov::element::Type& element_type) const {
if (element_type != ov::element::undefined && element_type != ov::element::dynamic &&
(element_type.bitwidth() != get_element_type().bitwidth() ||
element_type.is_real() != get_element_type().is_real() ||
(element_type == ov::element::string && get_element_type() != ov::element::string) ||
(element_type != ov::element::string && get_element_type() == ov::element::string))) {
OPENVINO_THROW("Tensor data with element type ",
get_element_type(),
", is not representable as pointer to ",
element_type);
}
return _ptr;
}

const ov::element::Type& ZeroTensor::get_element_type() const {
return _element_type;
}

const ov::Shape& ZeroTensor::get_shape() const {
return _shape;
}

void ZeroTensor::update_strides() const {
if (_element_type.bitwidth() < 8)
return;

auto& shape = get_shape();
if (_strides.empty() && !shape.empty()) {
_strides.resize(shape.size());
_strides.back() = shape.back() == 0 ? 0 : _element_type.size();
std::transform(shape.crbegin(),
shape.crend() - 1,
_strides.rbegin(),
_strides.rbegin() + 1,
std::multiplies<size_t>());
}
}

const ov::Strides& ZeroTensor::get_strides() const {
OPENVINO_ASSERT(_element_type.bitwidth() >= 8,
"Could not get strides for types with bitwidths less then 8 bit. Tensor type: ",
_element_type);
std::call_once(_strides_once, &ZeroTensor::update_strides, this);
return _strides;
}

void ZeroTensor::initialize_elements(void* data, const ov::element::Type& element_type, const ov::Shape& shape) {
if (element_type == ov::element::Type_t::string) {
auto num_elements = shape_size(shape);
auto string_ptr = static_cast<std::string*>(data);
std::uninitialized_fill_n(string_ptr, num_elements, std::string());
}
}

size_t ZeroTensor::get_capacity() const {
return shape_size(_capacity);
}

size_t ZeroTensor::get_bytes_capacity() const {
return ov::element::get_memory_size(get_element_type(), get_capacity());
}

void ZeroTensor::destroy_elements(size_t begin_ind, size_t end_ind) {
// Destroys objects in the range [begin_ind, end_ind); only non-trivial element
// types (std::string) require explicit destruction.
if (get_element_type() == ov::element::Type_t::string) {
auto strings = static_cast<std::string*>(_ptr);
for (size_t ind = begin_ind; ind < end_ind; ++ind) {
using std::string;
strings[ind].~string();
}
}
}

void ZeroTensor::destroy_memory() {
destroy_elements(0, get_capacity());
_allocator.deallocate(_ptr, get_bytes_capacity());
_ptr = nullptr;
}

void ZeroTensor::set_shape(ov::Shape new_shape) {
if (_shape == new_shape)
return;

_shape = std::move(new_shape);

if (get_size() > get_capacity()) {
if (!_init_structs->getMutableCommandListVersion()) {
OPENVINO_THROW("Cannot set a larger shape with this driver version.");
}

destroy_memory();

// allocate buffer and initialize objects from scratch
_capacity = _shape;
_ptr = _allocator.allocate(get_bytes_capacity());
initialize_elements(_ptr, _element_type, _shape);
}

_strides.clear();
update_strides();
}

ZeroTensor::~ZeroTensor() {
destroy_memory();
}

} // namespace intel_npu
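`ZeroTensor::set_shape` above follows the same contract as the public `ov::Tensor::set_shape`: shrinking stays within the existing capacity, while growing reallocates (and, for ZeroTensor, additionally requires mutable command list support in the driver). A quick illustration of the contract with the public class:

```cpp
#include <iostream>
#include <openvino/runtime/tensor.hpp>

int main() {
    ov::Tensor t(ov::element::f32, ov::Shape{2, 3, 4});
    void* before = t.data();

    t.set_shape(ov::Shape{2, 3, 2});  // shrink: typically reuses the buffer
    std::cout << (t.data() == before ? "buffer reused\n" : "buffer reallocated\n");

    t.set_shape(ov::Shape{4, 3, 4});  // grow past capacity: reallocates
    std::cout << "new byte size: " << t.get_byte_size() << "\n";  // 48 elements * 4 B = 192
    return 0;
}
```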