[NPU] Re-create zero tensor for internal usage if it was shared with user (#28438)

### Details:
 - *Hotfix for GenAI*
 - *Re-create the internal Level Zero tensor if it has already been shared with the user, instead of re-using the shared memory (see the sketch below)*
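
Roughly, `set_tensor_data` now allocates a fresh internal Level Zero tensor whenever the current one has already been handed out through `get_tensor`, instead of writing over the buffer the user now holds; the command list is then repointed at the new allocation. Below is a minimal, self-contained sketch of that decision using toy types — it is an illustration only, not the plugin's real API (the actual logic lives in `zero_infer_request.cpp` / `zero_tensor.cpp`):

```cpp
// Toy model of the reworked set_tensor_data() decision (illustration only).
#include <iostream>
#include <memory>

struct ToyZeroTensor {
    bool shared_with_user = false;  // mirrors ZeroTensor::_tensor_shared_with_user
};

// Returns true when the command-list argument has to be refreshed.
bool on_set_tensor(bool user_buffer_in_same_l0_context, std::shared_ptr<ToyZeroTensor>& internal_tensor) {
    if (user_buffer_in_same_l0_context) {
        // The user's buffer is already usable by the device: adopt it as-is.
        return true;
    }
    if (internal_tensor && internal_tensor->shared_with_user) {
        // The internal tensor was handed out through get_tensor(): re-create it, so
        // the user's copy keeps its contents instead of being reused as scratch memory.
        internal_tensor = std::make_shared<ToyZeroTensor>();
        return true;
    }
    return false;  // keep the existing internal tensor; no command-list update needed
}

int main() {
    auto internal = std::make_shared<ToyZeroTensor>();
    internal->shared_with_user = true;                         // user called get_tensor()
    std::cout << on_set_tensor(false, internal) << std::endl;  // prints 1: tensor re-created
}
```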

### Tickets:
 - *CVS-160546*

---------

Signed-off-by: Bogdan Pereanu <[email protected]>
pereanub authored Jan 15, 2025
1 parent 5c55539 commit 2f5af17
Showing 5 changed files with 163 additions and 23 deletions.
6 changes: 5 additions & 1 deletion src/plugins/intel_npu/src/backend/include/zero_tensor.hpp
@@ -1,4 +1,4 @@
// Copyright (C) 2018-2024 Intel Corporation
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

@@ -41,6 +41,9 @@ class ZeroTensor final : public ov::ITensor {
bool memory_address_changed();
void reset_memory_flag();

bool tensor_was_shared_with_user();
void set_tensor_shared_with_user();

~ZeroTensor();

private:
@@ -61,6 +64,7 @@ class ZeroTensor final : public ov::ITensor {
ov::Allocator _allocator;
void* _ptr = nullptr;
bool _reset_tensor_memory = false;
bool _tensor_shared_with_user = false;
};

} // namespace intel_npu
56 changes: 35 additions & 21 deletions src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
@@ -226,14 +226,16 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tenso
OV_ITT_TASK_CHAIN(ZERO_SET_TENSOR, itt::domains::LevelZeroBackend, "set_tensor", "set_tensor_data");
auto& levelZeroTensors = isInput ? get_level_zero_input(index) : _levelZeroOutputTensors.at(index);

const auto& zeroTensor = std::dynamic_pointer_cast<ZeroTensor>(tensor);
bool updateCommandListArg = false;

if (zeroTensor == nullptr) {
OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "check_data_allocation");
if (memory_was_allocated_in_the_same_l0_context(_initStructs->getContext(), tensor->data())) {
_logger.debug("ZeroInferRequest::set_tensor_data - tensor was created in the same L0 context");
levelZeroTensors = tensor;
} else {
OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "check_data_allocation");
if (memory_was_allocated_in_the_same_l0_context(_initStructs->getContext(), tensor->data())) {
_logger.debug("ZeroInferRequest::set_tensor_data - tensor was created in the same L0 context");
levelZeroTensors = tensor;
updateCommandListArg = true;
} else {
auto zeroTensor = std::dynamic_pointer_cast<ZeroTensor>(levelZeroTensors);
if (zeroTensor != nullptr && zeroTensor->tensor_was_shared_with_user()) {
_logger.debug("ZeroInferRequest::set_tensor_data - create locally L0 tensor");
OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "allocate tensor");

@@ -242,20 +244,22 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tenso
isInput,
isInput ? *_inputAllocator : *_outputAllocator,
_graph->get_batch_size());

updateCommandListArg = true;
}
}

if (_pipelineIsCreated) {
_logger.debug("ZeroInferRequest::infer_async - update command list");
if (_pipelineIsCreated && updateCommandListArg) {
_logger.debug("ZeroInferRequest::infer_async - update command list");

OPENVINO_ASSERT(levelZeroTensors->data(), "Empty buffer");
OPENVINO_ASSERT(levelZeroTensors->data(), "Empty buffer");

OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "updateCommandList");
_pipeline->updateCommandList(isInput ? _graph->get_input_descriptors().at(index).idx
: _graph->get_output_descriptors().at(index).idx,
levelZeroTensors->data(),
levelZeroTensors->get_byte_size());
_pipeline->closeCommandList();
}
OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "updateCommandList");
_pipeline->updateCommandList(
isInput ? _graph->get_input_descriptors().at(index).idx : _graph->get_output_descriptors().at(index).idx,
levelZeroTensors->data(),
levelZeroTensors->get_byte_size());
_pipeline->closeCommandList();
}
}

@@ -270,15 +274,15 @@ void ZeroInferRequest::set_remote_tensor_data(const std::shared_ptr<ZeroRemoteTe
OPENVINO_THROW("Using different context for creating the tensor is not supported");
}

auto data = extract_object(tensor->get_properties(), ov::intel_npu::mem_handle);
OPENVINO_ASSERT(data, "Empty buffer");

auto& levelZeroTensors = isInput ? get_level_zero_input(index) : _levelZeroOutputTensors.at(index);
levelZeroTensors = tensor;

if (_pipelineIsCreated) {
_logger.debug("ZeroInferRequest::infer_async - update command list");

auto data = extract_object(tensor->get_properties(), ov::intel_npu::mem_handle);
OPENVINO_ASSERT(data, "Empty buffer");

OV_ITT_TASK_NEXT(ZERO_SET_REMOTE_TENSOR, "updateCommandList");
_pipeline->updateCommandList(
isInput ? _graph->get_input_descriptors().at(index).idx : _graph->get_output_descriptors().at(index).idx,
@@ -421,6 +425,11 @@ ov::SoPtr<ov::ITensor> ZeroInferRequest::get_tensor(const ov::Output<const ov::N
auto& userTensors = isInput ? get_user_input(ioIndex) : _userOutputTensors.at(ioIndex);

if (userTensors) {
auto zeroTensor = std::dynamic_pointer_cast<ZeroTensor>(userTensors._ptr);
if (zeroTensor != nullptr) {
zeroTensor->set_tensor_shared_with_user();
}

_logger.debug("ZeroInferRequest::get_tensor - tensor allocated, get the tensor");
return userTensors;
}
@@ -437,7 +446,12 @@ ov::SoPtr<ov::ITensor> ZeroInferRequest::get_tensor(const ov::Output<const ov::N
isInput ? *_inputAllocator : *_outputAllocator,
_graph->get_batch_size());

return levelZeroTensors;
auto zeroTensor = std::dynamic_pointer_cast<ZeroTensor>(levelZeroTensors);
if (zeroTensor != nullptr) {
zeroTensor->set_tensor_shared_with_user();
}

return userTensors;
}

void ZeroInferRequest::infer() {
9 changes: 8 additions & 1 deletion src/plugins/intel_npu/src/backend/src/zero_tensor.cpp
@@ -1,4 +1,4 @@
// Copyright (C) 2018-2024 Intel Corporation
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

@@ -145,6 +145,13 @@ void ZeroTensor::reset_memory_flag() {
_reset_tensor_memory = false;
}

bool ZeroTensor::tensor_was_shared_with_user() {
return _tensor_shared_with_user;
}
void ZeroTensor::set_tensor_shared_with_user() {
_tensor_shared_with_user = true;
}

ZeroTensor::~ZeroTensor() {
destroy_memory();
}
@@ -19,6 +19,12 @@ INSTANTIATE_TEST_SUITE_P(compatibility_smoke_BehaviorTest,
::testing::ValuesIn(configsInferRequestRunTests)),
InferRequestRunTests::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest,
RandomTensorOverZeroTensorRunTests,
::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU),
::testing::ValuesIn(configsInferRequestRunTests)),
InferRequestRunTests::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest,
RunSeqTests,
::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU),
109 changes: 109 additions & 0 deletions src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp
@@ -344,6 +344,115 @@ TEST_P(InferRequestRunTests, RecreateL0TensorIfNeeded) {
}
}

using RandomTensorOverZeroTensorRunTests = InferRequestRunTests;

TEST_P(RandomTensorOverZeroTensorRunTests, SetRandomTensorOverZeroTensor0) {
// Skip test according to plugin specific disabledTestPatterns() (if any)
SKIP_IF_CURRENT_TEST_IS_DISABLED()

auto shape = Shape{1, 2, 2, 2};
auto shape_size = ov::shape_size(shape);
auto model = createModel(element::f32, shape, "N...");

compiled_model = core->compile_model(model, target_device, configuration);
ov::InferRequest inference_request;
inference_request = compiled_model.create_infer_request();

auto input_zero_tensor = inference_request.get_input_tensor(0);
auto* input_zero_data = input_zero_tensor.data<float>();
for (size_t i = 0; i < shape_size; ++i) {
input_zero_data[i] = 5.f;
}

inference_request.infer(); // Adds '1' to each element

auto output_tensor = inference_request.get_output_tensor(0);
auto* output_data = output_tensor.data<float>();
for (size_t i = 0; i < shape_size; ++i) {
EXPECT_NEAR(output_data[i], 6.f, 1e-5) << "Expected=6, actual=" << output_data[i] << " for index " << i;
}

float* buffer = new float[shape_size];
ov::Tensor tensor{element::f32, shape, buffer};
auto* input_data = tensor.data<float>();
for (size_t i = 0; i < shape_size; ++i) {
input_data[i] = 9.f;
}

inference_request.set_input_tensor(tensor);
inference_request.infer(); // Adds '1' to each element
for (size_t i = 0; i < shape_size; ++i) {
EXPECT_NEAR(output_data[i], 10.f, 1e-5) << "Expected=10, actual=" << output_data[i] << " for index " << i;
}

for (size_t i = 0; i < shape_size; ++i) {
EXPECT_NEAR(input_zero_data[i], 5.f, 1e-5) << "Expected=5, actual=" << input_zero_data[i] << " for index " << i;
}

delete[] buffer;
}

TEST_P(RandomTensorOverZeroTensorRunTests, SetRandomTensorOverZeroTensor1) {
// Skip test according to plugin specific disabledTestPatterns() (if any)
SKIP_IF_CURRENT_TEST_IS_DISABLED()

auto shape = Shape{1, 2, 2, 2};
auto shape_size = ov::shape_size(shape);
auto model = createModel(element::f32, shape, "N...");

compiled_model = core->compile_model(model, target_device, configuration);
ov::InferRequest inference_request0, inference_request1;
inference_request0 = compiled_model.create_infer_request();
inference_request1 = compiled_model.create_infer_request();

auto input_zero_tensor = inference_request0.get_input_tensor(0);
auto* input_zero_data = input_zero_tensor.data<float>();
for (size_t i = 0; i < shape_size; ++i) {
input_zero_data[i] = 5.f;
}

inference_request0.infer(); // Adds '1' to each element

auto output_tensor0 = inference_request0.get_output_tensor(0);
auto* output_data0 = output_tensor0.data<float>();
for (size_t i = 0; i < shape_size; ++i) {
EXPECT_NEAR(output_data0[i], 6.f, 1e-5) << "Expected=6, actual=" << output_data0[i] << " for index " << i;
}

inference_request1.set_input_tensor(output_tensor0);
inference_request1.infer(); // Adds '1' to each element

auto output_tensor1 = inference_request1.get_output_tensor(0);
auto* output_data1 = output_tensor1.data<float>();
for (size_t i = 0; i < shape_size; ++i) {
EXPECT_NEAR(output_data1[i], 7.f, 1e-5) << "Expected=7, actual=" << output_data1[i] << " for index " << i;
}

float* buffer = new float[shape_size];
ov::Tensor tensor{element::f32, shape, buffer};
auto* input_data = tensor.data<float>();
for (size_t i = 0; i < shape_size; ++i) {
input_data[i] = 9.f;
}

inference_request1.set_input_tensor(tensor);
inference_request1.infer(); // Adds '1' to each element

for (size_t i = 0; i < shape_size; ++i) {
EXPECT_NEAR(output_data1[i], 10.f, 1e-5) << "Expected=10, actual=" << output_data1[i] << " for index " << i;
}

for (size_t i = 0; i < shape_size; ++i) {
EXPECT_NEAR(output_data0[i], 6.f, 1e-5) << "Expected=6, actual=" << output_data0[i] << " for index " << i;
}

for (size_t i = 0; i < shape_size; ++i) {
EXPECT_NEAR(input_zero_data[i], 5.f, 1e-5) << "Expected=5, actual=" << input_zero_data[i] << " for index " << i;
}

delete[] buffer;
}

using BatchingRunTests = InferRequestRunTests;

TEST_P(BatchingRunTests, CheckBatchingSupportInfer) {
