From a51a7de9584338938214ad3ea89cb1a097c81baf Mon Sep 17 00:00:00 2001 From: Andrii Pavliuk Date: Wed, 8 Nov 2023 17:47:09 +0200 Subject: [PATCH 01/14] [NVIDIA] Add operator==/!= to DevicePointer --- modules/nvidia_plugin/src/cuda/device_pointers.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/modules/nvidia_plugin/src/cuda/device_pointers.hpp b/modules/nvidia_plugin/src/cuda/device_pointers.hpp index d9452bea1..f39bab3e6 100644 --- a/modules/nvidia_plugin/src/cuda/device_pointers.hpp +++ b/modules/nvidia_plugin/src/cuda/device_pointers.hpp @@ -52,6 +52,16 @@ auto operator-(DevicePointer l, DevicePointer r) noexcept { return static_cast(l.get()) - static_cast(r); } +template +bool operator==(const DevicePointer& lhs, const DevicePointer& rhs) { + return lhs.get() == rhs.get(); +} + +template +bool operator!=(const DevicePointer& lhs, const DevicePointer& rhs) { + return lhs.get() != rhs.get(); +} + template class DeviceBuffer : private gsl::span { public: From 8be74b7d2b06824b61debba7648817d788486047 Mon Sep 17 00:00:00 2001 From: Andrii Pavliuk Date: Wed, 8 Nov 2023 17:55:44 +0200 Subject: [PATCH 02/14] [NVIDIA] Add CUDA::NodeParams, CUDA::TransferNode, CUDA::KernelNode --- modules/nvidia_plugin/src/cuda/graph.cpp | 83 ++++++++++++------- modules/nvidia_plugin/src/cuda/graph.hpp | 57 +++++++++++++ .../nvidia_plugin/src/cuda/node_params.hpp | 47 +++++++++++ 3 files changed, 159 insertions(+), 28 deletions(-) create mode 100644 modules/nvidia_plugin/src/cuda/node_params.hpp diff --git a/modules/nvidia_plugin/src/cuda/graph.cpp b/modules/nvidia_plugin/src/cuda/graph.cpp index 3bf743a1a..aa2a8542d 100644 --- a/modules/nvidia_plugin/src/cuda/graph.cpp +++ b/modules/nvidia_plugin/src/cuda/graph.cpp @@ -17,7 +17,7 @@ Graph::Graph(cudaGraph_t graph) : Handle { createFromNative, cudaGraphDestroy, graph } { } -cudaError_t Graph::createFromNative(cudaGraph_t *pGraph, const cudaGraph_t anotherGraph) { +cudaError_t Graph::createFromNative(cudaGraph_t* pGraph, const cudaGraph_t anotherGraph) { *pGraph = anotherGraph; return cudaSuccess; } @@ -28,52 +28,52 @@ cudaGraph_t Graph::createNativeWithFlags(unsigned int flags) { return g; } -bool operator==(const Graph &rhs, const Graph &lhs) { return rhs.get() == lhs.get(); } +bool operator==(const Graph& rhs, const Graph& lhs) { return rhs.get() == lhs.get(); } -GraphExec::GraphExec(const Graph &g) +GraphExec::GraphExec(const Graph& g) #if !defined(NDEBUG) || defined(_DEBUG) try #endif : Handle(cudaGraphInstantiate, cudaGraphExecDestroy, g.get(), - static_cast(nullptr), + static_cast(nullptr), #if !defined(NDEBUG) || defined(_DEBUG) errorMsg_, kErrorStringLen) #else - static_cast(nullptr), + static_cast(nullptr), static_cast(0ul)) #endif { } #if !defined(NDEBUG) || defined(_DEBUG) -catch (std::exception &e) { +catch (std::exception& e) { OPENVINO_THROW(e.what(), ": ", errorMsg_); } #endif #if defined(CUDA_VERSION) && CUDA_VERSION >= 12020 -cudaGraphExecUpdateResultInfo GraphExec::update(const Graph &g) const { +cudaGraphExecUpdateResultInfo GraphExec::update(const Graph& g) const { cudaGraphExecUpdateResultInfo res; throwIfError(cudaGraphExecUpdate(get(), g.get(), &res)); return res; } #else -cudaGraphExecUpdateResult GraphExec::update(const Graph &g) const { +cudaGraphExecUpdateResult GraphExec::update(const Graph& g) const { cudaGraphExecUpdateResult res; throwIfError(cudaGraphExecUpdate(get(), g.get(), nullptr, &res)); return res; } #endif -void GraphExec::launch(const Stream &stream) const { +void GraphExec::launch(const Stream& stream) 
const { throwIfError(cudaGraphLaunch(get(), stream.get())); } -bool operator==(const GraphExec &lhs, const GraphExec &rhs) { return rhs.get() == lhs.get(); } +bool operator==(const GraphExec& lhs, const GraphExec& rhs) { return rhs.get() == lhs.get(); } -GraphCapture::GraphCaptureScope::GraphCaptureScope(GraphCapture &graphCapture) : graphCapture_{graphCapture} { +GraphCapture::GraphCaptureScope::GraphCaptureScope(GraphCapture& graphCapture) : graphCapture_{graphCapture} { throwIfError(cudaStreamBeginCapture(graphCapture_.stream_.get(), cudaStreamCaptureModeThreadLocal)); } @@ -81,7 +81,7 @@ GraphCapture::GraphCaptureScope::~GraphCaptureScope() { graphCapture_.capturedError_ = cudaStreamEndCapture(graphCapture_.stream_.get(), &graphCapture_.cudaGraph_); } -GraphCapture::GraphCapture(const Stream &capturedStream) : +GraphCapture::GraphCapture(const Stream& capturedStream) : stream_ { capturedStream } { } @@ -100,12 +100,12 @@ const Graph& GraphCapture::getGraph() { return graph_.value(); } -CaptureInfo::CaptureInfo(const Stream &capturedStream) : stream_{capturedStream} { +CaptureInfo::CaptureInfo(const Stream& capturedStream) : stream_{capturedStream} { throwIfError(cudaStreamGetCaptureInfo_v2(capturedStream.get(), &captureStatus_, nullptr, &capturingGraph_, &deps_, &depCount_)); } -UploadNode CaptureInfo::addUploadNode(DevicePointer dst, const void *src, std::size_t size) { +UploadNode CaptureInfo::addUploadNode(DevicePointer dst, const void* src, std::size_t size) { cudaGraphNode_t newNode; throwIfError(cudaGraphAddMemcpyNode1D(&newNode, capturingGraph_, deps_, depCount_, dst.get(), src, size, cudaMemcpyHostToDevice)); @@ -113,7 +113,7 @@ UploadNode CaptureInfo::addUploadNode(DevicePointer dst, const void *src, return UploadNode{newNode, dst, src, size}; } -DownloadNode CaptureInfo::addDownloadNode(void *dst, DevicePointer src, +DownloadNode CaptureInfo::addDownloadNode(void* dst, DevicePointer src, std::size_t size) { cudaGraphNode_t newNode; throwIfError(cudaGraphAddMemcpyNode1D(&newNode, capturingGraph_, deps_, depCount_, @@ -122,7 +122,17 @@ DownloadNode CaptureInfo::addDownloadNode(void *dst, DevicePointer return DownloadNode{newNode, dst, src, size}; } -void UploadNode::update_src(const GraphExec& exec, const void *src) { +TransferNode CaptureInfo::addTransferNode(CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size) { + cudaGraphNode_t newNode; + throwIfError(cudaGraphAddMemcpyNode1D( + &newNode, capturingGraph_, deps_, depCount_, dst.get(), src.get(), size, cudaMemcpyDeviceToDevice)); + throwIfError(cudaStreamUpdateCaptureDependencies(stream_.get(), &newNode, 1, 1)); + return TransferNode{newNode, dst, src, size}; +} + +void UploadNode::update_src(const GraphExec& exec, const void* src) { if (src_ != src) { throwIfError(cudaGraphExecMemcpyNodeSetParams1D(exec.get(), node_, dst_.get(), src, size_, cudaMemcpyHostToDevice)); @@ -130,15 +140,14 @@ void UploadNode::update_src(const GraphExec& exec, const void *src) { } } -UploadNode::UploadNode(cudaGraphNode_t node, DevicePointer dst, const void *src, - std::size_t size) +UploadNode::UploadNode(cudaGraphNode_t node, DevicePointer dst, const void* src, std::size_t size) : node_{node}, dst_{dst}, src_{src}, size_{size} { } -void DownloadNode::update_dst(const GraphExec& exec, void *dst) { +void DownloadNode::update_dst(const GraphExec& exec, void* dst) { if (dst_ != dst) { throwIfError(cudaGraphExecMemcpyNodeSetParams1D(exec.get(), node_, dst, src_.get(), size_, cudaMemcpyDeviceToHost)); @@ -146,20 +155,38 @@ void 
DownloadNode::update_dst(const GraphExec& exec, void *dst) { } } -DownloadNode::DownloadNode(cudaGraphNode_t node, void *dst, DevicePointer src, - std::size_t size) - : node_{node}, - dst_{dst}, - src_{src}, - size_{size} { +DownloadNode::DownloadNode(cudaGraphNode_t node, void* dst, DevicePointer src, std::size_t size) + : node_{node}, dst_{dst}, src_{src}, size_{size} {} + +void CUDA::TransferNode::update_ptrs(const GraphExec& exec, + CUDA::DevicePointer dst, + CUDA::DevicePointer src) { + if (dst_ != dst && src_ != src) { + dst_ = dst; + src_ = src; + throwIfError(cudaGraphExecMemcpyNodeSetParams1D( + exec.get(), node_, dst_.get(), src_.get(), size_, cudaMemcpyDeviceToDevice)); + } } -bool UploadNode::operator ==(const UploadNode &rhs) const { +CUDA::TransferNode::TransferNode(cudaGraphNode_t node, + CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size) + : node_{node}, dst_{dst}, src_{src}, size_{size} {} + +CUDA::KernelNode::KernelNode(cudaGraphNode_t node, CUDA::NodeParams&& params) : node_{node}, node_params_{params} {} + +bool UploadNode::operator==(const UploadNode& rhs) const { return size_ == rhs.size_ && src_ == rhs.src_ && dst_.get() == rhs.dst_.get() && node_ == rhs.node_; } -bool DownloadNode::operator ==(const DownloadNode &rhs) const { +bool DownloadNode::operator==(const DownloadNode& rhs) const { return size_ == rhs.size_ && src_.get() == rhs.src_.get() && dst_ == rhs.dst_ && node_ == rhs.node_; } -} // namespace CUDA +bool CUDA::TransferNode::operator==(const TransferNode& rhs) const { + return size_ == rhs.size_ && src_.get() == rhs.src_.get() && dst_.get() == rhs.dst_.get() && node_ == rhs.node_; +} + +} // namespace CUDA diff --git a/modules/nvidia_plugin/src/cuda/graph.hpp b/modules/nvidia_plugin/src/cuda/graph.hpp index b014e2131..4cfeeaa04 100644 --- a/modules/nvidia_plugin/src/cuda/graph.hpp +++ b/modules/nvidia_plugin/src/cuda/graph.hpp @@ -6,6 +6,7 @@ #include +#include #include "runtime.hpp" namespace CUDA { @@ -92,6 +93,7 @@ class UploadNode { private: UploadNode(cudaGraphNode_t node, CUDA::DevicePointer dst, const void* src, std::size_t size); + cudaGraphNode_t node_; CUDA::DevicePointer dst_; const void* src_; @@ -107,17 +109,62 @@ class DownloadNode { private: DownloadNode(cudaGraphNode_t node, void* dst, CUDA::DevicePointer src, std::size_t size); + cudaGraphNode_t node_; void* dst_; CUDA::DevicePointer src_; std::size_t size_; }; +class TransferNode { + friend CaptureInfo; + +public: + void update_ptrs(const GraphExec& exec, CUDA::DevicePointer dst, CUDA::DevicePointer src); + bool operator==(const TransferNode& rhs) const; + +private: + TransferNode(cudaGraphNode_t node, + CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size); + + cudaGraphNode_t node_; + CUDA::DevicePointer dst_; + CUDA::DevicePointer src_; + std::size_t size_; +}; + +bool operator==(const cudaKernelNodeParams& lhs, const cudaKernelNodeParams& rhs); + +class KernelNode { + friend CaptureInfo; + +public: + template + void update_params(const GraphExec& exec, Args&&... 
args) { + node_params_.reset_args(); + node_params_.add_args(std::forward(args)...); + throwIfError(cudaGraphExecKernelNodeSetParams(exec.get(), node_, &node_params_.get_knp())); + } + +private: + KernelNode(cudaGraphNode_t node, CUDA::NodeParams&& params); + + cudaGraphNode_t node_; + CUDA::NodeParams node_params_; +}; + class CaptureInfo { public: CaptureInfo(const Stream& capturedStream); UploadNode addUploadNode(CUDA::DevicePointer dst, const void* src, std::size_t size); DownloadNode addDownloadNode(void* dst, CUDA::DevicePointer src, std::size_t size); + TransferNode addTransferNode(CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size); + template + KernelNode addKernelNode(void* kernel, dim3 gridDim, dim3 blockDim, Args&&... args); private: const Stream& stream_; @@ -127,4 +174,14 @@ class CaptureInfo { size_t depCount_; }; +template +KernelNode CaptureInfo::addKernelNode(void* kernel, dim3 gridDim, dim3 blockDim, Args&&... args) { + cudaGraphNode_t newNode; + CUDA::NodeParams params{kernel, gridDim, blockDim}; + params.add_args(std::forward(args)...); + throwIfError(cudaGraphAddKernelNode(&newNode, capturingGraph_, deps_, depCount_, ¶ms.get_knp())); + throwIfError(cudaStreamUpdateCaptureDependencies(stream_.get(), &newNode, 1, 1)); + return KernelNode{newNode, std::move(params)}; +} + } // namespace CUDA diff --git a/modules/nvidia_plugin/src/cuda/node_params.hpp b/modules/nvidia_plugin/src/cuda/node_params.hpp new file mode 100644 index 000000000..c89e41855 --- /dev/null +++ b/modules/nvidia_plugin/src/cuda/node_params.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2020-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include + +namespace CUDA { + +struct NodeParams { + NodeParams(void* kernel, dim3 gridDim, dim3 blockDim) { + knp_.func = kernel; + knp_.gridDim = gridDim; + knp_.blockDim = blockDim; + knp_.sharedMemBytes = 0; + knp_.kernelParams = nullptr; + knp_.extra = nullptr; + ptrs_.reserve(20); + } + + template + void add_args(const T& value) { + ptrs_.emplace_back(const_cast(&value)); + } + + template + void add_args(const T& arg, Args&&... 
args) { + add_args(std::forward(arg)); + add_args(std::forward(args)...); + }; + + const cudaKernelNodeParams& get_knp() { + knp_.kernelParams = ptrs_.data(); + return knp_; + } + + void reset_args() { ptrs_.clear(); } + +private: + std::vector ptrs_; + cudaKernelNodeParams knp_; +}; + +} // namespace CUDA From bdcee9c8ef90ba61b9dd9b693879ec985e25fdf8 Mon Sep 17 00:00:00 2001 From: Andrii Pavliuk Date: Mon, 13 Nov 2023 16:16:17 +0200 Subject: [PATCH 03/14] [NVIDIA] Add kernel args getters for Insert/Slice --- modules/nvidia_plugin/src/kernels/insert.cu | 40 +++++++++++++++- modules/nvidia_plugin/src/kernels/insert.hpp | 8 +++- modules/nvidia_plugin/src/kernels/slice.cu | 48 +++++++++++++++++--- modules/nvidia_plugin/src/kernels/slice.hpp | 8 +++- 4 files changed, 94 insertions(+), 10 deletions(-) diff --git a/modules/nvidia_plugin/src/kernels/insert.cu b/modules/nvidia_plugin/src/kernels/insert.cu index a521d60bc..da96e0a41 100644 --- a/modules/nvidia_plugin/src/kernels/insert.cu +++ b/modules/nvidia_plugin/src/kernels/insert.cu @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -70,11 +70,47 @@ void Insert::operator()(const cudaStream_t stream, const void* src, void* dst, c case Type_t::u64: return call(stream, src, dst, start); default: - throw_ov_exception(fmt::format("Input element type = {} is not supported by Split operation !!", + throw_ov_exception(fmt::format("Input element type = {} is not supported by Insert operation !!", static_cast(element_type_))); } } +void* Insert::getKernel() const { + switch (element_type_) { + case Type_t::boolean: + return reinterpret_cast(&insert_part); +#ifdef CUDA_HAS_BF16_TYPE + case Type_t::bf16: + return reinterpret_cast(&insert_part<__nv_bfloat16>); +#endif + case Type_t::f16: + return reinterpret_cast(&insert_part<__half>); + case Type_t::f32: + return reinterpret_cast(&insert_part); + case Type_t::f64: + return reinterpret_cast(&insert_part); + case Type_t::i8: + return reinterpret_cast(&insert_part); + case Type_t::i16: + return reinterpret_cast(&insert_part); + case Type_t::i32: + return reinterpret_cast(&insert_part); + case Type_t::i64: + return reinterpret_cast(&insert_part); + case Type_t::u8: + return reinterpret_cast(&insert_part); + case Type_t::u16: + return reinterpret_cast(&insert_part); + case Type_t::u32: + return reinterpret_cast(&insert_part); + case Type_t::u64: + return reinterpret_cast(&insert_part); + default: + throw_ov_exception(fmt::format("Input element type = {} is not supported by Insert operation !!", + static_cast(element_type_))); + } +} + template void Insert::call(const cudaStream_t stream, const void* src, void* dst, const size_t start) const { assertThrow(props_ptr_, "props_ptr_ == nullptr"); diff --git a/modules/nvidia_plugin/src/kernels/insert.hpp b/modules/nvidia_plugin/src/kernels/insert.hpp index 111edf8db..c9e09b130 100644 --- a/modules/nvidia_plugin/src/kernels/insert.hpp +++ b/modules/nvidia_plugin/src/kernels/insert.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -31,6 +31,12 @@ class Insert { size_t getImmutableWorkbufferSize() const; void setImmutableWorkbuffer(void* immutableBuffer); + void* getKernel() const; + size_t getSize() const { return size_; } + size_t getNumBlocks() const { return num_blocks_; } + size_t getThreadsPerBlock() const { return threads_per_block_; } + const Props* 
getPropsPtr() const { return static_cast(props_ptr_); }
+
 private:
    template 
    void call(const cudaStream_t stream, const void* src, void* dst, const size_t start) const;
diff --git a/modules/nvidia_plugin/src/kernels/slice.cu b/modules/nvidia_plugin/src/kernels/slice.cu
index 8edb06d62..f358a8db6 100644
--- a/modules/nvidia_plugin/src/kernels/slice.cu
+++ b/modules/nvidia_plugin/src/kernels/slice.cu
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2021 Intel Corporation
+// Copyright (C) 2018-2023 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -16,7 +16,7 @@ namespace nvidia_gpu {
 namespace kernel {
 template 
-static __global__ void slice_part(const Slice::Props *props, const size_t start, const size_t size, const T *x, T *y) {
+static __global__ void slice_part(const Slice::Props* props, const size_t start, const size_t size, const T* x, T* y) {
     const unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < size) {
         const size_t old_rank = rank(props->old_shape);
@@ -32,13 +32,13 @@ static __global__ void slice_part(const Slice::Props *props, const size_t start,
     }
 }
-Slice::Slice(const Type_t element_type, const Props &props, const size_t max_threads_per_block)
+Slice::Slice(const Type_t element_type, const Props& props, const size_t max_threads_per_block)
     : element_type_{element_type}, props_{props}, size_{shape_size(props.new_shape)} {
     TypeValidator::check(element_type_);
     std::tie(num_blocks_, threads_per_block_) = calculateElementwiseGrid(size_, max_threads_per_block);
 }
-void Slice::operator()(cudaStream_t stream, const void *src, void *dst, const size_t start) const {
+void Slice::operator()(cudaStream_t stream, const void* src, void* dst, const size_t start) const {
     switch (element_type_) {
         case Type_t::boolean:
             return call(stream, src, dst, start);
@@ -76,11 +76,47 @@ void Slice::operator()(cudaStream_t stream, const void *src, void *dst, const si
     }
 }
+void* Slice::getKernel() const {
+    switch (element_type_) {
+        case Type_t::boolean:
+            return reinterpret_cast(&slice_part);
+#ifdef CUDA_HAS_BF16_TYPE
+        case Type_t::bf16:
+            return reinterpret_cast(&slice_part<__nv_bfloat16>);
+#endif
+        case Type_t::f16:
+            return reinterpret_cast(&slice_part<__half>);
+        case Type_t::f32:
+            return reinterpret_cast(&slice_part);
+        case Type_t::f64:
+            return reinterpret_cast(&slice_part);
+        case Type_t::i8:
+            return reinterpret_cast(&slice_part);
+        case Type_t::i16:
+            return reinterpret_cast(&slice_part);
+        case Type_t::i32:
+            return reinterpret_cast(&slice_part);
+        case Type_t::i64:
+            return reinterpret_cast(&slice_part);
+        case Type_t::u8:
+            return reinterpret_cast(&slice_part);
+        case Type_t::u16:
+            return reinterpret_cast(&slice_part);
+        case Type_t::u32:
+            return reinterpret_cast(&slice_part);
+        case Type_t::u64:
+            return reinterpret_cast(&slice_part);
+        default:
+            throw_ov_exception(fmt::format("Input element type = {} is not supported by Slice operation !!",
+                                           static_cast(element_type_)));
+    }
+}
+
 template 
-void Slice::call(cudaStream_t stream, const void *src, void *dst, size_t start) const {
+void Slice::call(cudaStream_t stream, const void* src, void* dst, size_t start) const {
     assertThrow(props_ptr_, "props_ptr_ == nullptr");
     slice_part<<>>(
-        static_cast(props_ptr_), start, size_, static_cast(src), static_cast(dst));
+        static_cast(props_ptr_), start, size_, static_cast(src), static_cast(dst));
 }
 } // namespace kernel
diff --git a/modules/nvidia_plugin/src/kernels/slice.hpp b/modules/nvidia_plugin/src/kernels/slice.hpp
index 64c58434c..e54dd4686 100644
--- 
a/modules/nvidia_plugin/src/kernels/slice.hpp +++ b/modules/nvidia_plugin/src/kernels/slice.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -29,6 +29,12 @@ class Slice { size_t getImmutableWorkbufferSize() const; void setImmutableWorkbuffer(void* immutableBuffer); + void* getKernel() const; + size_t getSize() const { return size_; } + size_t getNumBlocks() const { return num_blocks_; } + size_t getThreadsPerBlock() const { return threads_per_block_; } + const Props* getPropsPtr() const { return static_cast(props_ptr_); } + private: template void call(cudaStream_t stream, const void* src, void* dst, size_t start) const; From 5145804d225bb0f5d996aa3ea104b8f8910f9479 Mon Sep 17 00:00:00 2001 From: Andrii Pavliuk Date: Wed, 8 Nov 2023 22:15:32 +0200 Subject: [PATCH 04/14] [NVIDIA] Add KernelNodeTest and TransferNodeTest --- .../tests/unit/cuda_graph_nodes_test.cpp | 139 ++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 modules/nvidia_plugin/tests/unit/cuda_graph_nodes_test.cpp diff --git a/modules/nvidia_plugin/tests/unit/cuda_graph_nodes_test.cpp b/modules/nvidia_plugin/tests/unit/cuda_graph_nodes_test.cpp new file mode 100644 index 000000000..9b990faf3 --- /dev/null +++ b/modules/nvidia_plugin/tests/unit/cuda_graph_nodes_test.cpp @@ -0,0 +1,139 @@ +// Copyright (C) 2020-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include + +using namespace ov::nvidia_gpu; + +class KernelNodeTest : public testing::Test {}; + +TEST_F(KernelNodeTest, InsertKernel) { + constexpr size_t in_size = 2 * 1 * 4; + constexpr size_t out_size = 2 * 3 * 4; + kernel::Type_t element_type = kernel::Type_t::i32; + kernel::Insert::Props props; + props.old_shape[0] = 2; + props.old_shape[1] = 1; + props.old_shape[2] = 4; + props.new_shape[0] = 2; + props.new_shape[1] = 3; + props.new_shape[2] = 4; + props.axe = 1; + const size_t start = 1; + auto insert = kernel::Insert(element_type, props, CUDA::Device{}.props().maxThreadsPerBlock); + + CUDA::Stream stream{}; + auto iwb = stream.malloc(insert.getImmutableWorkbufferSize()); + insert.setImmutableWorkbuffer(iwb.get()); + + // Regular kernel + graph with KernelNode + const int32_t in_arr1[2][1][4] = {{{1, 42, 38, 17}}, {{1, 2, 18, 17}}}; + auto src1 = stream.malloc(sizeof(int32_t) * in_size); + auto dst1 = stream.malloc(sizeof(int32_t) * out_size); + auto host_out_arr1 = std::make_unique(out_size); + + stream.upload(src1, in_arr1, sizeof(int32_t) * in_size); + insert(stream.get(), src1.get(), dst1.get(), start); + stream.download(host_out_arr1.get(), dst1, sizeof(int32_t) * out_size); + + auto dst1_graph = stream.malloc(sizeof(int32_t) * out_size); + + std::optional kernel_node; + CUDA::GraphCapture capture{stream}; + { + auto scope = capture.getScope(); + CUDA::CaptureInfo captureInfo{stream}; + kernel_node.emplace(captureInfo.addKernelNode(insert.getKernel(), + insert.getNumBlocks(), + insert.getThreadsPerBlock(), + insert.getPropsPtr(), + start, + insert.getSize(), + src1.get(), + dst1_graph.get())); + } + CUDA::GraphExec graph_exec{capture.getGraph()}; + graph_exec.launch(stream); + + auto host_out_arr1_graph = std::make_unique(out_size); + stream.download(host_out_arr1_graph.get(), dst1_graph, sizeof(int32_t) * out_size); + stream.synchronize(); + + ASSERT_TRUE(std::equal(host_out_arr1.get(), host_out_arr1.get() + out_size, host_out_arr1_graph.get())); + + // Regular kernel + updated graph 
with KernelNode + const int32_t in_arr2[2][1][4] = {{{31, 2, 8, 10}}, {{20, 12, 1, 7}}}; + + auto src2 = stream.malloc(sizeof(int32_t) * in_size); + auto dst2 = stream.malloc(sizeof(int32_t) * out_size); + auto host_out_arr2 = std::make_unique(out_size); + + stream.upload(src2, in_arr2, sizeof(int32_t) * in_size); + insert(stream.get(), src2.get(), dst2.get(), start); + stream.download(host_out_arr2.get(), dst2, sizeof(int32_t) * out_size); + + auto dst2_graph = stream.malloc(sizeof(int32_t) * out_size); + auto host_out_arr2_graph = std::make_unique(out_size); + + kernel_node.value().update_params( + graph_exec, insert.getPropsPtr(), start, insert.getSize(), src2.get(), dst2_graph.get()); + graph_exec.launch(stream); + stream.download(host_out_arr2_graph.get(), dst2_graph, sizeof(int32_t) * out_size); + stream.synchronize(); + + ASSERT_TRUE(std::equal(host_out_arr2.get(), host_out_arr2.get() + out_size, host_out_arr2_graph.get())); +} + +class TransferNodeTest : public testing::Test {}; + +TEST_F(TransferNodeTest, Transfer) { + constexpr size_t size = 2 * 1 * 4; + const int32_t host_arr1[2][1][4] = {{{1, 42, 38, 17}}, {{1, 2, 18, 17}}}; + CUDA::Stream stream{}; + + // Transfer with graph and TransferNode + auto src1 = stream.malloc(sizeof(int32_t) * size); + auto dst1 = stream.malloc(sizeof(int32_t) * size); + const auto host_out_arr1 = std::make_unique(size); + + stream.upload(src1, host_arr1, sizeof(int32_t) * size); + + std::optional transfer_node; + CUDA::GraphCapture capture{stream}; + { + auto scope = capture.getScope(); + CUDA::CaptureInfo captureInfo{stream}; + transfer_node.emplace(captureInfo.addTransferNode(dst1, src1, sizeof(int32_t) * size)); + } + CUDA::GraphExec graph_exec{capture.getGraph()}; + graph_exec.launch(stream); + + stream.download(host_out_arr1.get(), dst1, sizeof(int32_t) * size); + stream.synchronize(); + + const auto* src_ptr1 = static_cast(static_cast(host_arr1)); + ASSERT_TRUE(std::equal(src_ptr1, src_ptr1 + size, host_out_arr1.get())); + + // Transfer with graph and updated TransferNode + const int32_t host_arr2[2][1][4] = {{{31, 2, 8, 10}}, {{20, 12, 1, 7}}}; + + auto src2 = stream.malloc(sizeof(int32_t) * size); + auto dst2 = stream.malloc(sizeof(int32_t) * size); + auto host_out_arr2 = std::make_unique(size); + + stream.upload(src2, host_arr2, sizeof(int32_t) * size); + + transfer_node.value().update_ptrs(graph_exec, dst2, src2); + graph_exec.launch(stream); + + stream.download(host_out_arr2.get(), dst2, sizeof(int32_t) * size); + stream.synchronize(); + + const auto* src_ptr2 = static_cast(static_cast(host_arr2)); + ASSERT_TRUE(std::equal(src_ptr2, src_ptr2 + size, host_out_arr2.get())); +} From 6d5aed414b62966e1613805cd3bde26309d6f9c8 Mon Sep 17 00:00:00 2001 From: Andrii Pavliuk Date: Mon, 20 Nov 2023 15:09:03 +0200 Subject: [PATCH 05/14] [NVIDIA] Fix review issues --- modules/nvidia_plugin/src/cuda/graph.cpp | 2 +- modules/nvidia_plugin/src/cuda/node_params.hpp | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/modules/nvidia_plugin/src/cuda/graph.cpp b/modules/nvidia_plugin/src/cuda/graph.cpp index aa2a8542d..3428c7130 100644 --- a/modules/nvidia_plugin/src/cuda/graph.cpp +++ b/modules/nvidia_plugin/src/cuda/graph.cpp @@ -161,7 +161,7 @@ DownloadNode::DownloadNode(cudaGraphNode_t node, void* dst, DevicePointer dst, CUDA::DevicePointer src) { - if (dst_ != dst && src_ != src) { + if (dst_ != dst || src_ != src) { dst_ = dst; src_ = src; throwIfError(cudaGraphExecMemcpyNodeSetParams1D( diff --git 
a/modules/nvidia_plugin/src/cuda/node_params.hpp b/modules/nvidia_plugin/src/cuda/node_params.hpp index c89e41855..2edc20139 100644 --- a/modules/nvidia_plugin/src/cuda/node_params.hpp +++ b/modules/nvidia_plugin/src/cuda/node_params.hpp @@ -11,13 +11,7 @@ namespace CUDA { struct NodeParams { - NodeParams(void* kernel, dim3 gridDim, dim3 blockDim) { - knp_.func = kernel; - knp_.gridDim = gridDim; - knp_.blockDim = blockDim; - knp_.sharedMemBytes = 0; - knp_.kernelParams = nullptr; - knp_.extra = nullptr; + NodeParams(void* kernel, dim3 gridDim, dim3 blockDim) : knp_{kernel, gridDim, blockDim, 0u, nullptr, nullptr} { ptrs_.reserve(20); } From 3e553b19c7396fbe976b056b49fd236e9bf5b00c Mon Sep 17 00:00:00 2001 From: Andrii Pavliuk Date: Mon, 13 Nov 2023 19:09:04 +0200 Subject: [PATCH 06/14] [NVIDIA] Add launchers to TI, refactor Execute() --- .../nvidia_plugin/src/ops/tensor_iterator.cpp | 178 +++++++++--------- .../nvidia_plugin/src/ops/tensor_iterator.hpp | 97 ++++++++-- 2 files changed, 170 insertions(+), 105 deletions(-) diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp index c92238aec..b99286cdd 100644 --- a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp +++ b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp @@ -133,6 +133,28 @@ TensorIteratorOp::TensorIteratorOp(const CreationContext& context, } updateExecSequence(); + + // Input mapping of ports + slices_.reserve(portmap_inputs_.size()); + for (const auto& it : portmap_inputs_) { + const auto& inputIdx = it.first; + const auto& paramIdx = inputs_parameters_map_.at(inputIdx); + slices_.emplace_back(*this, inputIdx, paramIdx); + } + + // Back-edge mapping + transfers_.reserve(results_parameters_map_.size()); + for (const auto& [resultIdx, paramIdx] : results_parameters_map_) { + transfers_.emplace_back(*this, resultIdx, paramIdx); + } + + // Output mapping of ports + inserts_.reserve(results_outputs_map_.size()); + for (const auto& [resultIdx, outputIdx] : results_outputs_map_) { + if (portmap_outputs_.count(outputIdx) > 0) { + inserts_.emplace_back(*this, resultIdx, outputIdx); + } + } } void TensorIteratorOp::Execute(const InferenceRequestContext& context, @@ -142,57 +164,49 @@ void TensorIteratorOp::Execute(const InferenceRequestContext& context, const auto& stream = context.getThreadContext().stream(); const auto& memoryManager = *memory_manager_; auto& mutableBuffer = workbuffers.mutable_buffers.at(0); - auto& cancellationToken = context.getCancellationToken(); auto& executionDelegator = context.getExecutionDelegator(); executionDelegator.set_stream(stream); // First iteration for (const auto inputIdx : invariant_inputs_) { const auto paramIdx = inputs_parameters_map_.at(inputIdx); - copyParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); + transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); } for (const auto& [inputIdx, paramIdx] : inputs_parameters_map_) { if (portmap_inputs_.count(inputIdx) == 0) { - copyParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); + transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); } } for (int64_t iter = 0; iter < num_iterations_; ++iter) { - // Input mapping of ports - for (auto& it : portmap_inputs_) { - const auto& inputIdx = it.first; - const auto& paramIdx = inputs_parameters_map_.at(inputIdx); - copyParam(stream, mutableBuffer, inputTensors, iter, inputIdx, paramIdx); + for (const auto& slice : slices_) { + slice(stream, inputTensors, mutableBuffer, 
iter); } // Inner loop executionDelegator.execute_sequence(this, memoryManager, mutableBuffer, context); // Back-edge mapping - for (auto& [resultIdx, paramIdx] : results_parameters_map_) { - copyBackEdge(stream, mutableBuffer, resultIdx, paramIdx); + for (const auto& transfer : transfers_) { + transfer(stream, mutableBuffer); } // Output mapping of ports - for (const auto& [resultIdx, outputIdx] : results_outputs_map_) { - if (portmap_outputs_.count(outputIdx) > 0) { - copyResult(stream, mutableBuffer, outputTensors, iter, resultIdx, outputIdx); - } + for (const auto& insert : inserts_) { + insert(stream, mutableBuffer, outputTensors, iter); } // Copy data to output if (iterations_results_map_.count(iter) > 0) { for (const auto& resultIdx : iterations_results_map_.at(iter)) { const auto& outputIdx = results_outputs_map_.at(resultIdx); - copyResult(stream, mutableBuffer, outputTensors, iter, resultIdx, outputIdx); + transferResult(stream, mutableBuffer, outputTensors, iter, resultIdx, outputIdx); } } } } -// TODO: Investigate problem with multi-graphs in some networks -// benchmark_app may hang in throughput mode bool TensorIteratorOp::IsCudaGraphCompatible() const { return false; } void TensorIteratorOp::Capture(InferenceRequestContext& context, @@ -202,6 +216,39 @@ void TensorIteratorOp::Capture(InferenceRequestContext& context, Execute(context, inputTensors, outputTensors, workbuffers); } +TensorIteratorOp::SliceLauncher::SliceLauncher(const TensorIteratorOp& ti, uint64_t inputIdx, uint64_t paramIdx) + : input_idx_{inputIdx}, + param_{*ti.params_[paramIdx]}, + memory_manager_{*ti.memory_manager_}, + slice_{ti.kernelmap_inputs_.at(inputIdx)} { + OPENVINO_ASSERT(ti.portmap_inputs_.count(inputIdx) != 0, "Node name: ", ti.GetName()); + const auto& portMap = ti.portmap_inputs_.at(input_idx_); + const auto& inputShape = ti.inputs_info_[input_idx_].shape_; + start_ = portMap.start < 0 ? inputShape[portMap.axis] + portMap.start : portMap.start; + stride_ = portMap.stride; +} + +TensorIteratorOp::TransferLauncher::TransferLauncher(const TensorIteratorOp& ti, uint64_t resultIdx, uint64_t paramIdx) + : param_{*ti.params_[paramIdx]}, result_{*ti.results_[resultIdx]}, memory_manager_{*ti.memory_manager_} { + param_size_ = ti.params_info_[paramIdx].size_; + const auto resultSize = ti.results_info_[resultIdx].size_; + OPENVINO_ASSERT(param_size_ == resultSize, "Node name: ", ti.GetName()); +} + +TensorIteratorOp::InsertLauncher::InsertLauncher(const TensorIteratorOp& ti, + const std::size_t resultIdx, + const std::size_t outputIdx) + : output_idx_{outputIdx}, + result_{*ti.results_[resultIdx]}, + memory_manager_{*ti.memory_manager_}, + insert_{ti.kernelmap_outputs_.at(outputIdx)} { + OPENVINO_ASSERT(ti.portmap_outputs_.count(outputIdx) != 0, "Node name: ", ti.GetName()); + const auto& portMap = ti.portmap_outputs_.at(output_idx_); + const auto& outputShape = ti.outputs_info_[output_idx_].shape_; + start_ = portMap.start < 0 ? 
outputShape[portMap.axis] + portMap.start : portMap.start; + stride_ = portMap.stride; +} + WorkbufferRequest TensorIteratorOp::GetWorkBufferRequest() const { std::vector immutable_sizes; immutable_sizes.reserve(kernelmap_inputs_.size() + kernelmap_outputs_.size()); @@ -227,87 +274,42 @@ void TensorIteratorOp::InitSharedImmutableWorkbuffers(const Buffers& buffers) { } } -void TensorIteratorOp::copyParam(const CUDA::Stream& stream, - const CUDA::DevicePointer mutableBuffer, - const IOperationExec::Inputs& inputTensors, - const std::int64_t iter, - const uint64_t inputIdx, - const uint64_t paramIdx) const { +void TensorIteratorOp::transferParam(const CUDA::Stream& stream, + const CUDA::DevicePointer mutableBuffer, + const IOperationExec::Inputs& inputTensors, + const std::int64_t iter, + const uint64_t inputIdx, + const uint64_t paramIdx) const { + OPENVINO_ASSERT(portmap_inputs_.count(inputIdx) == 0, "Node name: ", GetName()); auto& memoryManager = *memory_manager_; const std::size_t inputSize = inputs_info_[inputIdx].size_; const std::size_t paramSize = params_info_[paramIdx].size_; - if (portmap_inputs_.count(inputIdx) == 0) { - auto& input = inputTensors[inputIdx]; - const auto& param = params_[paramIdx]; - auto outputTensors = memoryManager.outputTensorPointers(*param, mutableBuffer); - OPENVINO_ASSERT(inputSize == paramSize, "Node name: ", GetName()); - stream.transfer(outputTensors[0], input, inputSize); - } else { - const auto& portMap = portmap_inputs_.at(inputIdx); - const auto& param = params_[paramIdx]; - auto outputTensors = memoryManager.outputTensorPointers(*param, mutableBuffer); - const auto inputShape = inputs_info_[inputIdx].shape_; - - const auto& slice = kernelmap_inputs_.at(inputIdx); - std::size_t start; - if (portMap.start < 0) { - start = inputShape[portMap.axis] + portMap.start; - } else { - start = portMap.start; - } - start += iter * portMap.stride; - auto input = inputTensors[inputIdx]; - slice(stream.get(), input.get(), outputTensors[0].get(), start); - } -} -void TensorIteratorOp::copyBackEdge(const CUDA::Stream& stream, - CUDA::DevicePointer mutableBuffer, - const uint64_t resultIdx, - const uint64_t paramIdx) const { - auto& memoryManager = *memory_manager_; - const auto& result = results_[resultIdx]; + auto& input = inputTensors[inputIdx]; const auto& param = params_[paramIdx]; - auto paramTensors = memoryManager.outputTensorPointers(*param, mutableBuffer); - auto resultTensors = memoryManager.inputTensorPointers(*result, mutableBuffer); - const std::size_t paramSize = params_info_[paramIdx].size_; - const std::size_t resultSize = results_info_[resultIdx].size_; - OPENVINO_ASSERT(paramSize == resultSize, "Node name: ", GetName()); - stream.transfer(paramTensors[0], resultTensors[0], paramSize); + auto outputTensors = memoryManager.outputTensorPointers(*param, mutableBuffer); + OPENVINO_ASSERT(inputSize == paramSize, "Node name: ", GetName()); + + stream.transfer(outputTensors[0], input, inputSize); } -void TensorIteratorOp::copyResult(const CUDA::Stream& stream, - CUDA::DevicePointer mutableBuffer, - const IOperationExec::Outputs& outputTensors, - const std::int64_t iter, - const std::size_t resultIdx, - const std::size_t outputIdx) const { +void TensorIteratorOp::transferResult(const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Outputs& outputTensors, + const std::int64_t iter, + const std::size_t resultIdx, + const std::size_t outputIdx) const { + OPENVINO_ASSERT(portmap_outputs_.count(outputIdx) == 0, "Node name: ", 
GetName()); auto& memoryManager = *memory_manager_; const auto resultSize = results_info_[resultIdx].size_; const std::size_t outputSize = outputs_info_[outputIdx].size_; - if (portmap_outputs_.count(outputIdx) == 0) { - const auto result = results_[resultIdx]; - auto inTensors = memoryManager.inputTensorPointers(*result, mutableBuffer); - const auto output = outputTensors[outputIdx]; - OPENVINO_ASSERT(resultSize == outputSize, "Node name: ", GetName()); - stream.transfer(output, inTensors[0], outputSize); - } else { - auto output = outputTensors[outputIdx]; - const auto& result = results_[resultIdx]; - auto inputTensors = memoryManager.inputTensorPointers(*result, mutableBuffer); - const auto portMap = portmap_outputs_.at(outputIdx); - const auto outputShape = outputs_info_[outputIdx].shape_; - - const auto& insert = kernelmap_outputs_.at(outputIdx); - std::size_t start; - if (portMap.start < 0) { - start = outputShape[portMap.axis] + portMap.start; - } else { - start = portMap.start; - } - start += iter * portMap.stride; - insert(stream.get(), inputTensors[0].get(), output.get(), start); - } + + const auto result = results_[resultIdx]; + auto inTensors = memoryManager.inputTensorPointers(*result, mutableBuffer); + const auto output = outputTensors[outputIdx]; + OPENVINO_ASSERT(resultSize == outputSize, "Node name: ", GetName()); + + stream.transfer(output, inTensors[0], outputSize); } void TensorIteratorOp::updateExecSequence() { diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp index d172f360f..bae84ca38 100644 --- a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp +++ b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -43,25 +43,84 @@ class TensorIteratorOp : public SubGraph { int64_t axis{0}; }; + class SliceLauncher { + public: + SliceLauncher(const TensorIteratorOp& ti, uint64_t inputIdx, uint64_t paramIdx); + + void operator()(const CUDA::Stream& stream, + const IOperationExec::Inputs& inputTensors, + CUDA::DevicePointer mutableBuffer, + int64_t iter) const { + const auto* src = inputTensors[input_idx_].get(); + auto* dst = memory_manager_.outputTensorPointers(param_, mutableBuffer)[0].get(); + slice_(stream.get(), src, dst, start_ + iter * stride_); + } + private: + uint64_t input_idx_; + const OperationBase& param_; + const MemoryManager& memory_manager_; + const kernel::Slice& slice_; + size_t start_; + int64_t stride_; + }; + class TransferLauncher { + public: + TransferLauncher(const TensorIteratorOp& ti, uint64_t resultIdx, uint64_t paramIdx); + + void operator()(const CUDA::Stream& stream, CUDA::DevicePointer mutableBuffer) const { + const auto& paramTensors = memory_manager_.outputTensorPointers(param_, mutableBuffer); + const auto& resultTensors = memory_manager_.inputTensorPointers(result_, mutableBuffer); + auto* dst = paramTensors[0].get(); + const auto* src = resultTensors[0].get(); + + throwIfError(cudaMemcpyAsync(dst, src, param_size_, cudaMemcpyDeviceToDevice, stream.get())); + } + + + private: + const OperationBase& param_; + const OperationBase& result_; + const MemoryManager& memory_manager_; + std::size_t param_size_; + }; + + class InsertLauncher { + public: + InsertLauncher(const TensorIteratorOp& ti, const std::size_t resultIdx, const std::size_t outputIdx); + + void operator()(const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + 
const IOperationExec::Outputs& outputTensors, + int64_t iter) const { + const auto* src = memory_manager_.inputTensorPointers(result_, mutableBuffer)[0].get(); + auto* dst = outputTensors[output_idx_].get(); + insert_(stream.get(), src, dst, start_ + iter * stride_); + } + private: + uint64_t output_idx_; + const OperationBase& result_; + const MemoryManager& memory_manager_; + size_t start_; + int64_t stride_; + const kernel::Insert& insert_; + }; + WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; - void copyParam(const CUDA::Stream& stream, - CUDA::DevicePointer mutableBuffer, - const IOperationExec::Inputs& inputTensors, - std::int64_t iter, - uint64_t inputIdx, - uint64_t paramIdx) const; - void copyBackEdge(const CUDA::Stream& stream, - CUDA::DevicePointer mutableBuffer, - uint64_t resultIdx, - uint64_t paramIdx) const; - void copyResult(const CUDA::Stream& stream, - CUDA::DevicePointer mutableBuffer, - const IOperationExec::Outputs& outputTensors, - int64_t iter, - std::size_t resultIdx, - std::size_t outputIdx) const; + void transferParam(const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Inputs& inputTensors, + std::int64_t iter, + uint64_t inputIdx, + uint64_t paramIdx) const; + + void transferResult(const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Outputs& outputTensors, + int64_t iter, + std::size_t resultIdx, + std::size_t outputIdx) const; void updateExecSequence(); @@ -78,6 +137,10 @@ class TensorIteratorOp : public SubGraph { std::unordered_map portmap_outputs_; std::unordered_map kernelmap_outputs_; std::unordered_map results_parameters_map_; + + mutable std::vector slices_; + mutable std::vector transfers_; + mutable std::vector inserts_; }; } // namespace nvidia_gpu From b5e7404f43f26de1cb2b871895e797c39deab6d2 Mon Sep 17 00:00:00 2001 From: Andrii Pavliuk Date: Mon, 13 Nov 2023 19:21:11 +0200 Subject: [PATCH 07/14] [NVIDIA] Add TiCudaGraphInfo --- .../nvidia_plugin/src/cuda_graph_context.cpp | 37 ++++++++++++++ .../nvidia_plugin/src/cuda_graph_context.hpp | 51 +++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/modules/nvidia_plugin/src/cuda_graph_context.cpp b/modules/nvidia_plugin/src/cuda_graph_context.cpp index e1f9e2487..cfa6fbc9f 100644 --- a/modules/nvidia_plugin/src/cuda_graph_context.cpp +++ b/modules/nvidia_plugin/src/cuda_graph_context.cpp @@ -7,6 +7,41 @@ namespace ov { namespace nvidia_gpu { +void TiCudaGraphInfo::add_transfer(const CUDA::Stream& stream, + CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size) { + CUDA::CaptureInfo captureInfo{stream}; + transferNodes_.emplace_back(captureInfo.addTransferNode(dst, src, size)); +} + +void TiCudaGraphInfo::set_params_graph(const CUDA::Graph& graph) { + paramsGraph_.emplace(graph); + paramsGraphExec_.emplace(graph); +} + +void TiCudaGraphInfo::set_body_graph(const CUDA::Graph& graph) { + bodyGraph_.emplace(graph); + bodyGraphExec_.emplace(graph); +} + +void TiCudaGraphInfo::set_results_graph(const CUDA::Graph& graph) { + resultsGraph_.emplace(graph); + resultsGraphExec_.emplace(graph); +} + +void TiCudaGraphInfo::launch_params_graph(const CUDA::Stream& stream) const { paramsGraphExec_.value().launch(stream); } + +void TiCudaGraphInfo::launch_body_graph(const CUDA::Stream& stream) const { bodyGraphExec_.value().launch(stream); } + +void TiCudaGraphInfo::launch_results_graph(const CUDA::Stream& stream) const { + 
resultsGraphExec_.value().launch(stream); +} + +std::size_t TiCudaGraphInfo::get_transfers_count() const { return transferNodes_.size(); } + +std::size_t TiCudaGraphInfo::get_kernels_count() const { return kernelNodes_.size(); } + void CudaGraphContext::reset() { graphs_.clear(); currentGraphIndex_ = 0; @@ -40,6 +75,8 @@ void CudaGraphContext::add_graph(const CUDA::Graph& graph) { graphs_[currentGraphIndex_].set_graph(graph); } +TiCudaGraphInfo& CudaGraphContext::get_ti_graph(const std::string& ti_op_name) const { return ti_graphs_[ti_op_name]; } + bool CudaGraphContext::is_initialized() const { const auto size = graphs_.size(); return size != 0 && graphs_[size - 1].is_initialized(); diff --git a/modules/nvidia_plugin/src/cuda_graph_context.hpp b/modules/nvidia_plugin/src/cuda_graph_context.hpp index c0ca01e18..c3c1545a2 100644 --- a/modules/nvidia_plugin/src/cuda_graph_context.hpp +++ b/modules/nvidia_plugin/src/cuda_graph_context.hpp @@ -11,6 +11,49 @@ namespace ov { namespace nvidia_gpu { +class TiCudaGraphInfo { +public: + void add_transfer(const CUDA::Stream& stream, + CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size); + + template + void add_kernel(const CUDA::Stream& stream, void* kernel, dim3 gridDim, dim3 blockDim, Args&&... args) { + CUDA::CaptureInfo captureInfo{stream}; + kernelNodes_.emplace_back(captureInfo.addKernelNode(kernel, gridDim, blockDim, std::forward(args)...)); + } + + void set_params_graph(const CUDA::Graph& graph); + void set_body_graph(const CUDA::Graph& graph); + void set_results_graph(const CUDA::Graph& graph); + + template + void update_kernel(std::size_t index, Args&&... args) { + kernelNodes_[index].update_params(bodyGraphExec_.value(), std::forward(args)...); + } + + void launch_params_graph(const CUDA::Stream& stream) const; + void launch_body_graph(const CUDA::Stream& stream) const; + void launch_results_graph(const CUDA::Stream& stream) const; + + std::size_t get_transfers_count() const; + std::size_t get_kernels_count() const; + +private: + std::optional paramsGraph_{}; + std::optional paramsGraphExec_{}; + + std::optional bodyGraph_{}; + std::optional bodyGraphExec_{}; + + std::optional resultsGraph_{}; + std::optional resultsGraphExec_{}; + + std::vector transferNodes_; + std::vector kernelNodes_; +}; + class CudaGraphContext { public: void reset(); @@ -31,6 +74,10 @@ class CudaGraphContext { void add_graph(const CUDA::Graph& graph); + void add_ti_graph(const std::string& ti_op_name, const CUDA::Graph& graph); + + TiCudaGraphInfo& get_ti_graph(const std::string& ti_op_name) const; + bool is_initialized() const; void update_capture(const TensorMappingContext& context); @@ -39,6 +86,7 @@ class CudaGraphContext { std::size_t get_params_count() const; std::size_t get_results_count() const; + std::size_t get_graphs_count() const; friend bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs); @@ -78,12 +126,15 @@ class CudaGraphContext { std::optional graphExec_{}; std::map parameterNodes_; std::map resultNodes_; + }; friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); +private: std::vector graphs_{}; + mutable std::unordered_map ti_graphs_; mutable std::size_t currentGraphIndex_ = 0; }; From 4ce98f4e82062d39921211abefde4d870daf2fc8 Mon Sep 17 00:00:00 2001 From: Andrii Pavliuk Date: Mon, 13 Nov 2023 20:16:48 +0200 Subject: [PATCH 08/14] [NVIDIA] Update TI to support CUDA graph as a body of iterations loop --- 
.../src/cuda_graph_topology_runner.cpp | 39 +++-- .../nvidia_plugin/src/ops/tensor_iterator.cpp | 136 +++++++++++++++++- .../nvidia_plugin/src/ops/tensor_iterator.hpp | 43 +++++- 3 files changed, 206 insertions(+), 12 deletions(-) diff --git a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp index 1e851ec41..7c6c757db 100644 --- a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp +++ b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp @@ -5,14 +5,26 @@ #include "cuda_graph_topology_runner.hpp" #include "cuda/event.hpp" +#include "ops/tensor_iterator.hpp" namespace ov { namespace nvidia_gpu { +namespace { + +std::shared_ptr getTI(const SubGraph& sg) { + auto& seq = sg.getExecSequence(); + if (seq.size() != 1) { + return nullptr; + } + return std::dynamic_pointer_cast(seq[0]); +} + +} // namespace + CudaGraphTopologyRunner::CudaGraphTopologyRunner(const CreationContext& context, const std::shared_ptr& model) - : orig_subgraph_{context, model}, - cuda_graphs_count_{0} { + : orig_subgraph_{context, model}, cuda_graphs_count_{0} { std::vector sequences; SubGraph::ExecSequence currentSequence; const auto& origSequence = orig_subgraph_.getExecSequence(); @@ -23,7 +35,7 @@ CudaGraphTopologyRunner::CudaGraphTopologyRunner(const CreationContext& context, currentSequence.push_back(origSequence[0]); for (size_t i = 1; i < totalSize; ++i) { const auto& op = origSequence[i]; - if (op->IsCudaGraphCompatible() != isLastOpCompatible) { + if (std::dynamic_pointer_cast(op) || op->IsCudaGraphCompatible() != isLastOpCompatible) { isLastOpCompatible = !isLastOpCompatible; sequences.emplace_back(std::move(currentSequence)); currentSequence.clear(); @@ -45,7 +57,14 @@ void CudaGraphTopologyRunner::Run(const InferenceRequestContext& context, const const auto& stream = context.getThreadContext().stream(); std::size_t graphIndex = 0; for (auto& subgraph : subgraphs_) { - if (subgraph.IsCudaGraphCompatible()) { + if (auto ti = getTI(subgraph)) { + CUDA::DevicePointer mutableBuffer{memoryBlock.view().data()}; + const auto& memoryManager = *subgraph.memoryManager(); + const auto& inputTensors = memoryManager.inputTensorPointers(*ti, mutableBuffer); + const auto& outputTensors = memoryManager.outputTensorPointers(*ti, mutableBuffer); + const auto& workBuffers = memoryManager.workBuffers(*ti, mutableBuffer); + ti->ExecuteGraph(context, inputTensors, outputTensors, workBuffers); + } else if (subgraph.IsCudaGraphCompatible()) { context.getCudaGraphContext().launch(graphIndex, stream); graphIndex++; } else { @@ -63,21 +82,23 @@ void CudaGraphTopologyRunner::Capture(InferenceRequestContext& context, graphContext.reset(); for (const auto& subgraph : subgraphs_) { - if (subgraph.IsCudaGraphCompatible()) { + Workbuffers workbuffers{}; + workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); + if (getTI(subgraph)) { + subgraph.Capture(context, {}, {}, workbuffers); + } else if (subgraph.IsCudaGraphCompatible()) { graphContext.start_next_graph_addition(); CUDA::GraphCapture capture{stream}; { auto scope = capture.getScope(); - Workbuffers workbuffers{}; - workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); subgraph.Capture(context, {}, {}, workbuffers); } const auto& graph = capture.getGraph(); graphContext.add_graph(graph); } } - OPENVINO_ASSERT(graphContext.get_graphs_count() == GetCudaGraphsCount(), - "CudaGraphTopologyRunner/CudaGraphContext graphs count mismatch"); + // 
OPENVINO_ASSERT(graphContext.get_graphs_count() == GetCudaGraphsCount(),
+    // "CudaGraphTopologyRunner/CudaGraphContext graphs count mismatch");
 }
 const SubGraph& CudaGraphTopologyRunner::GetSubGraph() const {
diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp
index b99286cdd..4da29334b 100644
--- a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp
+++ b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp
@@ -207,13 +207,101 @@ void TensorIteratorOp::Execute(const InferenceRequestContext& context,
     }
 }
-bool TensorIteratorOp::IsCudaGraphCompatible() const { return false; }
+void TensorIteratorOp::ExecuteGraph(const InferenceRequestContext& context,
+                                    Inputs inputTensors,
+                                    Outputs outputTensors,
+                                    const Workbuffers& workbuffers) {
+    const auto& stream = context.getThreadContext().stream();
+    const auto& memoryManager = *memory_manager_;
+    const auto& mutableBuffer = workbuffers.mutable_buffers.at(0);
+
+    auto& tiGraphInfo = context.getCudaGraphContext().get_ti_graph(GetName());
+
+    tiGraphInfo.launch_params_graph(stream);
+
+    OPENVINO_ASSERT(tiGraphInfo.get_kernels_count() == slices_.size() + inserts_.size(),
+                    "CudaGraphContext/TensorIteratorOp slices or inserts count inconsistency");
+
+    for (int64_t iter = 0; iter < num_iterations_; ++iter) {
+        for (std::size_t i = 0; i < slices_.size(); ++i) {
+            slices_[i].update_kernel_node(tiGraphInfo, i, mutableBuffer, inputTensors, iter);
+        }
+        for (std::size_t i = 0; i < inserts_.size(); ++i) {
+            inserts_[i].update_kernel_node(tiGraphInfo, i + slices_.size(), mutableBuffer, outputTensors, iter);
+        }
+        tiGraphInfo.launch_body_graph(stream);
+    }
+
+    tiGraphInfo.launch_results_graph(stream);
+}
+
+bool TensorIteratorOp::IsCudaGraphCompatible() const {
+    // This implementation is CUDA graph compatible only if this is the standard TI with output only of the last
+    // iteration (which is handled outside of the iterations loop)
+    if (iterations_results_map_.size() != 1 || iterations_results_map_.count(num_iterations_ - 1) == 0) {
+        return false;
+    }
+    return SubGraph::IsCudaGraphCompatible();
+}
 void TensorIteratorOp::Capture(InferenceRequestContext& context,
                                Inputs inputTensors,
                                Outputs outputTensors,
                                const Workbuffers& workbuffers) const {
-    Execute(context, inputTensors, outputTensors, workbuffers);
+    const auto& stream = context.getThreadContext().stream();
+    const auto& memoryManager = *memory_manager_;
+    auto& mutableBuffer = workbuffers.mutable_buffers.at(0);
+    auto& executionDelegator = context.getExecutionDelegator();
+    executionDelegator.set_stream(stream);
+
+    auto& tiGraphInfo = context.getCudaGraphContext().get_ti_graph(GetName());
+    CUDA::GraphCapture capture{stream};
+    {
+        auto scope = capture.getScope();
+        // First iteration
+        for (const auto inputIdx : invariant_inputs_) {
+            const auto paramIdx = inputs_parameters_map_.at(inputIdx);
+            transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx);
+        }
+        for (const auto& [inputIdx, paramIdx] : inputs_parameters_map_) {
+            if (portmap_inputs_.count(inputIdx) == 0) {
+                transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx);
+            }
+        }
+    }
+    tiGraphInfo.set_params_graph(capture.getGraph());
+    {
+        auto scope = capture.getScope();
+        // Input mapping of ports
+        for (auto& slice : slices_) {
+            slice.add_kernel_node(tiGraphInfo, stream, mutableBuffer, inputTensors);
+        }
+
+        // Inner loop
+        executionDelegator.capture_sequence(this, memoryManager, mutableBuffer, context);
+
+        // Back-edge mapping
+        for (auto& 
transfer : transfers_) { + transfer.add_transfer_node(tiGraphInfo, stream, mutableBuffer); + } + + // Output mapping of ports + for (auto& insert : inserts_) { + insert.add_kernel_node(tiGraphInfo, stream, mutableBuffer, outputTensors); + } + } + tiGraphInfo.set_body_graph(capture.getGraph()); + { + auto scope = capture.getScope(); + // Copy data to output + if (iterations_results_map_.count(num_iterations_ - 1) > 0) { + for (const auto& resultIdx : iterations_results_map_.at(num_iterations_ - 1)) { + const auto& outputIdx = results_outputs_map_.at(resultIdx); + transferResult(stream, mutableBuffer, outputTensors, num_iterations_ - 1, resultIdx, outputIdx); + } + } + } + tiGraphInfo.set_results_graph(capture.getGraph()); } TensorIteratorOp::SliceLauncher::SliceLauncher(const TensorIteratorOp& ti, uint64_t inputIdx, uint64_t paramIdx) @@ -228,6 +316,23 @@ TensorIteratorOp::SliceLauncher::SliceLauncher(const TensorIteratorOp& ti, uint6 stride_ = portMap.stride; } +void TensorIteratorOp::SliceLauncher::add_kernel_node(TiCudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Inputs& inputTensors) { + const auto* src = inputTensors[input_idx_].get(); + auto* dst = memory_manager_.outputTensorPointers(param_, mutableBuffer)[0].get(); + info.add_kernel(stream, + slice_.getKernel(), + slice_.getNumBlocks(), + slice_.getThreadsPerBlock(), + slice_.getPropsPtr(), + start_, + slice_.getSize(), + src, + dst); +} + TensorIteratorOp::TransferLauncher::TransferLauncher(const TensorIteratorOp& ti, uint64_t resultIdx, uint64_t paramIdx) : param_{*ti.params_[paramIdx]}, result_{*ti.results_[resultIdx]}, memory_manager_{*ti.memory_manager_} { param_size_ = ti.params_info_[paramIdx].size_; @@ -235,6 +340,16 @@ TensorIteratorOp::TransferLauncher::TransferLauncher(const TensorIteratorOp& ti, OPENVINO_ASSERT(param_size_ == resultSize, "Node name: ", ti.GetName()); } +void TensorIteratorOp::TransferLauncher::add_transfer_node(TiCudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer) { + const auto& paramTensors = memory_manager_.outputTensorPointers(param_, mutableBuffer); + auto dst = paramTensors[0]; + const auto& resultTensors = memory_manager_.inputTensorPointers(result_, mutableBuffer); + const auto src = resultTensors[0]; + info.add_transfer(stream, dst, src, param_size_); +} + TensorIteratorOp::InsertLauncher::InsertLauncher(const TensorIteratorOp& ti, const std::size_t resultIdx, const std::size_t outputIdx) @@ -249,6 +364,23 @@ TensorIteratorOp::InsertLauncher::InsertLauncher(const TensorIteratorOp& ti, stride_ = portMap.stride; } +void TensorIteratorOp::InsertLauncher::add_kernel_node(TiCudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Outputs& outputTensors) { + const auto* src = memory_manager_.inputTensorPointers(result_, mutableBuffer)[0].get(); + auto* dst = outputTensors[output_idx_].get(); + info.add_kernel(stream, + insert_.getKernel(), + insert_.getNumBlocks(), + insert_.getThreadsPerBlock(), + insert_.getPropsPtr(), + start_, + insert_.getSize(), + src, + dst); +} + WorkbufferRequest TensorIteratorOp::GetWorkBufferRequest() const { std::vector immutable_sizes; immutable_sizes.reserve(kernelmap_inputs_.size() + kernelmap_outputs_.size()); diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp index bae84ca38..f53ccfad1 100644 --- a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp +++ 
b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -27,6 +28,11 @@ class TensorIteratorOp : public SubGraph { Outputs outputTensors, const Workbuffers& workbuffers) const override; + void ExecuteGraph(const InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers); + bool IsCudaGraphCompatible() const override; void Capture(InferenceRequestContext& context, @@ -55,6 +61,22 @@ class TensorIteratorOp : public SubGraph { auto* dst = memory_manager_.outputTensorPointers(param_, mutableBuffer)[0].get(); slice_(stream.get(), src, dst, start_ + iter * stride_); } + + void add_kernel_node(TiCudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Inputs& inputTensors); + + void update_kernel_node(TiCudaGraphInfo& info, + std::size_t index, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Inputs& inputTensors, + int64_t iter) { + const auto* src = inputTensors[input_idx_].get(); + auto* dst = memory_manager_.outputTensorPointers(param_, mutableBuffer)[0].get(); + info.update_kernel(index, slice_.getPropsPtr(), start_ + iter * stride_, slice_.getSize(), src, dst); + } + private: uint64_t input_idx_; const OperationBase& param_; @@ -63,6 +85,7 @@ class TensorIteratorOp : public SubGraph { size_t start_; int64_t stride_; }; + class TransferLauncher { public: TransferLauncher(const TensorIteratorOp& ti, uint64_t resultIdx, uint64_t paramIdx); @@ -72,10 +95,12 @@ class TensorIteratorOp : public SubGraph { const auto& resultTensors = memory_manager_.inputTensorPointers(result_, mutableBuffer); auto* dst = paramTensors[0].get(); const auto* src = resultTensors[0].get(); - throwIfError(cudaMemcpyAsync(dst, src, param_size_, cudaMemcpyDeviceToDevice, stream.get())); } + void add_transfer_node(TiCudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer); private: const OperationBase& param_; @@ -96,6 +121,22 @@ class TensorIteratorOp : public SubGraph { auto* dst = outputTensors[output_idx_].get(); insert_(stream.get(), src, dst, start_ + iter * stride_); } + + void add_kernel_node(TiCudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Outputs& outputTensors); + + void update_kernel_node(TiCudaGraphInfo& info, + std::size_t index, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Outputs& outputTensors, + int64_t iter) { + const auto* src = memory_manager_.inputTensorPointers(result_, mutableBuffer)[0].get(); + auto* dst = outputTensors[output_idx_].get(); + info.update_kernel(index, insert_.getPropsPtr(), start_ + iter * stride_, insert_.getSize(), src, dst); + } + private: uint64_t output_idx_; const OperationBase& result_; From d7f3957ae0d8c91ae14995464a66b1553ede474d Mon Sep 17 00:00:00 2001 From: Andrii Pavliuk Date: Tue, 14 Nov 2023 23:12:42 +0200 Subject: [PATCH 09/14] [NVIDIA] Add operator== for dim3, KernelNode and NodeParams --- modules/nvidia_plugin/src/cuda/graph.cpp | 4 ++++ modules/nvidia_plugin/src/cuda/graph.hpp | 2 ++ modules/nvidia_plugin/src/cuda/node_params.hpp | 9 +++++++++ modules/nvidia_plugin/src/cuda/utils.hpp | 15 +++++++++++++++ 4 files changed, 30 insertions(+) create mode 100644 modules/nvidia_plugin/src/cuda/utils.hpp diff --git a/modules/nvidia_plugin/src/cuda/graph.cpp b/modules/nvidia_plugin/src/cuda/graph.cpp index 3428c7130..ceae4a8c2 100644 --- 
a/modules/nvidia_plugin/src/cuda/graph.cpp +++ b/modules/nvidia_plugin/src/cuda/graph.cpp @@ -189,4 +189,8 @@ bool CUDA::TransferNode::operator==(const TransferNode& rhs) const { return size_ == rhs.size_ && src_.get() == rhs.src_.get() && dst_.get() == rhs.dst_.get() && node_ == rhs.node_; } +bool KernelNode::operator==(const KernelNode& rhs) const { + return node_ == rhs.node_ && node_params_ == rhs.node_params_; +} + } // namespace CUDA diff --git a/modules/nvidia_plugin/src/cuda/graph.hpp b/modules/nvidia_plugin/src/cuda/graph.hpp index 4cfeeaa04..4360af27b 100644 --- a/modules/nvidia_plugin/src/cuda/graph.hpp +++ b/modules/nvidia_plugin/src/cuda/graph.hpp @@ -148,6 +148,8 @@ class KernelNode { throwIfError(cudaGraphExecKernelNodeSetParams(exec.get(), node_, &node_params_.get_knp())); } + bool operator==(const KernelNode& rhs) const; + private: KernelNode(cudaGraphNode_t node, CUDA::NodeParams&& params); diff --git a/modules/nvidia_plugin/src/cuda/node_params.hpp b/modules/nvidia_plugin/src/cuda/node_params.hpp index 2edc20139..aadea48fa 100644 --- a/modules/nvidia_plugin/src/cuda/node_params.hpp +++ b/modules/nvidia_plugin/src/cuda/node_params.hpp @@ -6,6 +6,7 @@ #include +#include #include namespace CUDA { @@ -33,9 +34,17 @@ struct NodeParams { void reset_args() { ptrs_.clear(); } + friend bool operator==(const NodeParams& lhs, const NodeParams& rhs); + private: std::vector ptrs_; cudaKernelNodeParams knp_; }; +inline bool operator==(const NodeParams& lhs, const NodeParams& rhs) { + return lhs.ptrs_ == rhs.ptrs_ && rhs.knp_.func == lhs.knp_.func && rhs.knp_.gridDim == lhs.knp_.gridDim && + rhs.knp_.blockDim == lhs.knp_.blockDim && rhs.knp_.sharedMemBytes == lhs.knp_.sharedMemBytes && + rhs.knp_.extra == lhs.knp_.extra; +} + } // namespace CUDA diff --git a/modules/nvidia_plugin/src/cuda/utils.hpp b/modules/nvidia_plugin/src/cuda/utils.hpp new file mode 100644 index 000000000..1ac504076 --- /dev/null +++ b/modules/nvidia_plugin/src/cuda/utils.hpp @@ -0,0 +1,15 @@ +// Copyright (C) 2020-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace CUDA { + +inline bool operator==(dim3 rhs, dim3 lhs) { return rhs.x == lhs.x && rhs.y == lhs.y && rhs.z == lhs.z; } + +inline bool operator!=(dim3 rhs, dim3 lhs) { return !(rhs == lhs); } + +} // namespace CUDA From f45363393178b013c6a0b43b1d78d463949cbdcb Mon Sep 17 00:00:00 2001 From: Andrii Pavliuk Date: Tue, 14 Nov 2023 23:15:24 +0200 Subject: [PATCH 10/14] [NVIDIA] Update Run() of *TopologyRunners to take non-const context reference --- modules/nvidia_plugin/src/cuda_eager_topology_runner.cpp | 2 +- modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp | 2 +- modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp | 2 +- modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp | 2 +- modules/nvidia_plugin/src/cuda_itopology_runner.hpp | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/nvidia_plugin/src/cuda_eager_topology_runner.cpp b/modules/nvidia_plugin/src/cuda_eager_topology_runner.cpp index 383cff255..892d48b49 100644 --- a/modules/nvidia_plugin/src/cuda_eager_topology_runner.cpp +++ b/modules/nvidia_plugin/src/cuda_eager_topology_runner.cpp @@ -10,7 +10,7 @@ namespace nvidia_gpu { EagerTopologyRunner::EagerTopologyRunner(const CreationContext& context, const std::shared_ptr& model) : SubGraph(context, model) {} -void EagerTopologyRunner::Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const { +void 
EagerTopologyRunner::Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const { Workbuffers workbuffers{}; workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); SubGraph::Execute(context, {}, {}, workbuffers); diff --git a/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp b/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp index 67230df6b..ef03cf251 100644 --- a/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp +++ b/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp @@ -16,7 +16,7 @@ class EagerTopologyRunner final : public SubGraph, public ITopologyRunner { EagerTopologyRunner(const CreationContext& context, const std::shared_ptr& model); ~EagerTopologyRunner() override = default; - void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override; + void Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override; void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override{}; const SubGraph& GetSubGraph() const override; }; diff --git a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp index 7c6c757db..fbd4356f6 100644 --- a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp +++ b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp @@ -53,7 +53,7 @@ CudaGraphTopologyRunner::CudaGraphTopologyRunner(const CreationContext& context, } } -void CudaGraphTopologyRunner::Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const { +void CudaGraphTopologyRunner::Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const { const auto& stream = context.getThreadContext().stream(); std::size_t graphIndex = 0; for (auto& subgraph : subgraphs_) { diff --git a/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp b/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp index 8e7cd1b85..408129fc9 100644 --- a/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp +++ b/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp @@ -14,7 +14,7 @@ class CudaGraphTopologyRunner final : public ITopologyRunner { CudaGraphTopologyRunner(const CreationContext& context, const std::shared_ptr& model); ~CudaGraphTopologyRunner() override = default; - void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override; + void Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override; void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override; const SubGraph& GetSubGraph() const override; diff --git a/modules/nvidia_plugin/src/cuda_itopology_runner.hpp b/modules/nvidia_plugin/src/cuda_itopology_runner.hpp index 04cb61bbc..0e450db79 100644 --- a/modules/nvidia_plugin/src/cuda_itopology_runner.hpp +++ b/modules/nvidia_plugin/src/cuda_itopology_runner.hpp @@ -10,7 +10,7 @@ namespace ov { namespace nvidia_gpu { struct ITopologyRunner { - virtual void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0; + virtual void Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0; virtual void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0; virtual const SubGraph& GetSubGraph() const = 0; virtual ~ITopologyRunner() = default; From a267341bc09b8b44190e513341e43612a65ba561 Mon Sep 17 00:00:00 2001 From: Andrii Pavliuk Date: 
Tue, 14 Nov 2023 23:35:17 +0200 Subject: [PATCH 11/14] [NVIDIA] Remove TiCudaGraphInfo, add set_current_graph(), add_new_graph_info(), get_current_graph_info(), select_current_graph() --- .../nvidia_plugin/src/cuda_graph_context.cpp | 169 ++++++++---------- .../nvidia_plugin/src/cuda_graph_context.hpp | 112 +++++------- .../src/cuda_graph_topology_runner.cpp | 13 +- .../nvidia_plugin/src/ops/tensor_iterator.cpp | 36 ++-- .../nvidia_plugin/src/ops/tensor_iterator.hpp | 12 +- 5 files changed, 155 insertions(+), 187 deletions(-) diff --git a/modules/nvidia_plugin/src/cuda_graph_context.cpp b/modules/nvidia_plugin/src/cuda_graph_context.cpp index cfa6fbc9f..4033b5df5 100644 --- a/modules/nvidia_plugin/src/cuda_graph_context.cpp +++ b/modules/nvidia_plugin/src/cuda_graph_context.cpp @@ -7,58 +7,84 @@ namespace ov { namespace nvidia_gpu { -void TiCudaGraphInfo::add_transfer(const CUDA::Stream& stream, - CUDA::DevicePointer dst, - CUDA::DevicePointer src, - std::size_t size) { +void CudaGraphInfo::add_parameter(const std::string& tensorName, + const CUDA::Stream& stream, + CUDA::DevicePointer dst, + const void* src, + std::size_t size) { + CUDA::CaptureInfo captureInfo{stream}; + parameterNodes_.emplace(tensorName, captureInfo.addUploadNode(dst, src, size)); +} + +void CudaGraphInfo::add_result(const std::string& tensorName, + const CUDA::Stream& stream, + void* dst, + CUDA::DevicePointer src, + std::size_t size) { + CUDA::CaptureInfo captureInfo{stream}; + resultNodes_.emplace(tensorName, captureInfo.addDownloadNode(dst, src, size)); +} + +void CudaGraphInfo::add_transfer(const CUDA::Stream& stream, + CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size) { CUDA::CaptureInfo captureInfo{stream}; transferNodes_.emplace_back(captureInfo.addTransferNode(dst, src, size)); } -void TiCudaGraphInfo::set_params_graph(const CUDA::Graph& graph) { - paramsGraph_.emplace(graph); - paramsGraphExec_.emplace(graph); +bool CudaGraphInfo::is_initialized() const { return graph_.has_value() && graphExec_.has_value(); } + +void CudaGraphInfo::update_capture(const TensorMappingContext& context) { + for (auto&& [tensorName, node] : parameterNodes_) { + node.update_src(graphExec_.value(), (context.get_input_tensor(tensorName)->data())); + } + for (auto&& [tensorName, node] : resultNodes_) { + node.update_dst(graphExec_.value(), context.get_output_tensor(tensorName)->data()); + } +} + +void CudaGraphInfo::set_graph(const CUDA::Graph& graph) { + graph_.emplace(graph); + graphExec_.emplace(graph); } -void TiCudaGraphInfo::set_body_graph(const CUDA::Graph& graph) { - bodyGraph_.emplace(graph); - bodyGraphExec_.emplace(graph); +void CudaGraphInfo::set_params_graph(const CUDA::Graph& graph) { + paramsGraph_.emplace(graph); + paramsGraphExec_.emplace(graph); } -void TiCudaGraphInfo::set_results_graph(const CUDA::Graph& graph) { +void CudaGraphInfo::set_results_graph(const CUDA::Graph& graph) { resultsGraph_.emplace(graph); resultsGraphExec_.emplace(graph); } -void TiCudaGraphInfo::launch_params_graph(const CUDA::Stream& stream) const { paramsGraphExec_.value().launch(stream); } +void CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); } -void TiCudaGraphInfo::launch_body_graph(const CUDA::Stream& stream) const { bodyGraphExec_.value().launch(stream); } +void CudaGraphInfo::launch_params_graph(const CUDA::Stream& stream) const { paramsGraphExec_.value().launch(stream); } -void TiCudaGraphInfo::launch_results_graph(const CUDA::Stream& stream) const { - 
resultsGraphExec_.value().launch(stream);
-}
+void CudaGraphInfo::launch_results_graph(const CUDA::Stream& stream) const { resultsGraphExec_.value().launch(stream); }
 
-std::size_t TiCudaGraphInfo::get_transfers_count() const { return transferNodes_.size(); }
+bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs) {
+    return lhs.graph_ == rhs.graph_ && lhs.graphExec_ == rhs.graphExec_ && lhs.parameterNodes_ == rhs.parameterNodes_ &&
+           lhs.resultNodes_ == rhs.resultNodes_ && lhs.transferNodes_ == rhs.transferNodes_ &&
+           lhs.kernelNodes_ == rhs.kernelNodes_;
+}
 
-std::size_t TiCudaGraphInfo::get_kernels_count() const { return kernelNodes_.size(); }
+bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs) { return !(lhs == rhs); }
 
 void CudaGraphContext::reset() {
-    graphs_.clear();
+    graph_infos_.clear();
     currentGraphIndex_ = 0;
 }
 
-void CudaGraphContext::start_next_graph_addition() {
-    currentGraphIndex_ = graphs_.size();
-    graphs_.emplace_back();
-}
-
 void CudaGraphContext::add_parameter(const std::string& tensorName,
                                      const CUDA::Stream& stream,
                                      CUDA::DevicePointer dst,
                                      const void* src,
                                      std::size_t size) {
-    OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency");
-    graphs_[currentGraphIndex_].add_parameter(tensorName, stream, dst, src, size);
+    OPENVINO_ASSERT(currentGraphIndex_ < graph_infos_.size(), "Graph index/vector size inconsistency");
+    graph_infos_[currentGraphIndex_].add_parameter(tensorName, stream, dst, src, size);
 }
 
 void CudaGraphContext::add_result(const std::string& tensorName,
@@ -66,37 +92,43 @@ void CudaGraphContext::add_result(const std::string& tensorName,
                                   void* dst,
                                   CUDA::DevicePointer src,
                                   std::size_t size) {
-    OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency");
-    graphs_[currentGraphIndex_].add_result(tensorName, stream, dst, src, size);
+    OPENVINO_ASSERT(currentGraphIndex_ < graph_infos_.size(), "Graph index/vector size inconsistency");
+    graph_infos_[currentGraphIndex_].add_result(tensorName, stream, dst, src, size);
 }
 
-void CudaGraphContext::add_graph(const CUDA::Graph& graph) {
-    OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency");
-    graphs_[currentGraphIndex_].set_graph(graph);
+void CudaGraphContext::set_current_graph(const CUDA::Graph& graph) {
+    OPENVINO_ASSERT(currentGraphIndex_ < graph_infos_.size(), "Graph index/vector size inconsistency");
+    graph_infos_[currentGraphIndex_].set_graph(graph);
 }
 
-TiCudaGraphInfo& CudaGraphContext::get_ti_graph(const std::string& ti_op_name) const { return ti_graphs_[ti_op_name]; }
-
 bool CudaGraphContext::is_initialized() const {
-    const auto size = graphs_.size();
-    return size != 0 && graphs_[size - 1].is_initialized();
+    const auto size = graph_infos_.size();
+    return size != 0 && graph_infos_[size - 1].is_initialized();
 }
 
 void CudaGraphContext::update_capture(const TensorMappingContext& context) {
-    for (currentGraphIndex_ = 0; currentGraphIndex_ < graphs_.size(); ++currentGraphIndex_) {
-        graphs_[currentGraphIndex_].update_capture(context);
+    for (currentGraphIndex_ = 0; currentGraphIndex_ < graph_infos_.size(); ++currentGraphIndex_) {
+        graph_infos_[currentGraphIndex_].update_capture(context);
     }
 }
 
-void CudaGraphContext::launch(std::size_t index, const CUDA::Stream& stream) const {
+void CudaGraphContext::add_new_graph_info() {
+    currentGraphIndex_ = graph_infos_.size();
+    graph_infos_.emplace_back();
+}
+
+const CudaGraphInfo& CudaGraphContext::get_current_graph_info() const {
return graph_infos_[currentGraphIndex_]; } + +CudaGraphInfo& CudaGraphContext::get_current_graph_info() { return graph_infos_[currentGraphIndex_]; } + +void CudaGraphContext::select_current_graph(std::size_t index) { + OPENVINO_ASSERT(index < graph_infos_.size(), "Graph index/vector size incosistency"); currentGraphIndex_ = index; - OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency"); - graphs_[currentGraphIndex_].launch(stream); } std::size_t CudaGraphContext::get_params_count() const { std::size_t res = 0; - for (const auto& graph : graphs_) { + for (const auto& graph : graph_infos_) { res += graph.get_params_count(); } return res; @@ -104,64 +136,15 @@ std::size_t CudaGraphContext::get_params_count() const { std::size_t CudaGraphContext::get_results_count() const { std::size_t res = 0; - for (const auto& graph : graphs_) { + for (const auto& graph : graph_infos_) { res += graph.get_results_count(); } return res; } -std::size_t CudaGraphContext::get_graphs_count() const { return graphs_.size(); } - -void CudaGraphContext::CudaGraphInfo::add_parameter(const std::string& tensorName, - const CUDA::Stream& stream, - CUDA::DevicePointer dst, - const void* src, - std::size_t size) { - CUDA::CaptureInfo captureInfo{stream}; - parameterNodes_.emplace(tensorName, captureInfo.addUploadNode(dst, src, size)); -} - -void CudaGraphContext::CudaGraphInfo::add_result(const std::string& tensorName, - const CUDA::Stream& stream, - void* dst, - CUDA::DevicePointer src, - std::size_t size) { - CUDA::CaptureInfo captureInfo{stream}; - resultNodes_.emplace(tensorName, captureInfo.addDownloadNode(dst, src, size)); -} - -void CudaGraphContext::CudaGraphInfo::set_graph(const CUDA::Graph& graph) { - graph_.emplace(graph); - graphExec_.emplace(graph); -} - -bool CudaGraphContext::CudaGraphInfo::is_initialized() const { return graph_.has_value() && graphExec_.has_value(); } - -void CudaGraphContext::CudaGraphInfo::update_capture(const TensorMappingContext& context) { - for (auto&& [tensorName, node] : parameterNodes_) { - node.update_src(graphExec_.value(), (context.get_input_tensor(tensorName)->data())); - } - for (auto&& [tensorName, node] : resultNodes_) { - node.update_dst(graphExec_.value(), context.get_output_tensor(tensorName)->data()); - } -} - -void CudaGraphContext::CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); } - -std::size_t CudaGraphContext::CudaGraphInfo::get_params_count() const { return parameterNodes_.size(); } - -std::size_t CudaGraphContext::CudaGraphInfo::get_results_count() const { return resultNodes_.size(); } - -bool operator==(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) { - return lhs.graph_ == rhs.graph_ && lhs.graphExec_ == rhs.graphExec_ && lhs.parameterNodes_ == rhs.parameterNodes_ && - lhs.resultNodes_ == rhs.resultNodes_; -} - -bool operator!=(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) { - return !(lhs == rhs); -} +std::size_t CudaGraphContext::get_graphs_count() const { return graph_infos_.size(); } -bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs) { return lhs.graphs_ == rhs.graphs_; } +bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs) { return lhs.graph_infos_ == rhs.graph_infos_; } bool operator!=(const CudaGraphContext& lhs, const CudaGraphContext& rhs) { return !(lhs == rhs); } diff --git a/modules/nvidia_plugin/src/cuda_graph_context.hpp 
b/modules/nvidia_plugin/src/cuda_graph_context.hpp index c3c1545a2..484bc15c5 100644 --- a/modules/nvidia_plugin/src/cuda_graph_context.hpp +++ b/modules/nvidia_plugin/src/cuda_graph_context.hpp @@ -11,8 +11,20 @@ namespace ov { namespace nvidia_gpu { -class TiCudaGraphInfo { +class CudaGraphInfo { public: + void add_parameter(const std::string& tensorName, + const CUDA::Stream& stream, + CUDA::DevicePointer dst, + const void* src, + std::size_t size); + + void add_result(const std::string& tensorName, + const CUDA::Stream& stream, + void* dst, + CUDA::DevicePointer src, + std::size_t size); + void add_transfer(const CUDA::Stream& stream, CUDA::DevicePointer dst, CUDA::DevicePointer src, @@ -24,32 +36,44 @@ class TiCudaGraphInfo { kernelNodes_.emplace_back(captureInfo.addKernelNode(kernel, gridDim, blockDim, std::forward(args)...)); } - void set_params_graph(const CUDA::Graph& graph); - void set_body_graph(const CUDA::Graph& graph); - void set_results_graph(const CUDA::Graph& graph); - template void update_kernel(std::size_t index, Args&&... args) { - kernelNodes_[index].update_params(bodyGraphExec_.value(), std::forward(args)...); + kernelNodes_[index].update_params(graphExec_.value(), std::forward(args)...); } + bool is_initialized() const; + + void update_capture(const TensorMappingContext& context); + + std::size_t get_params_count() const { return parameterNodes_.size(); } + std::size_t get_results_count() const { return resultNodes_.size(); } + std::size_t get_transfers_count() const { return transferNodes_.size(); } + std::size_t get_kernels_count() const { return kernelNodes_.size(); } + + void set_graph(const CUDA::Graph& graph); + void set_params_graph(const CUDA::Graph& graph); + void set_results_graph(const CUDA::Graph& graph); + + void launch(const CUDA::Stream& stream) const; void launch_params_graph(const CUDA::Stream& stream) const; - void launch_body_graph(const CUDA::Stream& stream) const; void launch_results_graph(const CUDA::Stream& stream) const; - std::size_t get_transfers_count() const; - std::size_t get_kernels_count() const; + friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); + friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); private: + std::optional graph_{}; + std::optional graphExec_{}; + std::optional paramsGraph_{}; std::optional paramsGraphExec_{}; - std::optional bodyGraph_{}; - std::optional bodyGraphExec_{}; - std::optional resultsGraph_{}; std::optional resultsGraphExec_{}; + std::map parameterNodes_; + std::map resultNodes_; + std::vector transferNodes_; std::vector kernelNodes_; }; @@ -58,8 +82,6 @@ class CudaGraphContext { public: void reset(); - void start_next_graph_addition(); - void add_parameter(const std::string& tensorName, const CUDA::Stream& stream, CUDA::DevicePointer dst, @@ -72,17 +94,18 @@ class CudaGraphContext { CUDA::DevicePointer src, std::size_t size); - void add_graph(const CUDA::Graph& graph); - - void add_ti_graph(const std::string& ti_op_name, const CUDA::Graph& graph); - - TiCudaGraphInfo& get_ti_graph(const std::string& ti_op_name) const; + void set_current_graph(const CUDA::Graph& graph); bool is_initialized() const; void update_capture(const TensorMappingContext& context); - void launch(std::size_t index, const CUDA::Stream& stream) const; + void add_new_graph_info(); + + const CudaGraphInfo& get_current_graph_info() const; + CudaGraphInfo& get_current_graph_info(); + + void select_current_graph(std::size_t index); std::size_t get_params_count() const; std::size_t 
get_results_count() const; @@ -93,54 +116,13 @@ class CudaGraphContext { friend bool operator!=(const CudaGraphContext& lhs, const CudaGraphContext& rhs); private: - class CudaGraphInfo { - public: - void add_parameter(const std::string& tensorName, - const CUDA::Stream& stream, - CUDA::DevicePointer dst, - const void* src, - std::size_t size); - - void add_result(const std::string& tensorName, - const CUDA::Stream& stream, - void* dst, - CUDA::DevicePointer src, - std::size_t size); - - void set_graph(const CUDA::Graph& graph); - - bool is_initialized() const; - - void update_capture(const TensorMappingContext& context); - - void launch(const CUDA::Stream& stream) const; - - std::size_t get_params_count() const; - std::size_t get_results_count() const; - - friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); - friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); - - private: - std::optional graph_{}; - std::optional graphExec_{}; - std::map parameterNodes_; - std::map resultNodes_; - - }; - - friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); - friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); - -private: - std::vector graphs_{}; - mutable std::unordered_map ti_graphs_; - mutable std::size_t currentGraphIndex_ = 0; + std::vector graph_infos_{}; + std::size_t currentGraphIndex_ = 0; }; -bool operator==(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs); +bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); -bool operator!=(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs); +bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs); diff --git a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp index fbd4356f6..615083dd7 100644 --- a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp +++ b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp @@ -55,6 +55,7 @@ CudaGraphTopologyRunner::CudaGraphTopologyRunner(const CreationContext& context, void CudaGraphTopologyRunner::Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const { const auto& stream = context.getThreadContext().stream(); + auto& graphContext = context.getCudaGraphContext(); std::size_t graphIndex = 0; for (auto& subgraph : subgraphs_) { if (auto ti = getTI(subgraph)) { @@ -63,9 +64,12 @@ void CudaGraphTopologyRunner::Run(InferenceRequestContext& context, const Device const auto& inputTensors = memoryManager.inputTensorPointers(*ti, mutableBuffer); const auto& outputTensors = memoryManager.outputTensorPointers(*ti, mutableBuffer); const auto& workBuffers = memoryManager.workBuffers(*ti, mutableBuffer); + graphContext.select_current_graph(graphIndex); ti->ExecuteGraph(context, inputTensors, outputTensors, workBuffers); + graphIndex++; } else if (subgraph.IsCudaGraphCompatible()) { - context.getCudaGraphContext().launch(graphIndex, stream); + graphContext.select_current_graph(graphIndex); + graphContext.get_current_graph_info().launch(stream); graphIndex++; } else { Workbuffers workbuffers{}; @@ -85,20 +89,19 @@ void CudaGraphTopologyRunner::Capture(InferenceRequestContext& context, Workbuffers workbuffers{}; workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); if (getTI(subgraph)) { + graphContext.add_new_graph_info(); subgraph.Capture(context, {}, 
{}, workbuffers); } else if (subgraph.IsCudaGraphCompatible()) { - graphContext.start_next_graph_addition(); + graphContext.add_new_graph_info(); CUDA::GraphCapture capture{stream}; { auto scope = capture.getScope(); subgraph.Capture(context, {}, {}, workbuffers); } const auto& graph = capture.getGraph(); - graphContext.add_graph(graph); + graphContext.set_current_graph(graph); } } - // OPENVINO_ASSERT(graphContext.get_graphs_count() == GetCudaGraphsCount(), - // "CudaGraphTopologyRunner/CudaGraphContext graphs count mismatch"); } const SubGraph& CudaGraphTopologyRunner::GetSubGraph() const { diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp index 4da29334b..b0dbd0c5f 100644 --- a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp +++ b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp @@ -207,7 +207,7 @@ void TensorIteratorOp::Execute(const InferenceRequestContext& context, } } -void TensorIteratorOp::ExecuteGraph(const InferenceRequestContext& context, +void TensorIteratorOp::ExecuteGraph(InferenceRequestContext& context, Inputs inputTensors, Outputs outputTensors, const Workbuffers& workbuffers) { @@ -215,24 +215,24 @@ void TensorIteratorOp::ExecuteGraph(const InferenceRequestContext& context, const auto& memoryManager = *memory_manager_; const auto& mutableBuffer = workbuffers.mutable_buffers.at(0); - auto& tiGraphInfo = context.getCudaGraphContext().get_ti_graph(GetName()); + auto& graphInfo = context.getCudaGraphContext().get_current_graph_info(); - tiGraphInfo.launch_params_graph(stream); + graphInfo.launch_params_graph(stream); - OPENVINO_ASSERT(tiGraphInfo.get_kernels_count() == slices_.size() + inserts_.size(), + OPENVINO_ASSERT(graphInfo.get_kernels_count() == slices_.size() + inserts_.size(), "CudaGraphContext/TensorIteratorOp slices or inserts count incosistency"); for (int64_t iter = 0; iter < num_iterations_; ++iter) { for (std::size_t i = 0; i < slices_.size(); ++i) { - slices_[i].update_kernel_node(tiGraphInfo, i, mutableBuffer, inputTensors, iter); + slices_[i].update_kernel_node(graphInfo, i, mutableBuffer, inputTensors, iter); } for (std::size_t i = 0; i < inserts_.size(); ++i) { - inserts_[i].update_kernel_node(tiGraphInfo, i + slices_.size(), mutableBuffer, outputTensors, iter); + inserts_[i].update_kernel_node(graphInfo, i + slices_.size(), mutableBuffer, outputTensors, iter); } - tiGraphInfo.launch_body_graph(stream); + graphInfo.launch(stream); } - tiGraphInfo.launch_results_graph(stream); + graphInfo.launch_results_graph(stream); } bool TensorIteratorOp::IsCudaGraphCompatible() const { @@ -254,7 +254,7 @@ void TensorIteratorOp::Capture(InferenceRequestContext& context, auto& executionDelegator = context.getExecutionDelegator(); executionDelegator.set_stream(stream); - auto& tiGraphInfo = context.getCudaGraphContext().get_ti_graph(GetName()); + auto& graphInfo = context.getCudaGraphContext().get_current_graph_info(); CUDA::GraphCapture capture{stream}; { auto scope = capture.getScope(); @@ -269,12 +269,12 @@ void TensorIteratorOp::Capture(InferenceRequestContext& context, } } } - tiGraphInfo.set_params_graph(capture.getGraph()); + graphInfo.set_params_graph(capture.getGraph()); { auto scope = capture.getScope(); // Input mapping of ports for (auto& slice : slices_) { - slice.add_kernel_node(tiGraphInfo, stream, mutableBuffer, inputTensors); + slice.add_kernel_node(graphInfo, stream, mutableBuffer, inputTensors); } // Inner loop @@ -282,15 +282,15 @@ void 
TensorIteratorOp::Capture(InferenceRequestContext& context, // Back-edge mapping for (auto& transfer : transfers_) { - transfer.add_transfer_node(tiGraphInfo, stream, mutableBuffer); + transfer.add_transfer_node(graphInfo, stream, mutableBuffer); } // Output mapping of ports for (auto& insert : inserts_) { - insert.add_kernel_node(tiGraphInfo, stream, mutableBuffer, outputTensors); + insert.add_kernel_node(graphInfo, stream, mutableBuffer, outputTensors); } } - tiGraphInfo.set_body_graph(capture.getGraph()); + graphInfo.set_graph(capture.getGraph()); { auto scope = capture.getScope(); // Copy data to output @@ -301,7 +301,7 @@ void TensorIteratorOp::Capture(InferenceRequestContext& context, } } } - tiGraphInfo.set_results_graph(capture.getGraph()); + graphInfo.set_results_graph(capture.getGraph()); } TensorIteratorOp::SliceLauncher::SliceLauncher(const TensorIteratorOp& ti, uint64_t inputIdx, uint64_t paramIdx) @@ -316,7 +316,7 @@ TensorIteratorOp::SliceLauncher::SliceLauncher(const TensorIteratorOp& ti, uint6 stride_ = portMap.stride; } -void TensorIteratorOp::SliceLauncher::add_kernel_node(TiCudaGraphInfo& info, +void TensorIteratorOp::SliceLauncher::add_kernel_node(CudaGraphInfo& info, const CUDA::Stream& stream, CUDA::DevicePointer mutableBuffer, const IOperationExec::Inputs& inputTensors) { @@ -340,7 +340,7 @@ TensorIteratorOp::TransferLauncher::TransferLauncher(const TensorIteratorOp& ti, OPENVINO_ASSERT(param_size_ == resultSize, "Node name: ", ti.GetName()); } -void TensorIteratorOp::TransferLauncher::add_transfer_node(TiCudaGraphInfo& info, +void TensorIteratorOp::TransferLauncher::add_transfer_node(CudaGraphInfo& info, const CUDA::Stream& stream, CUDA::DevicePointer mutableBuffer) { const auto& paramTensors = memory_manager_.outputTensorPointers(param_, mutableBuffer); @@ -364,7 +364,7 @@ TensorIteratorOp::InsertLauncher::InsertLauncher(const TensorIteratorOp& ti, stride_ = portMap.stride; } -void TensorIteratorOp::InsertLauncher::add_kernel_node(TiCudaGraphInfo& info, +void TensorIteratorOp::InsertLauncher::add_kernel_node(CudaGraphInfo& info, const CUDA::Stream& stream, CUDA::DevicePointer mutableBuffer, const IOperationExec::Outputs& outputTensors) { diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp index f53ccfad1..44f22bc01 100644 --- a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp +++ b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp @@ -28,7 +28,7 @@ class TensorIteratorOp : public SubGraph { Outputs outputTensors, const Workbuffers& workbuffers) const override; - void ExecuteGraph(const InferenceRequestContext& context, + void ExecuteGraph(InferenceRequestContext& context, Inputs inputTensors, Outputs outputTensors, const Workbuffers& workbuffers); @@ -62,12 +62,12 @@ class TensorIteratorOp : public SubGraph { slice_(stream.get(), src, dst, start_ + iter * stride_); } - void add_kernel_node(TiCudaGraphInfo& info, + void add_kernel_node(CudaGraphInfo& info, const CUDA::Stream& stream, CUDA::DevicePointer mutableBuffer, const IOperationExec::Inputs& inputTensors); - void update_kernel_node(TiCudaGraphInfo& info, + void update_kernel_node(CudaGraphInfo& info, std::size_t index, CUDA::DevicePointer mutableBuffer, const IOperationExec::Inputs& inputTensors, @@ -98,7 +98,7 @@ class TensorIteratorOp : public SubGraph { throwIfError(cudaMemcpyAsync(dst, src, param_size_, cudaMemcpyDeviceToDevice, stream.get())); } - void add_transfer_node(TiCudaGraphInfo& info, + void add_transfer_node(CudaGraphInfo& info, 
const CUDA::Stream& stream, CUDA::DevicePointer mutableBuffer); @@ -122,12 +122,12 @@ class TensorIteratorOp : public SubGraph { insert_(stream.get(), src, dst, start_ + iter * stride_); } - void add_kernel_node(TiCudaGraphInfo& info, + void add_kernel_node(CudaGraphInfo& info, const CUDA::Stream& stream, CUDA::DevicePointer mutableBuffer, const IOperationExec::Outputs& outputTensors); - void update_kernel_node(TiCudaGraphInfo& info, + void update_kernel_node(CudaGraphInfo& info, std::size_t index, CUDA::DevicePointer mutableBuffer, const IOperationExec::Outputs& outputTensors, From 80dff6fcef7570f3b25d040af2700a0c3d5296ff Mon Sep 17 00:00:00 2001 From: Andrii Pavliuk Date: Wed, 15 Nov 2023 18:38:32 +0200 Subject: [PATCH 12/14] [NVIDIA] Change IsCudaGraphCompatible() interface to GetCudaGraphCompatibility() using enum --- .../src/cuda_graph_topology_runner.cpp | 30 +++++++++++-------- .../nvidia_plugin/src/cuda_operation_base.hpp | 6 ++-- .../src/ops/activation_forward_cudnn_base.cpp | 4 ++- .../src/ops/activation_forward_cudnn_base.hpp | 2 +- modules/nvidia_plugin/src/ops/avgpool.cpp | 2 +- modules/nvidia_plugin/src/ops/avgpool.hpp | 2 +- modules/nvidia_plugin/src/ops/broadcast.cpp | 2 +- modules/nvidia_plugin/src/ops/broadcast.hpp | 2 +- modules/nvidia_plugin/src/ops/clamp_cuda.cpp | 2 +- modules/nvidia_plugin/src/ops/clamp_cuda.hpp | 2 +- modules/nvidia_plugin/src/ops/clamp_cudnn.cpp | 2 +- modules/nvidia_plugin/src/ops/clamp_cudnn.hpp | 2 +- modules/nvidia_plugin/src/ops/comparison.cpp | 2 +- modules/nvidia_plugin/src/ops/comparison.hpp | 2 +- modules/nvidia_plugin/src/ops/concat.cpp | 2 +- modules/nvidia_plugin/src/ops/concat.hpp | 2 +- modules/nvidia_plugin/src/ops/convert.cpp | 2 +- modules/nvidia_plugin/src/ops/convert.hpp | 2 +- .../src/ops/convert_color_i420.hpp | 2 +- .../src/ops/convert_color_nv12.hpp | 2 +- .../src/ops/convolution_backprop_data.cpp | 4 +-- .../src/ops/convolution_backprop_data.hpp | 2 +- .../src/ops/convolution_cudnn.cpp | 2 +- .../src/ops/convolution_cudnn.hpp | 2 +- .../src/ops/convolution_cudnn_be.cpp | 2 +- .../src/ops/convolution_cudnn_be.hpp | 2 +- .../src/ops/cudnn_tensor_op_base.cpp | 2 +- .../src/ops/cudnn_tensor_op_base.hpp | 2 +- .../src/ops/detection_output.cpp | 2 +- .../src/ops/detection_output.hpp | 2 +- .../src/ops/elementwise_binary.hpp | 2 +- .../src/ops/elementwise_unary.hpp | 2 +- modules/nvidia_plugin/src/ops/elu.cpp | 2 +- modules/nvidia_plugin/src/ops/elu.hpp | 2 +- .../nvidia_plugin/src/ops/fake_quantize.cpp | 2 +- .../nvidia_plugin/src/ops/fake_quantize.hpp | 2 +- .../nvidia_plugin/src/ops/fully_connected.cpp | 2 +- .../nvidia_plugin/src/ops/fully_connected.hpp | 2 +- .../ops/fused_convolution_backprop_data.cpp | 4 ++- .../ops/fused_convolution_backprop_data.hpp | 2 +- .../src/ops/fused_convolution_cudnn.cpp | 2 +- .../src/ops/fused_convolution_cudnn.hpp | 2 +- .../src/ops/fused_convolution_cudnn_be.cpp | 4 ++- .../src/ops/fused_convolution_cudnn_be.hpp | 2 +- .../fused_convolution_cudnn_decomposed.cpp | 4 ++- .../fused_convolution_cudnn_decomposed.hpp | 2 +- modules/nvidia_plugin/src/ops/gather.cpp | 2 +- modules/nvidia_plugin/src/ops/gather.hpp | 2 +- .../src/ops/group_convolution.cpp | 2 +- .../src/ops/group_convolution.hpp | 2 +- modules/nvidia_plugin/src/ops/gru_cell.cpp | 2 +- modules/nvidia_plugin/src/ops/gru_cell.hpp | 2 +- .../nvidia_plugin/src/ops/gru_sequence.cpp | 6 ++-- .../nvidia_plugin/src/ops/gru_sequence.hpp | 4 +-- .../src/ops/interpolate_cubic.cpp | 2 +- .../src/ops/interpolate_cubic.hpp | 2 +- 
.../src/ops/interpolate_linear.cpp | 2 +- .../src/ops/interpolate_linear.hpp | 2 +- .../src/ops/interpolate_nearest.cpp | 2 +- .../src/ops/interpolate_nearest.hpp | 2 +- modules/nvidia_plugin/src/ops/logical_not.cpp | 2 +- modules/nvidia_plugin/src/ops/logical_not.hpp | 2 +- modules/nvidia_plugin/src/ops/lstm_cell.cpp | 2 +- modules/nvidia_plugin/src/ops/lstm_cell.hpp | 2 +- .../src/ops/lstm_sequence_base.cpp | 6 ++-- .../src/ops/lstm_sequence_base.hpp | 4 +-- modules/nvidia_plugin/src/ops/matmul.cpp | 2 +- modules/nvidia_plugin/src/ops/matmul.hpp | 2 +- modules/nvidia_plugin/src/ops/maxpool.cpp | 2 +- modules/nvidia_plugin/src/ops/maxpool.hpp | 2 +- modules/nvidia_plugin/src/ops/mvn.cpp | 2 +- modules/nvidia_plugin/src/ops/mvn.hpp | 2 +- modules/nvidia_plugin/src/ops/nop_op.hpp | 2 +- modules/nvidia_plugin/src/ops/pad.cpp | 2 +- modules/nvidia_plugin/src/ops/pad.hpp | 2 +- modules/nvidia_plugin/src/ops/parameter.cpp | 2 +- modules/nvidia_plugin/src/ops/parameter.hpp | 2 +- modules/nvidia_plugin/src/ops/range.cpp | 2 +- modules/nvidia_plugin/src/ops/range.hpp | 2 +- modules/nvidia_plugin/src/ops/reduce.cpp | 2 +- modules/nvidia_plugin/src/ops/reduce.hpp | 2 +- modules/nvidia_plugin/src/ops/result.cpp | 2 +- modules/nvidia_plugin/src/ops/result.hpp | 2 +- modules/nvidia_plugin/src/ops/round.cpp | 2 +- modules/nvidia_plugin/src/ops/round.hpp | 2 +- .../src/ops/scatter_nd_update.cpp | 2 +- .../src/ops/scatter_nd_update.hpp | 2 +- modules/nvidia_plugin/src/ops/select.cpp | 2 +- modules/nvidia_plugin/src/ops/select.hpp | 2 +- modules/nvidia_plugin/src/ops/softmax.cpp | 2 +- modules/nvidia_plugin/src/ops/softmax.hpp | 2 +- modules/nvidia_plugin/src/ops/split.cpp | 2 +- modules/nvidia_plugin/src/ops/split.hpp | 2 +- .../nvidia_plugin/src/ops/strided_slice.cpp | 4 ++- .../nvidia_plugin/src/ops/strided_slice.hpp | 2 +- modules/nvidia_plugin/src/ops/subgraph.cpp | 16 ++++++---- modules/nvidia_plugin/src/ops/subgraph.hpp | 7 ++--- modules/nvidia_plugin/src/ops/swish.cpp | 2 +- modules/nvidia_plugin/src/ops/swish.hpp | 2 +- .../nvidia_plugin/src/ops/tensor_iterator.cpp | 7 +++-- .../nvidia_plugin/src/ops/tensor_iterator.hpp | 2 +- modules/nvidia_plugin/src/ops/topk.cpp | 2 +- modules/nvidia_plugin/src/ops/topk.hpp | 2 +- modules/nvidia_plugin/src/ops/transpose.cpp | 2 +- modules/nvidia_plugin/src/ops/transpose.hpp | 2 +- .../nvidia_plugin/src/ops/variadic_split.cpp | 2 +- .../nvidia_plugin/src/ops/variadic_split.hpp | 2 +- ...tible.cpp => cuda_graph_compatibility.cpp} | 12 ++++---- .../tests/unit/cuda_multi_graph_test.cpp | 4 +-- 109 files changed, 167 insertions(+), 143 deletions(-) rename modules/nvidia_plugin/tests/unit/{is_cuda_graph_compatible.cpp => cuda_graph_compatibility.cpp} (95%) diff --git a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp index 615083dd7..cb456c4e5 100644 --- a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp +++ b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp @@ -31,12 +31,12 @@ CudaGraphTopologyRunner::CudaGraphTopologyRunner(const CreationContext& context, const auto totalSize = origSequence.size(); OPENVINO_ASSERT(totalSize != 0, "ExecSequence size is 0"); - bool isLastOpCompatible = origSequence[0]->IsCudaGraphCompatible(); + CudaGraphCompatibility lastOpCompatibility = origSequence[0]->GetCudaGraphCompatibility(); currentSequence.push_back(origSequence[0]); for (size_t i = 1; i < totalSize; ++i) { const auto& op = origSequence[i]; - if (std::dynamic_pointer_cast(op) || 
op->IsCudaGraphCompatible() != isLastOpCompatible) { - isLastOpCompatible = !isLastOpCompatible; + if (auto c = op->GetCudaGraphCompatibility(); c != lastOpCompatibility) { + lastOpCompatibility = c; sequences.emplace_back(std::move(currentSequence)); currentSequence.clear(); } @@ -47,7 +47,7 @@ CudaGraphTopologyRunner::CudaGraphTopologyRunner(const CreationContext& context, const auto& memoryManager = orig_subgraph_.memoryManager(); for (auto&& sequence : sequences) { subgraphs_.emplace_back(context, model, std::move(sequence), memoryManager); - if (subgraphs_[subgraphs_.size() - 1].IsCudaGraphCompatible()) { + if (subgraphs_.back().GetCudaGraphCompatibility() != CudaGraphCompatibility::NONE) { ++cuda_graphs_count_; } } @@ -58,7 +58,14 @@ void CudaGraphTopologyRunner::Run(InferenceRequestContext& context, const Device auto& graphContext = context.getCudaGraphContext(); std::size_t graphIndex = 0; for (auto& subgraph : subgraphs_) { - if (auto ti = getTI(subgraph)) { + auto compatibility = subgraph.GetCudaGraphCompatibility(); + if (compatibility == CudaGraphCompatibility::FULL) { + graphContext.select_current_graph(graphIndex); + graphContext.get_current_graph_info().launch(stream); + graphIndex++; + } else if (compatibility == CudaGraphCompatibility::SPECIAL) { + // TODO: remove + auto ti = getTI(subgraph); CUDA::DevicePointer mutableBuffer{memoryBlock.view().data()}; const auto& memoryManager = *subgraph.memoryManager(); const auto& inputTensors = memoryManager.inputTensorPointers(*ti, mutableBuffer); @@ -67,10 +74,6 @@ void CudaGraphTopologyRunner::Run(InferenceRequestContext& context, const Device graphContext.select_current_graph(graphIndex); ti->ExecuteGraph(context, inputTensors, outputTensors, workBuffers); graphIndex++; - } else if (subgraph.IsCudaGraphCompatible()) { - graphContext.select_current_graph(graphIndex); - graphContext.get_current_graph_info().launch(stream); - graphIndex++; } else { Workbuffers workbuffers{}; workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); @@ -88,10 +91,8 @@ void CudaGraphTopologyRunner::Capture(InferenceRequestContext& context, for (const auto& subgraph : subgraphs_) { Workbuffers workbuffers{}; workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); - if (getTI(subgraph)) { - graphContext.add_new_graph_info(); - subgraph.Capture(context, {}, {}, workbuffers); - } else if (subgraph.IsCudaGraphCompatible()) { + auto compatibility = subgraph.GetCudaGraphCompatibility(); + if (compatibility == CudaGraphCompatibility::FULL) { graphContext.add_new_graph_info(); CUDA::GraphCapture capture{stream}; { @@ -100,6 +101,9 @@ void CudaGraphTopologyRunner::Capture(InferenceRequestContext& context, } const auto& graph = capture.getGraph(); graphContext.set_current_graph(graph); + } else if (compatibility == CudaGraphCompatibility::SPECIAL) { + graphContext.add_new_graph_info(); + subgraph.Capture(context, {}, {}, workbuffers); } } } diff --git a/modules/nvidia_plugin/src/cuda_operation_base.hpp b/modules/nvidia_plugin/src/cuda_operation_base.hpp index d95273175..261c45950 100644 --- a/modules/nvidia_plugin/src/cuda_operation_base.hpp +++ b/modules/nvidia_plugin/src/cuda_operation_base.hpp @@ -30,6 +30,8 @@ namespace nvidia_gpu { template using DevicePointer = CUDA::DevicePointer; +enum class CudaGraphCompatibility { NONE, FULL, SPECIAL }; + class IOperationExec { public: using Inputs = gsl::span>; @@ -46,7 +48,7 @@ class IOperationExec { Inputs inputTensors, Outputs outputTensors, const Workbuffers& workbuffers) const = 0; - 
virtual bool IsCudaGraphCompatible() const = 0; + virtual CudaGraphCompatibility GetCudaGraphCompatibility() const = 0; virtual void InitSharedImmutableWorkbuffers(const Buffers&) = 0; virtual WorkbufferRequest GetWorkBufferRequest() const = 0; virtual const WorkbufferIds& GetWorkbufferIds() const = 0; @@ -81,7 +83,7 @@ class OperationBase : public IOperationExec, public IOperationMeta, public std:: IndexCollection&& inputIds, IndexCollection&& outputIds); - bool IsCudaGraphCompatible() const override { return false; } + CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::NONE; } WorkbufferRequest GetWorkBufferRequest() const override { return {}; // Most operators do not need workbuffers diff --git a/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.cpp b/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.cpp index 20a681cbf..8f752f01e 100644 --- a/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.cpp +++ b/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.cpp @@ -59,7 +59,9 @@ void ActivationForwardCuDnnOpBase::Execute(const InferenceRequestContext& contex outputTensors[0].get()); } -bool ActivationForwardCuDnnOpBase::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ActivationForwardCuDnnOpBase::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::FULL; +} } // namespace nvidia_gpu } // namespace ov diff --git a/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.hpp b/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.hpp index 05e5c5598..de065efa7 100644 --- a/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.hpp +++ b/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.hpp @@ -31,7 +31,7 @@ class ActivationForwardCuDnnOpBase : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; protected: std::unique_ptr op_desc_; diff --git a/modules/nvidia_plugin/src/ops/avgpool.cpp b/modules/nvidia_plugin/src/ops/avgpool.cpp index 6131ef5a5..858c50a06 100644 --- a/modules/nvidia_plugin/src/ops/avgpool.cpp +++ b/modules/nvidia_plugin/src/ops/avgpool.cpp @@ -30,7 +30,7 @@ void AvgPoolOp::Execute(const InferenceRequestContext& context, outputs[PoolingImpl::output_index].get()); } -bool AvgPoolOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility AvgPoolOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(AvgPoolOp, AvgPool); diff --git a/modules/nvidia_plugin/src/ops/avgpool.hpp b/modules/nvidia_plugin/src/ops/avgpool.hpp index e22e66fa0..184669f37 100644 --- a/modules/nvidia_plugin/src/ops/avgpool.hpp +++ b/modules/nvidia_plugin/src/ops/avgpool.hpp @@ -23,7 +23,7 @@ class AvgPoolOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: PoolingImpl impl_; diff --git a/modules/nvidia_plugin/src/ops/broadcast.cpp b/modules/nvidia_plugin/src/ops/broadcast.cpp index 5f97adf6c..3b6939a33 100644 --- a/modules/nvidia_plugin/src/ops/broadcast.cpp +++ b/modules/nvidia_plugin/src/ops/broadcast.cpp @@ -65,7 +65,7 @@ void BroadcastOp::Execute(const InferenceRequestContext& context, (*kernel_)(stream, inputs[0].get(), broadcast_params_->mapper(workbuffers.immutable_buffers), 
outputs[0].get()); } -bool BroadcastOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility BroadcastOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest BroadcastOp::GetWorkBufferRequest() const { return {immutable_buffer_sizes_, {}}; } diff --git a/modules/nvidia_plugin/src/ops/broadcast.hpp b/modules/nvidia_plugin/src/ops/broadcast.hpp index e59b1792c..cb89a06ec 100644 --- a/modules/nvidia_plugin/src/ops/broadcast.hpp +++ b/modules/nvidia_plugin/src/ops/broadcast.hpp @@ -27,7 +27,7 @@ class BroadcastOp : public OperationBase { WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::vector immutable_buffer_sizes_; diff --git a/modules/nvidia_plugin/src/ops/clamp_cuda.cpp b/modules/nvidia_plugin/src/ops/clamp_cuda.cpp index bb0b0a87a..9fa5d7e8c 100644 --- a/modules/nvidia_plugin/src/ops/clamp_cuda.cpp +++ b/modules/nvidia_plugin/src/ops/clamp_cuda.cpp @@ -51,7 +51,7 @@ void ClampCudaOp::Execute(const InferenceRequestContext& context, (*kernel_)(context.getThreadContext().stream().get(), inputTensors[0].get(), outputTensors[0].get()); } -bool ClampCudaOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ClampCudaOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } } // namespace nvidia_gpu } // namespace ov diff --git a/modules/nvidia_plugin/src/ops/clamp_cuda.hpp b/modules/nvidia_plugin/src/ops/clamp_cuda.hpp index 61ee4153e..78baa91cc 100644 --- a/modules/nvidia_plugin/src/ops/clamp_cuda.hpp +++ b/modules/nvidia_plugin/src/ops/clamp_cuda.hpp @@ -26,7 +26,7 @@ class ClampCudaOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/clamp_cudnn.cpp b/modules/nvidia_plugin/src/ops/clamp_cudnn.cpp index 854ce8a77..7188bce83 100644 --- a/modules/nvidia_plugin/src/ops/clamp_cudnn.cpp +++ b/modules/nvidia_plugin/src/ops/clamp_cudnn.cpp @@ -97,7 +97,7 @@ void ClampCuDnnOp::Execute(const InferenceRequestContext& context, outputTensors[0].get()); } -bool ClampCuDnnOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ClampCuDnnOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void ClampCuDnnOp::InitSharedImmutableWorkbuffers(const Buffers& buffers) { switch (data_type_) { diff --git a/modules/nvidia_plugin/src/ops/clamp_cudnn.hpp b/modules/nvidia_plugin/src/ops/clamp_cudnn.hpp index 544ba081d..2a754af8f 100644 --- a/modules/nvidia_plugin/src/ops/clamp_cudnn.hpp +++ b/modules/nvidia_plugin/src/ops/clamp_cudnn.hpp @@ -33,7 +33,7 @@ class ClampCuDnnOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/comparison.cpp b/modules/nvidia_plugin/src/ops/comparison.cpp index 3f9bf2ea0..e96f9b4f0 100644 --- a/modules/nvidia_plugin/src/ops/comparison.cpp +++ 
b/modules/nvidia_plugin/src/ops/comparison.cpp @@ -84,7 +84,7 @@ Comparison::Comparison(const CreationContext& context, threads_per_block}; } -bool Comparison::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility Comparison::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void Comparison::Execute(const InferenceRequestContext& context, Inputs inputs, diff --git a/modules/nvidia_plugin/src/ops/comparison.hpp b/modules/nvidia_plugin/src/ops/comparison.hpp index 64b73b0a3..c1de21a91 100644 --- a/modules/nvidia_plugin/src/ops/comparison.hpp +++ b/modules/nvidia_plugin/src/ops/comparison.hpp @@ -18,7 +18,7 @@ class Comparison : public OperationBase { IndexCollection&& outputIds, kernel::Comparison::Op_t operation_type); - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: void calculateOffsets(); diff --git a/modules/nvidia_plugin/src/ops/concat.cpp b/modules/nvidia_plugin/src/ops/concat.cpp index 8b1b1bd2a..f0276b0b4 100644 --- a/modules/nvidia_plugin/src/ops/concat.cpp +++ b/modules/nvidia_plugin/src/ops/concat.cpp @@ -95,7 +95,7 @@ void ConcatOp::Execute(const InferenceRequestContext& context, outputs[0].get()); } -bool ConcatOp::IsCudaGraphCompatible() const { return false; } +CudaGraphCompatibility ConcatOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::NONE; } OPERATION_REGISTER(ConcatOp, Concat); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/concat.hpp b/modules/nvidia_plugin/src/ops/concat.hpp index 223e9e337..566edb231 100644 --- a/modules/nvidia_plugin/src/ops/concat.hpp +++ b/modules/nvidia_plugin/src/ops/concat.hpp @@ -28,7 +28,7 @@ class ConcatOp : public OperationBase { WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers&) override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: size_t immutableWbSize() const { return concat_kernel_.value().immutableWbSize(); } diff --git a/modules/nvidia_plugin/src/ops/convert.cpp b/modules/nvidia_plugin/src/ops/convert.cpp index c27d141a5..502ac9b11 100644 --- a/modules/nvidia_plugin/src/ops/convert.cpp +++ b/modules/nvidia_plugin/src/ops/convert.cpp @@ -55,7 +55,7 @@ void ConvertOp::Execute(const InferenceRequestContext& context, (*convert_kernel_)(stream.get(), outputs[0].get(), inputs[0].get()); } -bool ConvertOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ConvertOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(ConvertOp, Convert); diff --git a/modules/nvidia_plugin/src/ops/convert.hpp b/modules/nvidia_plugin/src/ops/convert.hpp index 471a27351..c1ac63f14 100644 --- a/modules/nvidia_plugin/src/ops/convert.hpp +++ b/modules/nvidia_plugin/src/ops/convert.hpp @@ -24,7 +24,7 @@ class ConvertOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; using Type_t = ov::element::Type_t; using convert_t = void (*)( diff --git a/modules/nvidia_plugin/src/ops/convert_color_i420.hpp b/modules/nvidia_plugin/src/ops/convert_color_i420.hpp index f9c4e9602..58ff845da 100644 --- a/modules/nvidia_plugin/src/ops/convert_color_i420.hpp +++ b/modules/nvidia_plugin/src/ops/convert_color_i420.hpp @@ -91,7 +91,7 @@ class I420ConvertColorBase : 
public OperationBase { } } - bool IsCudaGraphCompatible() const override { return true; } + CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::FULL; } private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/convert_color_nv12.hpp b/modules/nvidia_plugin/src/ops/convert_color_nv12.hpp index 20168f6d7..abb5aa5a2 100644 --- a/modules/nvidia_plugin/src/ops/convert_color_nv12.hpp +++ b/modules/nvidia_plugin/src/ops/convert_color_nv12.hpp @@ -90,7 +90,7 @@ class NV12ConvertColorBase : public OperationBase { } } - bool IsCudaGraphCompatible() const override { return true; } + CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::FULL; } private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/convolution_backprop_data.cpp b/modules/nvidia_plugin/src/ops/convolution_backprop_data.cpp index 59d0aa762..f23ff2ed9 100644 --- a/modules/nvidia_plugin/src/ops/convolution_backprop_data.cpp +++ b/modules/nvidia_plugin/src/ops/convolution_backprop_data.cpp @@ -43,8 +43,8 @@ void ConvBackpropDataOp::Execute(const InferenceRequestContext& context, } template -bool ConvBackpropDataOp::IsCudaGraphCompatible() const { - return true; +CudaGraphCompatibility ConvBackpropDataOp::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(ConvolutionBackpropDataOp, ConvolutionBackpropData); diff --git a/modules/nvidia_plugin/src/ops/convolution_backprop_data.hpp b/modules/nvidia_plugin/src/ops/convolution_backprop_data.hpp index 213195d75..e761825bf 100644 --- a/modules/nvidia_plugin/src/ops/convolution_backprop_data.hpp +++ b/modules/nvidia_plugin/src/ops/convolution_backprop_data.hpp @@ -32,7 +32,7 @@ class ConvBackpropDataOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/convolution_cudnn.cpp b/modules/nvidia_plugin/src/ops/convolution_cudnn.cpp index 66d25ab31..a5a7484ec 100644 --- a/modules/nvidia_plugin/src/ops/convolution_cudnn.cpp +++ b/modules/nvidia_plugin/src/ops/convolution_cudnn.cpp @@ -44,7 +44,7 @@ void ConvolutionCuDnn::Execute(const InferenceRequestContext& context, throwIfError(status); } -bool ConvolutionCuDnn::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ConvolutionCuDnn::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest ConvolutionCuDnn::GetWorkBufferRequest() const { if (descs_.Algo().memory != 0) diff --git a/modules/nvidia_plugin/src/ops/convolution_cudnn.hpp b/modules/nvidia_plugin/src/ops/convolution_cudnn.hpp index abe858d98..3e0f63519 100644 --- a/modules/nvidia_plugin/src/ops/convolution_cudnn.hpp +++ b/modules/nvidia_plugin/src/ops/convolution_cudnn.hpp @@ -29,7 +29,7 @@ class ConvolutionCuDnn : public OperationCuDnn { WorkbufferRequest GetWorkBufferRequest() const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: Convolution::Details::ConvolutionDescriptorsCuDnn descs_; diff --git a/modules/nvidia_plugin/src/ops/convolution_cudnn_be.cpp b/modules/nvidia_plugin/src/ops/convolution_cudnn_be.cpp index d4557a936..d0ac64568 100644 --- 
a/modules/nvidia_plugin/src/ops/convolution_cudnn_be.cpp +++ b/modules/nvidia_plugin/src/ops/convolution_cudnn_be.cpp @@ -148,7 +148,7 @@ void ConvolutionCuDnnBE::Execute(const InferenceRequestContext& context, throwIfError(::cudnnBackendExecute(context.getThreadContext().dnnHandle().get(), plan->get(), variantPack->get())); } -bool ConvolutionCuDnnBE::IsCudaGraphCompatible() const { return false; } +CudaGraphCompatibility ConvolutionCuDnnBE::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::NONE; } std::shared_ptr ConvolutionCuDnnBE::MakeTensorDescriptor(int64_t id, cudnnDataType_t element_type, diff --git a/modules/nvidia_plugin/src/ops/convolution_cudnn_be.hpp b/modules/nvidia_plugin/src/ops/convolution_cudnn_be.hpp index 1a68d8560..ac348e28f 100644 --- a/modules/nvidia_plugin/src/ops/convolution_cudnn_be.hpp +++ b/modules/nvidia_plugin/src/ops/convolution_cudnn_be.hpp @@ -33,7 +33,7 @@ class ConvolutionCuDnnBE : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.cpp b/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.cpp index d40e301e5..0903adae1 100644 --- a/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.cpp +++ b/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.cpp @@ -166,7 +166,7 @@ void CuDnnTensorOpBase::Execute(const InferenceRequestContext& context, outputTensors[0].get()); } -bool CuDnnTensorOpBase::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility CuDnnTensorOpBase::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } CuDnnTensorOpBase::IoParams::IoParams(const ov::Node& node, const Type& io_type, int index) : type_(convertDataType(io_type == Type::INPUT ? 
node.get_input_element_type(index) diff --git a/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.hpp b/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.hpp index 0dce5eaf5..f9fc14181 100644 --- a/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.hpp +++ b/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.hpp @@ -24,7 +24,7 @@ class CuDnnTensorOpBase : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: struct IoParams { diff --git a/modules/nvidia_plugin/src/ops/detection_output.cpp b/modules/nvidia_plugin/src/ops/detection_output.cpp index 20fff51cd..3418b552a 100644 --- a/modules/nvidia_plugin/src/ops/detection_output.cpp +++ b/modules/nvidia_plugin/src/ops/detection_output.cpp @@ -107,7 +107,7 @@ void DetectionOutputOp::Execute(const InferenceRequestContext& context, } } -bool DetectionOutputOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility DetectionOutputOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void DetectionOutputOp::InitSharedImmutableWorkbuffers(const Buffers& buffers) { kernel_.value().initSharedImmutableWorkbuffers(buffers); diff --git a/modules/nvidia_plugin/src/ops/detection_output.hpp b/modules/nvidia_plugin/src/ops/detection_output.hpp index 5a29d95e2..f2e65303f 100644 --- a/modules/nvidia_plugin/src/ops/detection_output.hpp +++ b/modules/nvidia_plugin/src/ops/detection_output.hpp @@ -25,7 +25,7 @@ class DetectionOutputOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/elementwise_binary.hpp b/modules/nvidia_plugin/src/ops/elementwise_binary.hpp index 3ada107a8..e6ed7454e 100644 --- a/modules/nvidia_plugin/src/ops/elementwise_binary.hpp +++ b/modules/nvidia_plugin/src/ops/elementwise_binary.hpp @@ -59,7 +59,7 @@ class ElementwiseBinaryOp : public OperationBase { static_cast(outputTensors[0].get())); } - bool IsCudaGraphCompatible() const override { return true; } + CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::FULL; } void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) override { in0_broadcast_params_->initWorkbuffers(buffers); diff --git a/modules/nvidia_plugin/src/ops/elementwise_unary.hpp b/modules/nvidia_plugin/src/ops/elementwise_unary.hpp index bbdac1cfb..0f99c41ea 100644 --- a/modules/nvidia_plugin/src/ops/elementwise_unary.hpp +++ b/modules/nvidia_plugin/src/ops/elementwise_unary.hpp @@ -46,7 +46,7 @@ class ElementwiseUnaryOp : public OperationBase { (*kernel_)(stream.get(), inputTensors[0].get(), outputTensors[0].get()); } - bool IsCudaGraphCompatible() const override { return true; } + CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::FULL; } private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/elu.cpp b/modules/nvidia_plugin/src/ops/elu.cpp index a747c3eb0..0d87bd6cf 100644 --- a/modules/nvidia_plugin/src/ops/elu.cpp +++ b/modules/nvidia_plugin/src/ops/elu.cpp @@ -45,7 +45,7 @@ void EluOp::Execute(const InferenceRequestContext& context, (*kernel_)(stream.get(), 
inputTensors[0].get(), outputTensors[0].get()); } -bool EluOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility EluOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(EluOp, Elu); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/elu.hpp b/modules/nvidia_plugin/src/ops/elu.hpp index 57e9f1ea2..16ddaac4c 100644 --- a/modules/nvidia_plugin/src/ops/elu.hpp +++ b/modules/nvidia_plugin/src/ops/elu.hpp @@ -23,7 +23,7 @@ class EluOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/fake_quantize.cpp b/modules/nvidia_plugin/src/ops/fake_quantize.cpp index aa1a8bee9..2752fd083 100644 --- a/modules/nvidia_plugin/src/ops/fake_quantize.cpp +++ b/modules/nvidia_plugin/src/ops/fake_quantize.cpp @@ -45,7 +45,7 @@ FakeQuantizeOp::FakeQuantizeOp(const CreationContext &context, convertDataType(element_type), output_size, max_threads_per_block, levels}; } -bool FakeQuantizeOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility FakeQuantizeOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void FakeQuantizeOp::Execute(const InferenceRequestContext &context, Inputs inputTensors, diff --git a/modules/nvidia_plugin/src/ops/fake_quantize.hpp b/modules/nvidia_plugin/src/ops/fake_quantize.hpp index a054f4520..9de32fc45 100644 --- a/modules/nvidia_plugin/src/ops/fake_quantize.hpp +++ b/modules/nvidia_plugin/src/ops/fake_quantize.hpp @@ -20,7 +20,7 @@ class FakeQuantizeOp : public OperationBase { IndexCollection&& inputIds, IndexCollection&& outputIds); - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: void Execute(const InferenceRequestContext& context, diff --git a/modules/nvidia_plugin/src/ops/fully_connected.cpp b/modules/nvidia_plugin/src/ops/fully_connected.cpp index d8fc6d9d1..60adc1fae 100644 --- a/modules/nvidia_plugin/src/ops/fully_connected.cpp +++ b/modules/nvidia_plugin/src/ops/fully_connected.cpp @@ -54,7 +54,7 @@ void FullyConnectedOp::Execute(const InferenceRequestContext& context, matmul_op_.Execute(context, inputs.first(inputs.size() - 1), outputs, workbuffers); } -bool FullyConnectedOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility FullyConnectedOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(FullyConnectedOp, FullyConnected); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/fully_connected.hpp b/modules/nvidia_plugin/src/ops/fully_connected.hpp index c60e7d6ad..72e249e67 100644 --- a/modules/nvidia_plugin/src/ops/fully_connected.hpp +++ b/modules/nvidia_plugin/src/ops/fully_connected.hpp @@ -26,7 +26,7 @@ class FullyConnectedOp : public OperationCuBlas { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: MatMulOp matmul_op_; diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.cpp b/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.cpp index be671ac98..d50d1f499 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.cpp +++ 
b/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.cpp @@ -77,7 +77,9 @@ void FusedConvolutionBackpropDataOp::Execute(const InferenceRequestContext& cont outputs[ArgIndices3Ins::dinput].get())); } -bool FusedConvolutionBackpropDataOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility FusedConvolutionBackpropDataOp::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::FULL; +} void FusedConvolutionBackpropDataOp::InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) { OPENVINO_ASSERT(buffers.size() == 1, "Node name: ", GetName()); diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.hpp b/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.hpp index 8edfd8035..97625cf3d 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.hpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.hpp @@ -26,7 +26,7 @@ class FusedConvolutionBackpropDataOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.cpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.cpp index 10f117d74..86fcebb35 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.cpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.cpp @@ -96,7 +96,7 @@ void FusedConvolutionCuDnn::Execute(const InferenceRequestContext& context, outputs[ArgIndices::output].get())); } -bool FusedConvolutionCuDnn::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility FusedConvolutionCuDnn::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest FusedConvolutionCuDnn::GetWorkBufferRequest() const { if (conv_descs_->Algo().memory != 0) diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.hpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.hpp index ddf7c5d59..b9fc013da 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.hpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.hpp @@ -35,7 +35,7 @@ class FusedConvolutionCuDnn : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override {} WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.cpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.cpp index 9311177c9..541629fae 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.cpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.cpp @@ -327,7 +327,9 @@ void FusedConvolutionCuDnnBE::Execute(const InferenceRequestContext& context, throwIfError(::cudnnBackendExecute(context.getThreadContext().dnnHandle().get(), plan->get(), variantPack->get())); } -bool FusedConvolutionCuDnnBE::IsCudaGraphCompatible() const { return false; } +CudaGraphCompatibility FusedConvolutionCuDnnBE::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::NONE; +} std::shared_ptr FusedConvolutionCuDnnBE::MakeTensorDescriptor( int64_t id, diff --git 
a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.hpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.hpp index e2ddb74ab..910887819 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.hpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.hpp @@ -34,7 +34,7 @@ class FusedConvolutionCuDnnBE : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; private: diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.cpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.cpp index 088a41790..a4efd1ca7 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.cpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.cpp @@ -85,7 +85,9 @@ void FusedConvolutionCuDnnDecomposed::Execute(const InferenceRequestContext& con } } -bool FusedConvolutionCuDnnDecomposed::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility FusedConvolutionCuDnnDecomposed::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::FULL; +} WorkbufferRequest FusedConvolutionCuDnnDecomposed::GetWorkBufferRequest() const { if (conv_descs_->Algo().memory != 0) { diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.hpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.hpp index e92be760a..b426008ed 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.hpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.hpp @@ -37,7 +37,7 @@ class FusedConvolutionCuDnnDecomposed : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override {} WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/gather.cpp b/modules/nvidia_plugin/src/ops/gather.cpp index f666cc35e..b954135d3 100644 --- a/modules/nvidia_plugin/src/ops/gather.cpp +++ b/modules/nvidia_plugin/src/ops/gather.cpp @@ -178,7 +178,7 @@ void GatherOp::Execute(const InferenceRequestContext& context, (*gather_kernel_)(context.getThreadContext().stream().get(), inputs[0].get(), inputs[1].get(), outputs[0].get()); } -bool GatherOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility GatherOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(GatherOp, Gather); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/gather.hpp b/modules/nvidia_plugin/src/ops/gather.hpp index 6adb48be8..9753a8fc2 100644 --- a/modules/nvidia_plugin/src/ops/gather.hpp +++ b/modules/nvidia_plugin/src/ops/gather.hpp @@ -22,7 +22,7 @@ class GatherOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::optional gather_kernel_; diff --git a/modules/nvidia_plugin/src/ops/group_convolution.cpp b/modules/nvidia_plugin/src/ops/group_convolution.cpp index b381bb47f..280fa7337 100644 --- a/modules/nvidia_plugin/src/ops/group_convolution.cpp +++ 
b/modules/nvidia_plugin/src/ops/group_convolution.cpp @@ -25,7 +25,7 @@ void GroupConvolutionOp::Execute(const InferenceRequestContext &context, convolution_.Execute(context, inputTensors, outputTensors, buffers); } -bool GroupConvolutionOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility GroupConvolutionOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest GroupConvolutionOp::GetWorkBufferRequest() const { return convolution_.GetWorkBufferRequest(); } diff --git a/modules/nvidia_plugin/src/ops/group_convolution.hpp b/modules/nvidia_plugin/src/ops/group_convolution.hpp index f44ac6936..5b1bb3d1a 100644 --- a/modules/nvidia_plugin/src/ops/group_convolution.hpp +++ b/modules/nvidia_plugin/src/ops/group_convolution.hpp @@ -27,7 +27,7 @@ class GroupConvolutionOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override final; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override final; private: diff --git a/modules/nvidia_plugin/src/ops/gru_cell.cpp b/modules/nvidia_plugin/src/ops/gru_cell.cpp index b8bf2736d..43d0b0f47 100644 --- a/modules/nvidia_plugin/src/ops/gru_cell.cpp +++ b/modules/nvidia_plugin/src/ops/gru_cell.cpp @@ -61,7 +61,7 @@ void GRUCellOp::Execute(const InferenceRequestContext& context, nullptr); } -bool GRUCellOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility GRUCellOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void GRUCellOp::InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) { OPENVINO_ASSERT(buffers.size() == 1 || buffers.size() == 2, "Node name: ", GetName()); diff --git a/modules/nvidia_plugin/src/ops/gru_cell.hpp b/modules/nvidia_plugin/src/ops/gru_cell.hpp index 6e00cd5b4..2fcebcb0d 100644 --- a/modules/nvidia_plugin/src/ops/gru_cell.hpp +++ b/modules/nvidia_plugin/src/ops/gru_cell.hpp @@ -27,7 +27,7 @@ class GRUCellOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/gru_sequence.cpp b/modules/nvidia_plugin/src/ops/gru_sequence.cpp index 7317aef73..84a6893b4 100644 --- a/modules/nvidia_plugin/src/ops/gru_sequence.cpp +++ b/modules/nvidia_plugin/src/ops/gru_sequence.cpp @@ -19,7 +19,9 @@ GRUSequenceOp::GRUSequenceOp(const CreationContext& context, : OperationCuDnn(context, node, std::move(inputIds), std::move(outputIds)), params_{node}, descs_{context, params_, config()}, - is_cuda_graph_compatible_{RNN::Details::isRNNSequenceCudaGraphCompatible(context.device())} { + graph_compatibility_{RNN::Details::isRNNSequenceCudaGraphCompatible(context.device()) + ? 
CudaGraphCompatibility::FULL + : CudaGraphCompatibility::NONE} { ib_seq_lengths_.addRequest(immut_sizes_, descs_.seqLengthArraySizeBytes()); ib_weight_space_.addRequest(immut_sizes_, descs_.weightSpaceSize()); @@ -71,7 +73,7 @@ void GRUSequenceOp::Execute(const InferenceRequestContext& context, nullptr); } -bool GRUSequenceOp::IsCudaGraphCompatible() const { return is_cuda_graph_compatible_; } +CudaGraphCompatibility GRUSequenceOp::GetCudaGraphCompatibility() const { return graph_compatibility_; } void GRUSequenceOp::InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) { descs_.initDevSeqLengthArray(CUDA::DevicePointer{ib_seq_lengths_.requiredPtr(buffers)}); diff --git a/modules/nvidia_plugin/src/ops/gru_sequence.hpp b/modules/nvidia_plugin/src/ops/gru_sequence.hpp index 6b2335901..00153a193 100644 --- a/modules/nvidia_plugin/src/ops/gru_sequence.hpp +++ b/modules/nvidia_plugin/src/ops/gru_sequence.hpp @@ -32,7 +32,7 @@ class GRUSequenceOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: static Config config(); @@ -50,7 +50,7 @@ class GRUSequenceOp : public OperationCuDnn { WorkbufferDesc ib_weight_space_; WorkbufferDesc mb_work_space_; - bool is_cuda_graph_compatible_; + CudaGraphCompatibility graph_compatibility_; }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/interpolate_cubic.cpp b/modules/nvidia_plugin/src/ops/interpolate_cubic.cpp index 0fc3cfe91..a89764cee 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_cubic.cpp +++ b/modules/nvidia_plugin/src/ops/interpolate_cubic.cpp @@ -68,7 +68,7 @@ void InterpolateCubicOp::Execute(const InferenceRequestContext& context, (*interpolate_)(context.getThreadContext().stream().get(), inputs[0].get(), outputs[0].get()); } -bool InterpolateCubicOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility InterpolateCubicOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest InterpolateCubicOp::GetWorkBufferRequest() const { return {interpolate_->immutableWorkbufferSizes(), {}}; diff --git a/modules/nvidia_plugin/src/ops/interpolate_cubic.hpp b/modules/nvidia_plugin/src/ops/interpolate_cubic.hpp index 32e06cc00..ee5348657 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_cubic.hpp +++ b/modules/nvidia_plugin/src/ops/interpolate_cubic.hpp @@ -25,7 +25,7 @@ class InterpolateCubicOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/interpolate_linear.cpp b/modules/nvidia_plugin/src/ops/interpolate_linear.cpp index b19d1228f..2c70981ed 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_linear.cpp +++ b/modules/nvidia_plugin/src/ops/interpolate_linear.cpp @@ -69,7 +69,7 @@ void InterpolateLinearOp::Execute(const InferenceRequestContext& context, (*interpolate_)(context.getThreadContext().stream().get(), inputs[0].get(), outputs[0].get()); } -bool InterpolateLinearOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility InterpolateLinearOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest 
InterpolateLinearOp::GetWorkBufferRequest() const { return {interpolate_->immutableWorkbufferSizes(), {}}; diff --git a/modules/nvidia_plugin/src/ops/interpolate_linear.hpp b/modules/nvidia_plugin/src/ops/interpolate_linear.hpp index eb5e6539a..22b911040 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_linear.hpp +++ b/modules/nvidia_plugin/src/ops/interpolate_linear.hpp @@ -25,7 +25,7 @@ class InterpolateLinearOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/interpolate_nearest.cpp b/modules/nvidia_plugin/src/ops/interpolate_nearest.cpp index 89caf4b47..6367e38fc 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_nearest.cpp +++ b/modules/nvidia_plugin/src/ops/interpolate_nearest.cpp @@ -158,7 +158,7 @@ void InterpolateNearestOp::Execute(const InferenceRequestContext& context, dst); } -bool InterpolateNearestOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility InterpolateNearestOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } template static auto size_in_bytes(const std::vector& v) noexcept { diff --git a/modules/nvidia_plugin/src/ops/interpolate_nearest.hpp b/modules/nvidia_plugin/src/ops/interpolate_nearest.hpp index 311b09177..487ef3829 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_nearest.hpp +++ b/modules/nvidia_plugin/src/ops/interpolate_nearest.hpp @@ -26,7 +26,7 @@ class InterpolateNearestOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/logical_not.cpp b/modules/nvidia_plugin/src/ops/logical_not.cpp index 97d121e75..456955343 100644 --- a/modules/nvidia_plugin/src/ops/logical_not.cpp +++ b/modules/nvidia_plugin/src/ops/logical_not.cpp @@ -35,7 +35,7 @@ void LogicalNotOp::Execute(const InferenceRequestContext& context, throwIfError(cudaPeekAtLastError()); } -bool LogicalNotOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility LogicalNotOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(LogicalNotOp, LogicalNot); diff --git a/modules/nvidia_plugin/src/ops/logical_not.hpp b/modules/nvidia_plugin/src/ops/logical_not.hpp index 8c95dc415..681737c7d 100644 --- a/modules/nvidia_plugin/src/ops/logical_not.hpp +++ b/modules/nvidia_plugin/src/ops/logical_not.hpp @@ -21,7 +21,7 @@ class LogicalNotOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: kernel::LogicalNot kernel_; diff --git a/modules/nvidia_plugin/src/ops/lstm_cell.cpp b/modules/nvidia_plugin/src/ops/lstm_cell.cpp index b53232793..5d7f45cb4 100644 --- a/modules/nvidia_plugin/src/ops/lstm_cell.cpp +++ b/modules/nvidia_plugin/src/ops/lstm_cell.cpp @@ -57,7 +57,7 @@ void LSTMCellOp::Execute(const InferenceRequestContext& context, nullptr); } -bool 
LSTMCellOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility LSTMCellOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void LSTMCellOp::InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) { OPENVINO_ASSERT(buffers.size() == 1 || buffers.size() == 2, "Node name: ", GetName()); diff --git a/modules/nvidia_plugin/src/ops/lstm_cell.hpp b/modules/nvidia_plugin/src/ops/lstm_cell.hpp index b36a4b36b..348b92116 100644 --- a/modules/nvidia_plugin/src/ops/lstm_cell.hpp +++ b/modules/nvidia_plugin/src/ops/lstm_cell.hpp @@ -27,7 +27,7 @@ class LSTMCellOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/lstm_sequence_base.cpp b/modules/nvidia_plugin/src/ops/lstm_sequence_base.cpp index 5873378cc..ab5e35a5d 100644 --- a/modules/nvidia_plugin/src/ops/lstm_sequence_base.cpp +++ b/modules/nvidia_plugin/src/ops/lstm_sequence_base.cpp @@ -20,7 +20,9 @@ LSTMSequenceOpBase::LSTMSequenceOpBase(const CreationContext& context, : OperationCuDnn(context, node, std::move(inputIds), std::move(outputIds)), params_{params}, descs_{context, params_, config}, - is_cuda_graph_compatible_{RNN::Details::isRNNSequenceCudaGraphCompatible(context.device())} { + graph_compatibility_{RNN::Details::isRNNSequenceCudaGraphCompatible(context.device()) + ? CudaGraphCompatibility::FULL + : CudaGraphCompatibility::NONE} { ib_seq_lengths_.addRequest(immut_sizes_, descs_.seqLengthArraySizeBytes()); ib_weight_space_.addRequest(immut_sizes_, descs_.weightSpaceSize()); @@ -76,7 +78,7 @@ void LSTMSequenceOpBase::Execute(const InferenceRequestContext& context, if (cy_adapter) cy_adapter->execute(context, mb, outputs[ArgIndices::cell_output]); } -bool LSTMSequenceOpBase::IsCudaGraphCompatible() const { return is_cuda_graph_compatible_; } +CudaGraphCompatibility LSTMSequenceOpBase::GetCudaGraphCompatibility() const { return graph_compatibility_; } void LSTMSequenceOpBase::InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) { descs_.initDevSeqLengthArray(CUDA::DevicePointer{ib_seq_lengths_.requiredPtr(buffers)}); diff --git a/modules/nvidia_plugin/src/ops/lstm_sequence_base.hpp b/modules/nvidia_plugin/src/ops/lstm_sequence_base.hpp index 6046c1753..6a459cf45 100644 --- a/modules/nvidia_plugin/src/ops/lstm_sequence_base.hpp +++ b/modules/nvidia_plugin/src/ops/lstm_sequence_base.hpp @@ -30,7 +30,7 @@ class LSTMSequenceOpBase : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override; WorkbufferRequest GetWorkBufferRequest() const override; @@ -59,7 +59,7 @@ class LSTMSequenceOpBase : public OperationCuDnn { OutputTensorAdapterPtr cy_adapter; private: - bool is_cuda_graph_compatible_; + CudaGraphCompatibility graph_compatibility_; }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/matmul.cpp b/modules/nvidia_plugin/src/ops/matmul.cpp index 85f97adad..52de989f1 100644 --- a/modules/nvidia_plugin/src/ops/matmul.cpp +++ b/modules/nvidia_plugin/src/ops/matmul.cpp @@ -226,7 +226,7 @@ void 
MatMulOp::Execute(const InferenceRequestContext& context, CUBLAS_GEMM_DEFAULT)); } -bool MatMulOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility MatMulOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(MatMulOp, MatMul); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/matmul.hpp b/modules/nvidia_plugin/src/ops/matmul.hpp index 10dbe2884..d30b46bfa 100644 --- a/modules/nvidia_plugin/src/ops/matmul.hpp +++ b/modules/nvidia_plugin/src/ops/matmul.hpp @@ -29,7 +29,7 @@ class MatMulOp : public OperationCuBlas { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; int GetBatchCount() const { return batch_count_; } diff --git a/modules/nvidia_plugin/src/ops/maxpool.cpp b/modules/nvidia_plugin/src/ops/maxpool.cpp index f6a696112..1b9a5c58d 100644 --- a/modules/nvidia_plugin/src/ops/maxpool.cpp +++ b/modules/nvidia_plugin/src/ops/maxpool.cpp @@ -30,7 +30,7 @@ void MaxPoolOp::Execute(const InferenceRequestContext& context, outputs[PoolingImpl::output_index].get()); } -bool MaxPoolOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility MaxPoolOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(MaxPoolOp, MaxPool); diff --git a/modules/nvidia_plugin/src/ops/maxpool.hpp b/modules/nvidia_plugin/src/ops/maxpool.hpp index a43554c42..b12e39525 100644 --- a/modules/nvidia_plugin/src/ops/maxpool.hpp +++ b/modules/nvidia_plugin/src/ops/maxpool.hpp @@ -23,7 +23,7 @@ class MaxPoolOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: PoolingImpl impl_; diff --git a/modules/nvidia_plugin/src/ops/mvn.cpp b/modules/nvidia_plugin/src/ops/mvn.cpp index 365ef9d69..d05a2f7a5 100644 --- a/modules/nvidia_plugin/src/ops/mvn.cpp +++ b/modules/nvidia_plugin/src/ops/mvn.cpp @@ -99,7 +99,7 @@ void MvnOp::Execute(const InferenceRequestContext& context, {tensor_desc_, outputTensors[0]}); } -bool MvnOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility MvnOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void MvnOp::Context::reduceMean(ConstTensor input, Tensor output) { context.getThreadContext().dnnHandle().reduceTensor(op.reduce_mean_desc_, diff --git a/modules/nvidia_plugin/src/ops/mvn.hpp b/modules/nvidia_plugin/src/ops/mvn.hpp index bb2c3e228..9420a42ea 100644 --- a/modules/nvidia_plugin/src/ops/mvn.hpp +++ b/modules/nvidia_plugin/src/ops/mvn.hpp @@ -25,7 +25,7 @@ class MvnOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; private: diff --git a/modules/nvidia_plugin/src/ops/nop_op.hpp b/modules/nvidia_plugin/src/ops/nop_op.hpp index dcb555fb3..a734d7351 100644 --- a/modules/nvidia_plugin/src/ops/nop_op.hpp +++ b/modules/nvidia_plugin/src/ops/nop_op.hpp @@ -39,7 +39,7 @@ class NopOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override {} - bool IsCudaGraphCompatible() const override { return true; } + CudaGraphCompatibility GetCudaGraphCompatibility() const 
override { return CudaGraphCompatibility::FULL; } }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/pad.cpp b/modules/nvidia_plugin/src/ops/pad.cpp index 6bf760670..da33feaca 100644 --- a/modules/nvidia_plugin/src/ops/pad.cpp +++ b/modules/nvidia_plugin/src/ops/pad.cpp @@ -58,7 +58,7 @@ void PadOp::Execute(const InferenceRequestContext& context, inputTensors[InputIndex::kPadValue].get()); } -bool PadOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility PadOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest PadOp::GetWorkBufferRequest() const { auto rank = src_shape_.size(); diff --git a/modules/nvidia_plugin/src/ops/pad.hpp b/modules/nvidia_plugin/src/ops/pad.hpp index c819118c2..846a65feb 100644 --- a/modules/nvidia_plugin/src/ops/pad.hpp +++ b/modules/nvidia_plugin/src/ops/pad.hpp @@ -23,7 +23,7 @@ class PadOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers&) override; diff --git a/modules/nvidia_plugin/src/ops/parameter.cpp b/modules/nvidia_plugin/src/ops/parameter.cpp index 54c5dbe14..233c8dec0 100644 --- a/modules/nvidia_plugin/src/ops/parameter.cpp +++ b/modules/nvidia_plugin/src/ops/parameter.cpp @@ -32,7 +32,7 @@ void ParameterOp::Execute(const InferenceRequestContext& context, context.getThreadContext().stream().upload(outputs[0], tensor->data(), tensor->get_byte_size()); } -bool ParameterOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ParameterOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } std::string ParameterOp::GetInputTensorName(const ov::Node& node) { return node.get_friendly_name(); } diff --git a/modules/nvidia_plugin/src/ops/parameter.hpp b/modules/nvidia_plugin/src/ops/parameter.hpp index 4cbbe40dc..decd83aff 100644 --- a/modules/nvidia_plugin/src/ops/parameter.hpp +++ b/modules/nvidia_plugin/src/ops/parameter.hpp @@ -27,7 +27,7 @@ class ParameterOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; static std::string GetInputTensorName(const ov::Node& node); private: diff --git a/modules/nvidia_plugin/src/ops/range.cpp b/modules/nvidia_plugin/src/ops/range.cpp index f8df8a09a..df08e04d4 100644 --- a/modules/nvidia_plugin/src/ops/range.cpp +++ b/modules/nvidia_plugin/src/ops/range.cpp @@ -64,7 +64,7 @@ void RangeOp::Execute(const InferenceRequestContext& context, outputs[OUTPUT_INDX].get()); } -bool RangeOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility RangeOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(RangeOp, Range); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/range.hpp b/modules/nvidia_plugin/src/ops/range.hpp index 89da68a6a..67a90f15c 100644 --- a/modules/nvidia_plugin/src/ops/range.hpp +++ b/modules/nvidia_plugin/src/ops/range.hpp @@ -26,7 +26,7 @@ class RangeOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: size_t output_size_; 
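Note on the change applied by the hunks above and below: every operation's boolean IsCudaGraphCompatible() hook is replaced by GetCudaGraphCompatibility(), which reports one of the three CudaGraphCompatibility values used later in this series (FULL, NONE, SPECIAL). The following is only a minimal illustrative sketch of how such a tri-state value can be folded over an execution sequence; it is not part of the patch. The enum name and its enumerators come from the series, while the aggregate() helper and the enumerator order are assumptions, mirroring the SubGraph::GetCudaGraphCompatibility() hunk that appears further down.

// Hypothetical sketch, not part of the patch: NONE wins over everything,
// SPECIAL wins over FULL, FULL is the default when nothing objects.
#include <initializer_list>

enum class CudaGraphCompatibility { NONE, FULL, SPECIAL };  // enumerator order assumed

CudaGraphCompatibility aggregate(std::initializer_list<CudaGraphCompatibility> ops) {
    auto result = CudaGraphCompatibility::FULL;
    for (auto c : ops) {
        if (c == CudaGraphCompatibility::SPECIAL) {
            result = c;                           // e.g. TensorIterator: needs its own graph handling
        } else if (c == CudaGraphCompatibility::NONE) {
            return CudaGraphCompatibility::NONE;  // a single incompatible op disables graph capture
        }
    }
    return result;
}

// Usage: aggregate({FULL, SPECIAL, FULL}) yields SPECIAL; aggregate({FULL, NONE}) yields NONE.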
diff --git a/modules/nvidia_plugin/src/ops/reduce.cpp b/modules/nvidia_plugin/src/ops/reduce.cpp index d57d75f17..36b7df0ff 100644 --- a/modules/nvidia_plugin/src/ops/reduce.cpp +++ b/modules/nvidia_plugin/src/ops/reduce.cpp @@ -58,7 +58,7 @@ void ReduceOp::Execute(const InferenceRequestContext& context, outputTensors[0]); } -bool ReduceOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ReduceOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } } // namespace nvidia_gpu } // namespace ov diff --git a/modules/nvidia_plugin/src/ops/reduce.hpp b/modules/nvidia_plugin/src/ops/reduce.hpp index 35d9b4822..275eefbd3 100644 --- a/modules/nvidia_plugin/src/ops/reduce.hpp +++ b/modules/nvidia_plugin/src/ops/reduce.hpp @@ -22,7 +22,7 @@ class ReduceOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; static cudnnDataType_t reduceCompType(const ov::Node& node); diff --git a/modules/nvidia_plugin/src/ops/result.cpp b/modules/nvidia_plugin/src/ops/result.cpp index 7f7aaf757..95b0f2672 100644 --- a/modules/nvidia_plugin/src/ops/result.cpp +++ b/modules/nvidia_plugin/src/ops/result.cpp @@ -43,7 +43,7 @@ void ResultOp::Execute(const InferenceRequestContext& context, context.getThreadContext().stream().download(tensor->data(), inputs[0], tensor->get_byte_size()); } -bool ResultOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ResultOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } std::optional ResultOp::GetOutputTensorSubIndex(const ov::Output& node) { const auto& opRegistry = OperationRegistry::getInstance(); diff --git a/modules/nvidia_plugin/src/ops/result.hpp b/modules/nvidia_plugin/src/ops/result.hpp index 7e66794a2..275074583 100644 --- a/modules/nvidia_plugin/src/ops/result.hpp +++ b/modules/nvidia_plugin/src/ops/result.hpp @@ -29,7 +29,7 @@ class ResultOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; static std::vector GetOutputTensorName(const ov::op::v0::Result& node); diff --git a/modules/nvidia_plugin/src/ops/round.cpp b/modules/nvidia_plugin/src/ops/round.cpp index 4edd12e55..9fa98f216 100644 --- a/modules/nvidia_plugin/src/ops/round.cpp +++ b/modules/nvidia_plugin/src/ops/round.cpp @@ -48,7 +48,7 @@ void RoundOp::Execute(const InferenceRequestContext& context, (*kernel_)(context.getThreadContext().stream().get(), inputTensors[0].get(), outputTensors[0].get()); } -bool RoundOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility RoundOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(RoundOp, Round); diff --git a/modules/nvidia_plugin/src/ops/round.hpp b/modules/nvidia_plugin/src/ops/round.hpp index 86762f3cc..baa62db95 100644 --- a/modules/nvidia_plugin/src/ops/round.hpp +++ b/modules/nvidia_plugin/src/ops/round.hpp @@ -25,7 +25,7 @@ class RoundOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::optional kernel_; diff --git 
a/modules/nvidia_plugin/src/ops/scatter_nd_update.cpp b/modules/nvidia_plugin/src/ops/scatter_nd_update.cpp index bd5b72eb4..bb00b459a 100644 --- a/modules/nvidia_plugin/src/ops/scatter_nd_update.cpp +++ b/modules/nvidia_plugin/src/ops/scatter_nd_update.cpp @@ -100,7 +100,7 @@ void ScatterNDUpdateOp::Execute(const InferenceRequestContext& context, outputs[0].get()); } -bool ScatterNDUpdateOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ScatterNDUpdateOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } template static auto size_in_bytes(const std::vector& v) noexcept { diff --git a/modules/nvidia_plugin/src/ops/scatter_nd_update.hpp b/modules/nvidia_plugin/src/ops/scatter_nd_update.hpp index 394413f0f..778a16c55 100644 --- a/modules/nvidia_plugin/src/ops/scatter_nd_update.hpp +++ b/modules/nvidia_plugin/src/ops/scatter_nd_update.hpp @@ -22,7 +22,7 @@ class ScatterNDUpdateOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/select.cpp b/modules/nvidia_plugin/src/ops/select.cpp index c8dc04f01..708b57c74 100644 --- a/modules/nvidia_plugin/src/ops/select.cpp +++ b/modules/nvidia_plugin/src/ops/select.cpp @@ -90,7 +90,7 @@ void SelectOp::Execute(const InferenceRequestContext& context, outputs[0].get()); } -bool SelectOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility SelectOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest SelectOp::GetWorkBufferRequest() const { return {std::vector(SIZES + 1, kOffsetBufferSize), {}}; diff --git a/modules/nvidia_plugin/src/ops/select.hpp b/modules/nvidia_plugin/src/ops/select.hpp index 61287a269..8eb0e800e 100644 --- a/modules/nvidia_plugin/src/ops/select.hpp +++ b/modules/nvidia_plugin/src/ops/select.hpp @@ -26,7 +26,7 @@ class SelectOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/softmax.cpp b/modules/nvidia_plugin/src/ops/softmax.cpp index e3ecedaa2..0019cd7db 100644 --- a/modules/nvidia_plugin/src/ops/softmax.cpp +++ b/modules/nvidia_plugin/src/ops/softmax.cpp @@ -192,7 +192,7 @@ void SoftmaxOp::Execute(const InferenceRequestContext& context, outputs[0].get())); } -bool SoftmaxOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility SoftmaxOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(SoftmaxOp, Softmax); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/softmax.hpp b/modules/nvidia_plugin/src/ops/softmax.hpp index 608e1e657..abafaed63 100644 --- a/modules/nvidia_plugin/src/ops/softmax.hpp +++ b/modules/nvidia_plugin/src/ops/softmax.hpp @@ -27,7 +27,7 @@ class SoftmaxOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: void 
mapRankAxis(const ov::Shape& shape, int axis); diff --git a/modules/nvidia_plugin/src/ops/split.cpp b/modules/nvidia_plugin/src/ops/split.cpp index b53aeadce..b5f66cbad 100644 --- a/modules/nvidia_plugin/src/ops/split.cpp +++ b/modules/nvidia_plugin/src/ops/split.cpp @@ -89,7 +89,7 @@ void SplitOp::Execute(const InferenceRequestContext& context, (*split_kernel_)(stream.get(), reinterpret_cast(in.get()), reinterpret_cast(outputPtrs.get())); } -bool SplitOp::IsCudaGraphCompatible() const { return false; } +CudaGraphCompatibility SplitOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::NONE; } OPERATION_REGISTER(SplitOp, Split); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/split.hpp b/modules/nvidia_plugin/src/ops/split.hpp index b38be6a0d..f6eda60bc 100644 --- a/modules/nvidia_plugin/src/ops/split.hpp +++ b/modules/nvidia_plugin/src/ops/split.hpp @@ -26,7 +26,7 @@ class SplitOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/strided_slice.cpp b/modules/nvidia_plugin/src/ops/strided_slice.cpp index 992eab97b..5d028ceb6 100644 --- a/modules/nvidia_plugin/src/ops/strided_slice.cpp +++ b/modules/nvidia_plugin/src/ops/strided_slice.cpp @@ -103,7 +103,9 @@ void StridedSliceOp::Execute(const InferenceRequestContext& context, } template -bool StridedSliceOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility StridedSliceOp::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::FULL; +} template WorkbufferRequest StridedSliceOp::GetWorkBufferRequest() const { diff --git a/modules/nvidia_plugin/src/ops/strided_slice.hpp b/modules/nvidia_plugin/src/ops/strided_slice.hpp index f154e8967..9754cafb3 100644 --- a/modules/nvidia_plugin/src/ops/strided_slice.hpp +++ b/modules/nvidia_plugin/src/ops/strided_slice.hpp @@ -39,7 +39,7 @@ class StridedSliceOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/subgraph.cpp b/modules/nvidia_plugin/src/ops/subgraph.cpp index 012cbb7a4..4d2a72d42 100644 --- a/modules/nvidia_plugin/src/ops/subgraph.cpp +++ b/modules/nvidia_plugin/src/ops/subgraph.cpp @@ -154,17 +154,21 @@ void SubGraph::Execute(const InferenceRequestContext& context, Inputs, Outputs, executionDelegator.execute_sequence(this, memoryManager, mutableBuffer, context); } -bool SubGraph::IsCudaGraphCompatible() const { - if (is_cuda_graph_compatible_ == CompatibleState::NOT_INITIALIZED) { - is_cuda_graph_compatible_ = CompatibleState::COMPATIBLE; +CudaGraphCompatibility SubGraph::GetCudaGraphCompatibility() const { + if (!is_compatibility_analyzed_) { + graph_compatibility_ = CudaGraphCompatibility::FULL; for (const auto& op : exec_sequence_) { - if (!op->IsCudaGraphCompatible()) { - is_cuda_graph_compatible_ = CompatibleState::NOT_COMPATIBLE; + auto opCompatability = op->GetCudaGraphCompatibility(); + if (opCompatability == CudaGraphCompatibility::SPECIAL) { + graph_compatibility_ = opCompatability; + } else if (opCompatability == 
CudaGraphCompatibility::NONE) { + graph_compatibility_ = opCompatability; break; } } + is_compatibility_analyzed_ = true; } - return is_cuda_graph_compatible_ == CompatibleState::COMPATIBLE; + return graph_compatibility_; } } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/subgraph.hpp b/modules/nvidia_plugin/src/ops/subgraph.hpp index 443aeea96..5fb38be02 100644 --- a/modules/nvidia_plugin/src/ops/subgraph.hpp +++ b/modules/nvidia_plugin/src/ops/subgraph.hpp @@ -37,7 +37,7 @@ class SubGraph : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; inline std::shared_ptr memoryManager() const { return memory_manager_; } @@ -78,8 +78,6 @@ class SubGraph : public OperationBase { ov::Shape shape_{}; }; - enum class CompatibleState { NOT_INITIALIZED = -1, NOT_COMPATIBLE, COMPATIBLE }; - std::shared_ptr memory_manager_; std::vector params_; std::vector params_info_; @@ -88,7 +86,8 @@ class SubGraph : public OperationBase { std::vector results_info_; std::shared_ptr model_; - mutable CompatibleState is_cuda_graph_compatible_ = CompatibleState::NOT_INITIALIZED; + mutable CudaGraphCompatibility graph_compatibility_; + mutable bool is_compatibility_analyzed_ = false; }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/swish.cpp b/modules/nvidia_plugin/src/ops/swish.cpp index 61244cef9..7308d7e07 100644 --- a/modules/nvidia_plugin/src/ops/swish.cpp +++ b/modules/nvidia_plugin/src/ops/swish.cpp @@ -69,7 +69,7 @@ void SwishOp::Execute(const InferenceRequestContext& context, (*kernel_)(stream.get(), inputTensors[0].get(), outputTensors[0].get()); } -bool SwishOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility SwishOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(SwishOp, Swish); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/swish.hpp b/modules/nvidia_plugin/src/ops/swish.hpp index 26a353850..8f1ddbb3a 100644 --- a/modules/nvidia_plugin/src/ops/swish.hpp +++ b/modules/nvidia_plugin/src/ops/swish.hpp @@ -24,7 +24,7 @@ class SwishOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp index b0dbd0c5f..524be06cd 100644 --- a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp +++ b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp @@ -235,13 +235,14 @@ void TensorIteratorOp::ExecuteGraph(InferenceRequestContext& context, graphInfo.launch_results_graph(stream); } -bool TensorIteratorOp::IsCudaGraphCompatible() const { +CudaGraphCompatibility TensorIteratorOp::GetCudaGraphCompatibility() const { // This implementation is CUDA graph compatible only if this is the standard TI with output only of the last // iteration (which is handled outside of the iterations loop) if (iterations_results_map_.size() != 1 || iterations_results_map_.count(num_iterations_ - 1) == 0) { - return false; + return CudaGraphCompatibility::NONE; } - return SubGraph::IsCudaGraphCompatible(); + return SubGraph::GetCudaGraphCompatibility() == CudaGraphCompatibility::NONE ? 
CudaGraphCompatibility::NONE + : CudaGraphCompatibility::SPECIAL; } void TensorIteratorOp::Capture(InferenceRequestContext& context, diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp index 44f22bc01..54c44ce09 100644 --- a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp +++ b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp @@ -33,7 +33,7 @@ class TensorIteratorOp : public SubGraph { Outputs outputTensors, const Workbuffers& workbuffers); - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void Capture(InferenceRequestContext& context, Inputs inputTensors, diff --git a/modules/nvidia_plugin/src/ops/topk.cpp b/modules/nvidia_plugin/src/ops/topk.cpp index 7b72deaa6..83af29d21 100644 --- a/modules/nvidia_plugin/src/ops/topk.cpp +++ b/modules/nvidia_plugin/src/ops/topk.cpp @@ -172,7 +172,7 @@ void TopKOp::Execute(const InferenceRequestContext& context, static_cast(kernel_param.get())); } -bool TopKOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility TopKOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void TopKOp::InitSharedImmutableWorkbuffers(const Buffers& buffers) { OPENVINO_ASSERT(buffers.size() == 1, "Node name: ", GetName()); diff --git a/modules/nvidia_plugin/src/ops/topk.hpp b/modules/nvidia_plugin/src/ops/topk.hpp index bf311c1ed..aeb1ea01d 100644 --- a/modules/nvidia_plugin/src/ops/topk.hpp +++ b/modules/nvidia_plugin/src/ops/topk.hpp @@ -25,7 +25,7 @@ class TopKOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const Buffers&) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/transpose.cpp b/modules/nvidia_plugin/src/ops/transpose.cpp index 1ab7085e0..0cbe59947 100644 --- a/modules/nvidia_plugin/src/ops/transpose.cpp +++ b/modules/nvidia_plugin/src/ops/transpose.cpp @@ -113,7 +113,7 @@ void TransposeOp::Execute(const InferenceRequestContext& context, context.getThreadContext().stream().get())); } -bool TransposeOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility TransposeOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } std::vector TransposeOp::extractInputExtents(const ov::Node& node) { std::vector result; diff --git a/modules/nvidia_plugin/src/ops/transpose.hpp b/modules/nvidia_plugin/src/ops/transpose.hpp index cab45c730..bfb9fd099 100644 --- a/modules/nvidia_plugin/src/ops/transpose.hpp +++ b/modules/nvidia_plugin/src/ops/transpose.hpp @@ -24,7 +24,7 @@ class TransposeOp : public OperationCuTensor { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: using ExtentsMap = std::unordered_map; diff --git a/modules/nvidia_plugin/src/ops/variadic_split.cpp b/modules/nvidia_plugin/src/ops/variadic_split.cpp index 807c2bdca..e83ba5ec5 100644 --- a/modules/nvidia_plugin/src/ops/variadic_split.cpp +++ b/modules/nvidia_plugin/src/ops/variadic_split.cpp @@ -199,7 +199,7 @@ void VariadicSplitOp::Execute(const InferenceRequestContext& context, static_cast(axis_offset_sizes.get())); } -bool VariadicSplitOp::IsCudaGraphCompatible() const { return false; } 
+CudaGraphCompatibility VariadicSplitOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::NONE; } OPERATION_REGISTER(VariadicSplitOp, VariadicSplit); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/variadic_split.hpp b/modules/nvidia_plugin/src/ops/variadic_split.hpp index 82580d546..0ede2a7dc 100644 --- a/modules/nvidia_plugin/src/ops/variadic_split.hpp +++ b/modules/nvidia_plugin/src/ops/variadic_split.hpp @@ -28,7 +28,7 @@ class VariadicSplitOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: enum { kOutputPtrsMWBIdx = 0, kNumberOfMWBIdx }; diff --git a/modules/nvidia_plugin/tests/unit/is_cuda_graph_compatible.cpp b/modules/nvidia_plugin/tests/unit/cuda_graph_compatibility.cpp similarity index 95% rename from modules/nvidia_plugin/tests/unit/is_cuda_graph_compatible.cpp rename to modules/nvidia_plugin/tests/unit/cuda_graph_compatibility.cpp index 02ab32e2f..08766acdb 100644 --- a/modules/nvidia_plugin/tests/unit/is_cuda_graph_compatible.cpp +++ b/modules/nvidia_plugin/tests/unit/cuda_graph_compatibility.cpp @@ -22,7 +22,7 @@ using namespace ov::nvidia_gpu; using DevPtr = CUDA::DevicePointer; using CDevPtr = CUDA::DevicePointer; -struct IsCudaGraphCompatibleTest : testing::Test { +struct CudaGraphCompatibilityTest : testing::Test { template static void generate(C& c) { std::random_device randDevice; @@ -40,7 +40,7 @@ struct IsCudaGraphCompatibleTest : testing::Test { OperationBase::Outputs outputs, const Workbuffers& workbuffers) { auto& stream = context.getThreadContext().stream(); - if (operation->IsCudaGraphCompatible()) { + if (operation->GetCudaGraphCompatibility() == CudaGraphCompatibility::FULL) { stream.synchronize(); CUDA::GraphCapture capture{stream}; { @@ -59,7 +59,7 @@ struct IsCudaGraphCompatibleTest : testing::Test { } }; -struct ReluIsCudaGraphCompatibleTest : IsCudaGraphCompatibleTest { +struct ReluCudaGraphCompatibilityTest : CudaGraphCompatibilityTest { void run() { using ElementType = float; @@ -130,9 +130,9 @@ struct ReluIsCudaGraphCompatibleTest : IsCudaGraphCompatibleTest { } }; -TEST_F(ReluIsCudaGraphCompatibleTest, Compatibile) { run(); } +TEST_F(ReluCudaGraphCompatibilityTest, Compatibile) { run(); } -struct ConcatIsCudaGraphCompatibleTest : IsCudaGraphCompatibleTest { +struct ConcatCudaGraphCompatibilityTest : CudaGraphCompatibilityTest { void run() { using ElementType = float; @@ -228,6 +228,6 @@ struct ConcatIsCudaGraphCompatibleTest : IsCudaGraphCompatibleTest { } }; -TEST_F(ConcatIsCudaGraphCompatibleTest, NotCompatible) { run(); } +TEST_F(ConcatCudaGraphCompatibilityTest, NotCompatible) { run(); } } // namespace diff --git a/modules/nvidia_plugin/tests/unit/cuda_multi_graph_test.cpp b/modules/nvidia_plugin/tests/unit/cuda_multi_graph_test.cpp index 1bbebc52c..12026e6ca 100644 --- a/modules/nvidia_plugin/tests/unit/cuda_multi_graph_test.cpp +++ b/modules/nvidia_plugin/tests/unit/cuda_multi_graph_test.cpp @@ -79,7 +79,7 @@ class AddMul { static void checkSubGraph(const SubGraph& subGraph) { // Original SubGraph for AddMul network should be CUDA Graph compatible - EXPECT_TRUE(subGraph.IsCudaGraphCompatible()); + EXPECT_EQ(subGraph.GetCudaGraphCompatibility(), CudaGraphCompatibility::FULL); } static std::vector> calcRefs( @@ -126,7 +126,7 @@ class AddConcat { static void checkSubGraph(const SubGraph& subGraph) { // Original SubGraph for AddConcat network 
should not be CUDA Graph compatible
-        EXPECT_FALSE(subGraph.IsCudaGraphCompatible());
+        EXPECT_EQ(subGraph.GetCudaGraphCompatibility(), CudaGraphCompatibility::NONE);
     }
 
     static std::vector> calcRefs(

From dad499cd54f9756e41e59211c3478d15e01fe318 Mon Sep 17 00:00:00 2001
From: Andrii Pavliuk
Date: Wed, 15 Nov 2023 20:17:22 +0200
Subject: [PATCH 13/14] [NVIDIA] Add ExecuteGraph() to IOperationExec/OperationBase
---
 .../src/cuda_graph_topology_runner.cpp        | 23 +-------
 .../src/cuda_iexecution_delegator.hpp         | 12 ++++
 .../nvidia_plugin/src/cuda_operation_base.hpp | 26 ++++++---
 modules/nvidia_plugin/src/cuda_profiler.cpp   | 12 ++++
 modules/nvidia_plugin/src/cuda_profiler.hpp   | 19 +++++++
 .../src/cuda_simple_execution_delegator.hpp   | 19 +++++++
 modules/nvidia_plugin/src/ops/subgraph.cpp    | 44 +++++++++------
 modules/nvidia_plugin/src/ops/subgraph.hpp    |  7 ++-
 .../nvidia_plugin/src/ops/tensor_iterator.cpp | 56 +++++++++----------
 .../nvidia_plugin/src/ops/tensor_iterator.hpp | 10 ++--
 10 files changed, 151 insertions(+), 77 deletions(-)

diff --git a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp
index cb456c4e5..32e5ac57a 100644
--- a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp
+++ b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp
@@ -10,18 +10,6 @@
 namespace ov {
 namespace nvidia_gpu {
 
-namespace {
-
-std::shared_ptr getTI(const SubGraph& sg) {
-    auto& seq = sg.getExecSequence();
-    if (seq.size() != 1) {
-        return nullptr;
-    }
-    return std::dynamic_pointer_cast(seq[0]);
-}
-
-}  // namespace
-
 CudaGraphTopologyRunner::CudaGraphTopologyRunner(const CreationContext& context,
                                                  const std::shared_ptr& model)
     : orig_subgraph_{context, model}, cuda_graphs_count_{0} {
@@ -64,15 +52,10 @@ void CudaGraphTopologyRunner::Run(InferenceRequestContext& context, const Device
         graphContext.get_current_graph_info().launch(stream);
         graphIndex++;
     } else if (compatibility == CudaGraphCompatibility::SPECIAL) {
-        // TODO: remove
-        auto ti = getTI(subgraph);
-        CUDA::DevicePointer mutableBuffer{memoryBlock.view().data()};
-        const auto& memoryManager = *subgraph.memoryManager();
-        const auto& inputTensors = memoryManager.inputTensorPointers(*ti, mutableBuffer);
-        const auto& outputTensors = memoryManager.outputTensorPointers(*ti, mutableBuffer);
-        const auto& workBuffers = memoryManager.workBuffers(*ti, mutableBuffer);
+        Workbuffers workbuffers{};
+        workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data());
         graphContext.select_current_graph(graphIndex);
-        ti->ExecuteGraph(context, inputTensors, outputTensors, workBuffers);
+        subgraph.ExecuteGraph(context, {}, {}, workbuffers);
         graphIndex++;
     } else {
         Workbuffers workbuffers{};
diff --git a/modules/nvidia_plugin/src/cuda_iexecution_delegator.hpp b/modules/nvidia_plugin/src/cuda_iexecution_delegator.hpp
index 7147da7d3..c28bfb7c3 100644
--- a/modules/nvidia_plugin/src/cuda_iexecution_delegator.hpp
+++ b/modules/nvidia_plugin/src/cuda_iexecution_delegator.hpp
@@ -64,6 +64,18 @@ class IExecutionDelegator {
                                   const Workbuffers::mutable_buffer& buffer,
                                   InferenceRequestContext& context) = 0;
 
+    /**
+     * Execute CUDA graph sequence from SubGraph class
+     * @param subGraphPtr Pointer to SubGraph
+     * @param memoryManager Reference to MemoryManager
+     * @param buffer Reference to Workbuffers::mutable_buffer
+     * @param context Reference to InferenceRequestContext
+     */
+    virtual void execute_graph_sequence(const SubGraph* subGraphPtr,
+                                        const MemoryManager& memoryManager,
+                                        const Workbuffers::mutable_buffer& buffer,
+                                        InferenceRequestContext& context) = 0;
+
     /**
      * Returns performance counters
      * @return Performance counters
diff --git a/modules/nvidia_plugin/src/cuda_operation_base.hpp b/modules/nvidia_plugin/src/cuda_operation_base.hpp
index 261c45950..182730422 100644
--- a/modules/nvidia_plugin/src/cuda_operation_base.hpp
+++ b/modules/nvidia_plugin/src/cuda_operation_base.hpp
@@ -44,11 +44,17 @@ class IOperationExec {
                          Inputs inputTensors,
                          Outputs outputTensors,
                          const Workbuffers& workbuffers) const = 0;
+
+    virtual CudaGraphCompatibility GetCudaGraphCompatibility() const = 0;
+
     virtual void Capture(InferenceRequestContext& context,
                          Inputs inputTensors,
                          Outputs outputTensors,
                          const Workbuffers& workbuffers) const = 0;
-    virtual CudaGraphCompatibility GetCudaGraphCompatibility() const = 0;
+    virtual void ExecuteGraph(InferenceRequestContext& context,
+                              Inputs inputTensors,
+                              Outputs outputTensors,
+                              const Workbuffers& workbuffers) const = 0;
     virtual void InitSharedImmutableWorkbuffers(const Buffers&) = 0;
     virtual WorkbufferRequest GetWorkBufferRequest() const = 0;
     virtual const WorkbufferIds& GetWorkbufferIds() const = 0;
@@ -85,6 +91,18 @@ class OperationBase : public IOperationExec, public IOperationMeta, public std::
     CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::NONE; }
 
+    void Capture(InferenceRequestContext& context,
+                 Inputs inputTensors,
+                 Outputs outputTensors,
+                 const Workbuffers& workbuffers) const override {
+        Execute(context, inputTensors, outputTensors, workbuffers);
+    }
+    // For operations with CudaGraphCompatibility::SPECIAL, e.g. TI; the vast majority of operations don't use this
+    void ExecuteGraph(InferenceRequestContext& context,
+                      Inputs inputTensors,
+                      Outputs outputTensors,
+                      const Workbuffers& workbuffers) const override {}
+
     WorkbufferRequest GetWorkBufferRequest() const override {
         return {};  // Most operators do not need workbuffers
     }
@@ -111,12 +129,6 @@ class OperationBase : public IOperationExec, public IOperationMeta, public std::
         workbuffer_ids_ = workbufferIds;
         return workbuffer_ids_.immutableIds.empty() ? WorkbufferStatus::NoInitNeeded : WorkbufferStatus::InitNeeded;
     }
-    void Capture(InferenceRequestContext& context,
-                 Inputs inputTensors,
-                 Outputs outputTensors,
-                 const Workbuffers& workbuffers) const override {
-        Execute(context, inputTensors, outputTensors, workbuffers);
-    }
 
 protected:
     std::string node_name_;
diff --git a/modules/nvidia_plugin/src/cuda_profiler.cpp b/modules/nvidia_plugin/src/cuda_profiler.cpp
index be8a3a61c..4c8f96ce7 100644
--- a/modules/nvidia_plugin/src/cuda_profiler.cpp
+++ b/modules/nvidia_plugin/src/cuda_profiler.cpp
@@ -147,6 +147,18 @@ void Profiler::capture_sequence(const SubGraph* subGraphPtr,
     }
 }
 
+void Profiler::execute_graph_sequence(const SubGraph* subGraphPtr,
+                                      const MemoryManager& memoryManager,
+                                      const Workbuffers::mutable_buffer& buffer,
+                                      InferenceRequestContext& context) {
+    for (const auto& op : create_exec_sequence(subGraphPtr)) {
+        const auto& inTensors = memoryManager.inputTensorPointers(*op, buffer);
+        const auto& outTensors = memoryManager.outputTensorPointers(*op, buffer);
+        const auto& workBuffers = memoryManager.workBuffers(*op, buffer);
+        op->execute_graph(context, inTensors, outTensors, workBuffers);
+    }
+}
+
 Profiler::ProfilerSequence Profiler::create_exec_sequence(const SubGraph* subGraphPtr) {
     OPENVINO_ASSERT(active_stream_);
     ++infer_count_;
diff --git a/modules/nvidia_plugin/src/cuda_profiler.hpp b/modules/nvidia_plugin/src/cuda_profiler.hpp
index cea8b53c7..b4e078e7b 100644
--- a/modules/nvidia_plugin/src/cuda_profiler.hpp
+++ b/modules/nvidia_plugin/src/cuda_profiler.hpp
@@ -70,6 +70,18 @@ class Profiler : public IExecutionDelegator {
                           const Workbuffers::mutable_buffer& buffer,
                           InferenceRequestContext& context) override;
 
+    /**
+     * Execute CUDA graph sequence from SubGraph class
+     * @param subGraphPtr Pointer to SubGraph
+     * @param memoryManager Reference to MemoryManager
+     * @param buffer Reference to Workbuffers::mutable_buffer
+     * @param context Reference to InferenceRequestContext
+     */
+    virtual void execute_graph_sequence(const SubGraph* subGraphPtr,
+                                        const MemoryManager& memoryManager,
+                                        const Workbuffers::mutable_buffer& buffer,
+                                        InferenceRequestContext& context) override;
+
     /**
      * Returns performance counters
      * @return Performance counters
@@ -140,6 +152,13 @@ class Profiler::ProfileExecStep {
         timing_.setStop(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_);
     }
 
+    template 
+    void execute_graph(TArgs&&... args) const {
+        timing_.setStart(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_);
+        exec_step_.ExecuteGraph(std::forward(args)...);
+        timing_.setStop(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_);
+    }
+
     /**
      * Adapter method for pointer of operation
      * @return Reference to ProfileExecStep
diff --git a/modules/nvidia_plugin/src/cuda_simple_execution_delegator.hpp b/modules/nvidia_plugin/src/cuda_simple_execution_delegator.hpp
index 97d174b76..1c7371476 100644
--- a/modules/nvidia_plugin/src/cuda_simple_execution_delegator.hpp
+++ b/modules/nvidia_plugin/src/cuda_simple_execution_delegator.hpp
@@ -76,6 +76,25 @@ class SimpleExecutionDelegator : public IExecutionDelegator {
         }
     };
 
+    /**
+     * Call ExecuteGraph for all operations from SubGraph class
+     * @param subGraphPtr Pointer to SubGraph
+     * @param memoryManager Reference to MemoryManager
+     * @param buffer Reference to Workbuffers::mutable_buffer
+     * @param context Reference to InferenceRequestContext
+     */
+    virtual void execute_graph_sequence(const SubGraph* subGraphPtr,
+                                        const MemoryManager& memoryManager,
+                                        const Workbuffers::mutable_buffer& buffer,
+                                        InferenceRequestContext& context) override {
+        for (auto& op : subGraphPtr->getExecSequence()) {
+            const auto& inputTensors = memoryManager.inputTensorPointers(*op, buffer);
+            const auto& outputTensors = memoryManager.outputTensorPointers(*op, buffer);
+            const auto& workBuffers = memoryManager.workBuffers(*op, buffer);
+            op->ExecuteGraph(context, inputTensors, outputTensors, workBuffers);
+        }
+    };
+
     /**
      * Dummy get_performance_counts implementation
      */
diff --git a/modules/nvidia_plugin/src/ops/subgraph.cpp b/modules/nvidia_plugin/src/ops/subgraph.cpp
index 4d2a72d42..acda4f30f 100644
--- a/modules/nvidia_plugin/src/ops/subgraph.cpp
+++ b/modules/nvidia_plugin/src/ops/subgraph.cpp
@@ -128,22 +128,6 @@ std::vector> SubGraph::getSharedWorkbuffers(const IOperatio
     return result;
 }
 
-void SubGraph::Capture(InferenceRequestContext &context, Inputs, Outputs,
-                       const Workbuffers &workbuffers) const {
-    const auto& stream = context.getThreadContext().stream();
-    const auto& memoryManager = *memory_manager_;
-    auto& mutableBuffer = workbuffers.mutable_buffers.at(0);
-
-    auto& executionDelegator = context.getExecutionDelegator();
-    executionDelegator.set_stream(stream);
-    executionDelegator.capture_sequence(this, memoryManager, mutableBuffer, context);
-}
-
-WorkbufferRequest SubGraph::GetWorkBufferRequest() const {
-    const auto memoryBlockSize = memory_manager_->mutableTensorsMemoryModel()->deviceMemoryBlockSize();
-    return {{}, {memoryBlockSize}};
-}
-
 void SubGraph::Execute(const InferenceRequestContext& context, Inputs, Outputs, const Workbuffers& workbuffers) const {
     const auto& stream = context.getThreadContext().stream();
     const auto& memoryManager = *memory_manager_;
@@ -171,5 +155,33 @@ CudaGraphCompatibility SubGraph::GetCudaGraphCompatibility() const {
     return graph_compatibility_;
 }
 
+void SubGraph::Capture(InferenceRequestContext& context, Inputs, Outputs, const Workbuffers& workbuffers) const {
+    const auto& stream = context.getThreadContext().stream();
+    const auto& memoryManager = *memory_manager_;
+    auto& mutableBuffer = workbuffers.mutable_buffers.at(0);
+
+    auto& executionDelegator = context.getExecutionDelegator();
+    executionDelegator.set_stream(stream);
+    executionDelegator.capture_sequence(this, memoryManager, mutableBuffer, context);
+}
+
+void SubGraph::ExecuteGraph(InferenceRequestContext& context,
+                            Inputs inputTensors,
+                            Outputs 
outputTensors, + const Workbuffers& workbuffers) const { + const auto& stream = context.getThreadContext().stream(); + const auto& memoryManager = *memory_manager_; + auto& mutableBuffer = workbuffers.mutable_buffers.at(0); + + auto& executionDelegator = context.getExecutionDelegator(); + executionDelegator.set_stream(stream); + executionDelegator.execute_graph_sequence(this, memoryManager, mutableBuffer, context); +} + +WorkbufferRequest SubGraph::GetWorkBufferRequest() const { + const auto memoryBlockSize = memory_manager_->mutableTensorsMemoryModel()->deviceMemoryBlockSize(); + return {{}, {memoryBlockSize}}; +} + } // namespace nvidia_gpu } // namespace ov diff --git a/modules/nvidia_plugin/src/ops/subgraph.hpp b/modules/nvidia_plugin/src/ops/subgraph.hpp index 5fb38be02..2bf4f5651 100644 --- a/modules/nvidia_plugin/src/ops/subgraph.hpp +++ b/modules/nvidia_plugin/src/ops/subgraph.hpp @@ -32,12 +32,17 @@ class SubGraph : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; + void Capture(InferenceRequestContext& context, Inputs inputTensors, Outputs outputTensors, const Workbuffers& workbuffers) const override; - CudaGraphCompatibility GetCudaGraphCompatibility() const override; + void ExecuteGraph(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const override; inline std::shared_ptr memoryManager() const { return memory_manager_; } diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp index 524be06cd..4eeb99172 100644 --- a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp +++ b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp @@ -207,34 +207,6 @@ void TensorIteratorOp::Execute(const InferenceRequestContext& context, } } -void TensorIteratorOp::ExecuteGraph(InferenceRequestContext& context, - Inputs inputTensors, - Outputs outputTensors, - const Workbuffers& workbuffers) { - const auto& stream = context.getThreadContext().stream(); - const auto& memoryManager = *memory_manager_; - const auto& mutableBuffer = workbuffers.mutable_buffers.at(0); - - auto& graphInfo = context.getCudaGraphContext().get_current_graph_info(); - - graphInfo.launch_params_graph(stream); - - OPENVINO_ASSERT(graphInfo.get_kernels_count() == slices_.size() + inserts_.size(), - "CudaGraphContext/TensorIteratorOp slices or inserts count incosistency"); - - for (int64_t iter = 0; iter < num_iterations_; ++iter) { - for (std::size_t i = 0; i < slices_.size(); ++i) { - slices_[i].update_kernel_node(graphInfo, i, mutableBuffer, inputTensors, iter); - } - for (std::size_t i = 0; i < inserts_.size(); ++i) { - inserts_[i].update_kernel_node(graphInfo, i + slices_.size(), mutableBuffer, outputTensors, iter); - } - graphInfo.launch(stream); - } - - graphInfo.launch_results_graph(stream); -} - CudaGraphCompatibility TensorIteratorOp::GetCudaGraphCompatibility() const { // This implementation is CUDA graph compatible only if this is the standard TI with output only of the last // iteration (which is handled outside of the iterations loop) @@ -305,6 +277,34 @@ void TensorIteratorOp::Capture(InferenceRequestContext& context, graphInfo.set_results_graph(capture.getGraph()); } +void TensorIteratorOp::ExecuteGraph(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const { + const auto& stream = context.getThreadContext().stream(); 
+    const auto& memoryManager = *memory_manager_;
+    const auto& mutableBuffer = workbuffers.mutable_buffers.at(0);
+
+    auto& graphInfo = context.getCudaGraphContext().get_current_graph_info();
+
+    graphInfo.launch_params_graph(stream);
+
+    OPENVINO_ASSERT(graphInfo.get_kernels_count() == slices_.size() + inserts_.size(),
+                    "CudaGraphContext/TensorIteratorOp slices or inserts count inconsistency");
+
+    for (int64_t iter = 0; iter < num_iterations_; ++iter) {
+        for (std::size_t i = 0; i < slices_.size(); ++i) {
+            slices_[i].update_kernel_node(graphInfo, i, mutableBuffer, inputTensors, iter);
+        }
+        for (std::size_t i = 0; i < inserts_.size(); ++i) {
+            inserts_[i].update_kernel_node(graphInfo, i + slices_.size(), mutableBuffer, outputTensors, iter);
+        }
+        graphInfo.launch(stream);
+    }
+
+    graphInfo.launch_results_graph(stream);
+}
+
 TensorIteratorOp::SliceLauncher::SliceLauncher(const TensorIteratorOp& ti, uint64_t inputIdx, uint64_t paramIdx)
     : input_idx_{inputIdx},
       param_{*ti.params_[paramIdx]},
diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp
index 54c44ce09..d176b742f 100644
--- a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp
+++ b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp
@@ -28,11 +28,6 @@ class TensorIteratorOp : public SubGraph {
                  Outputs outputTensors,
                  const Workbuffers& workbuffers) const override;
 
-    void ExecuteGraph(InferenceRequestContext& context,
-                      Inputs inputTensors,
-                      Outputs outputTensors,
-                      const Workbuffers& workbuffers);
-
     CudaGraphCompatibility GetCudaGraphCompatibility() const override;
 
     void Capture(InferenceRequestContext& context,
@@ -40,6 +35,11 @@ class TensorIteratorOp : public SubGraph {
                  Outputs outputTensors,
                  const Workbuffers& workbuffers) const override;
 
+    void ExecuteGraph(InferenceRequestContext& context,
+                      Inputs inputTensors,
+                      Outputs outputTensors,
+                      const Workbuffers& workbuffers) const override;
+
 private:
     struct PortMap {
         int64_t start{0};

From 802ecf595b4f53ccdc2c971382b9f2e72c86d2d3 Mon Sep 17 00:00:00 2001
From: Andrii Pavliuk
Date: Wed, 15 Nov 2023 20:33:02 +0200
Subject: [PATCH 14/14] [NVIDIA] Remove paramsGraph_/resultsGraph_ from CudaGraphInfo
---
 .../nvidia_plugin/src/cuda_graph_context.cpp  | 14 ------
 .../nvidia_plugin/src/cuda_graph_context.hpp  | 10 ----
 .../nvidia_plugin/src/ops/tensor_iterator.cpp | 50 ++++++++-----------
 3 files changed, 20 insertions(+), 54 deletions(-)

diff --git a/modules/nvidia_plugin/src/cuda_graph_context.cpp b/modules/nvidia_plugin/src/cuda_graph_context.cpp
index 4033b5df5..a4374be63 100644
--- a/modules/nvidia_plugin/src/cuda_graph_context.cpp
+++ b/modules/nvidia_plugin/src/cuda_graph_context.cpp
@@ -49,22 +49,8 @@ void CudaGraphInfo::set_graph(const CUDA::Graph& graph) {
     graphExec_.emplace(graph);
 }
 
-void CudaGraphInfo::set_params_graph(const CUDA::Graph& graph) {
-    paramsGraph_.emplace(graph);
-    paramsGraphExec_.emplace(graph);
-}
-
-void CudaGraphInfo::set_results_graph(const CUDA::Graph& graph) {
-    resultsGraph_.emplace(graph);
-    resultsGraphExec_.emplace(graph);
-}
-
 void CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); }
 
-void CudaGraphInfo::launch_params_graph(const CUDA::Stream& stream) const { paramsGraphExec_.value().launch(stream); }
-
-void CudaGraphInfo::launch_results_graph(const CUDA::Stream& stream) const { resultsGraphExec_.value().launch(stream); }
-
 bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs) {
     return lhs.graph_ == rhs.graph_ && lhs.graphExec_ ==
rhs.graphExec_ && lhs.parameterNodes_ == rhs.parameterNodes_ && lhs.resultNodes_ == rhs.resultNodes_ && lhs.transferNodes_ == rhs.transferNodes_ && diff --git a/modules/nvidia_plugin/src/cuda_graph_context.hpp b/modules/nvidia_plugin/src/cuda_graph_context.hpp index 484bc15c5..295a8fbad 100644 --- a/modules/nvidia_plugin/src/cuda_graph_context.hpp +++ b/modules/nvidia_plugin/src/cuda_graph_context.hpp @@ -51,12 +51,8 @@ class CudaGraphInfo { std::size_t get_kernels_count() const { return kernelNodes_.size(); } void set_graph(const CUDA::Graph& graph); - void set_params_graph(const CUDA::Graph& graph); - void set_results_graph(const CUDA::Graph& graph); void launch(const CUDA::Stream& stream) const; - void launch_params_graph(const CUDA::Stream& stream) const; - void launch_results_graph(const CUDA::Stream& stream) const; friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); @@ -65,12 +61,6 @@ class CudaGraphInfo { std::optional graph_{}; std::optional graphExec_{}; - std::optional paramsGraph_{}; - std::optional paramsGraphExec_{}; - - std::optional resultsGraph_{}; - std::optional resultsGraphExec_{}; - std::map parameterNodes_; std::map resultNodes_; diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp index 4eeb99172..0d5eee11b 100644 --- a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp +++ b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp @@ -226,23 +226,9 @@ void TensorIteratorOp::Capture(InferenceRequestContext& context, auto& mutableBuffer = workbuffers.mutable_buffers.at(0); auto& executionDelegator = context.getExecutionDelegator(); executionDelegator.set_stream(stream); - auto& graphInfo = context.getCudaGraphContext().get_current_graph_info(); + CUDA::GraphCapture capture{stream}; - { - auto scope = capture.getScope(); - // First iteration - for (const auto inputIdx : invariant_inputs_) { - const auto paramIdx = inputs_parameters_map_.at(inputIdx); - transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); - } - for (const auto& [inputIdx, paramIdx] : inputs_parameters_map_) { - if (portmap_inputs_.count(inputIdx) == 0) { - transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); - } - } - } - graphInfo.set_params_graph(capture.getGraph()); { auto scope = capture.getScope(); // Input mapping of ports @@ -264,17 +250,6 @@ void TensorIteratorOp::Capture(InferenceRequestContext& context, } } graphInfo.set_graph(capture.getGraph()); - { - auto scope = capture.getScope(); - // Copy data to output - if (iterations_results_map_.count(num_iterations_ - 1) > 0) { - for (const auto& resultIdx : iterations_results_map_.at(num_iterations_ - 1)) { - const auto& outputIdx = results_outputs_map_.at(resultIdx); - transferResult(stream, mutableBuffer, outputTensors, num_iterations_ - 1, resultIdx, outputIdx); - } - } - } - graphInfo.set_results_graph(capture.getGraph()); } void TensorIteratorOp::ExecuteGraph(InferenceRequestContext& context, @@ -285,13 +260,22 @@ void TensorIteratorOp::ExecuteGraph(InferenceRequestContext& context, const auto& memoryManager = *memory_manager_; const auto& mutableBuffer = workbuffers.mutable_buffers.at(0); - auto& graphInfo = context.getCudaGraphContext().get_current_graph_info(); - - graphInfo.launch_params_graph(stream); + // First iteration; this part doesn't use CUDA graphs yet + for (const auto inputIdx : invariant_inputs_) { + const auto paramIdx = 
inputs_parameters_map_.at(inputIdx);
+        transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx);
+    }
+    for (const auto& [inputIdx, paramIdx] : inputs_parameters_map_) {
+        if (portmap_inputs_.count(inputIdx) == 0) {
+            transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx);
+        }
+    }
+    auto& graphInfo = context.getCudaGraphContext().get_current_graph_info();
 
     OPENVINO_ASSERT(graphInfo.get_kernels_count() == slices_.size() + inserts_.size(),
                     "CudaGraphContext/TensorIteratorOp slices or inserts count inconsistency");
 
+    // TI body loop
     for (int64_t iter = 0; iter < num_iterations_; ++iter) {
         for (std::size_t i = 0; i < slices_.size(); ++i) {
             slices_[i].update_kernel_node(graphInfo, i, mutableBuffer, inputTensors, iter);
@@ -302,7 +286,13 @@ void TensorIteratorOp::ExecuteGraph(InferenceRequestContext& context,
         graphInfo.launch(stream);
     }
 
-    graphInfo.launch_results_graph(stream);
+    // Copy data to output; this part doesn't use CUDA graphs yet
+    if (iterations_results_map_.count(num_iterations_ - 1) > 0) {
+        for (const auto& resultIdx : iterations_results_map_.at(num_iterations_ - 1)) {
+            const auto& outputIdx = results_outputs_map_.at(resultIdx);
+            transferResult(stream, mutableBuffer, outputTensors, num_iterations_ - 1, resultIdx, outputIdx);
+        }
+    }
 }
 
 TensorIteratorOp::SliceLauncher::SliceLauncher(const TensorIteratorOp& ti, uint64_t inputIdx, uint64_t paramIdx)