diff --git a/modules/nvidia_plugin/src/cuda/device_pointers.hpp b/modules/nvidia_plugin/src/cuda/device_pointers.hpp index d9452bea1..f39bab3e6 100644 --- a/modules/nvidia_plugin/src/cuda/device_pointers.hpp +++ b/modules/nvidia_plugin/src/cuda/device_pointers.hpp @@ -52,6 +52,16 @@ auto operator-(DevicePointer l, DevicePointer r) noexcept { return static_cast(l.get()) - static_cast(r); } +template +bool operator==(const DevicePointer& lhs, const DevicePointer& rhs) { + return lhs.get() == rhs.get(); +} + +template +bool operator!=(const DevicePointer& lhs, const DevicePointer& rhs) { + return lhs.get() != rhs.get(); +} + template class DeviceBuffer : private gsl::span { public: diff --git a/modules/nvidia_plugin/src/cuda/graph.cpp b/modules/nvidia_plugin/src/cuda/graph.cpp index 3bf743a1a..ceae4a8c2 100644 --- a/modules/nvidia_plugin/src/cuda/graph.cpp +++ b/modules/nvidia_plugin/src/cuda/graph.cpp @@ -17,7 +17,7 @@ Graph::Graph(cudaGraph_t graph) : Handle { createFromNative, cudaGraphDestroy, graph } { } -cudaError_t Graph::createFromNative(cudaGraph_t *pGraph, const cudaGraph_t anotherGraph) { +cudaError_t Graph::createFromNative(cudaGraph_t* pGraph, const cudaGraph_t anotherGraph) { *pGraph = anotherGraph; return cudaSuccess; } @@ -28,52 +28,52 @@ cudaGraph_t Graph::createNativeWithFlags(unsigned int flags) { return g; } -bool operator==(const Graph &rhs, const Graph &lhs) { return rhs.get() == lhs.get(); } +bool operator==(const Graph& rhs, const Graph& lhs) { return rhs.get() == lhs.get(); } -GraphExec::GraphExec(const Graph &g) +GraphExec::GraphExec(const Graph& g) #if !defined(NDEBUG) || defined(_DEBUG) try #endif : Handle(cudaGraphInstantiate, cudaGraphExecDestroy, g.get(), - static_cast(nullptr), + static_cast(nullptr), #if !defined(NDEBUG) || defined(_DEBUG) errorMsg_, kErrorStringLen) #else - static_cast(nullptr), + static_cast(nullptr), static_cast(0ul)) #endif { } #if !defined(NDEBUG) || defined(_DEBUG) -catch (std::exception &e) { +catch (std::exception& e) { OPENVINO_THROW(e.what(), ": ", errorMsg_); } #endif #if defined(CUDA_VERSION) && CUDA_VERSION >= 12020 -cudaGraphExecUpdateResultInfo GraphExec::update(const Graph &g) const { +cudaGraphExecUpdateResultInfo GraphExec::update(const Graph& g) const { cudaGraphExecUpdateResultInfo res; throwIfError(cudaGraphExecUpdate(get(), g.get(), &res)); return res; } #else -cudaGraphExecUpdateResult GraphExec::update(const Graph &g) const { +cudaGraphExecUpdateResult GraphExec::update(const Graph& g) const { cudaGraphExecUpdateResult res; throwIfError(cudaGraphExecUpdate(get(), g.get(), nullptr, &res)); return res; } #endif -void GraphExec::launch(const Stream &stream) const { +void GraphExec::launch(const Stream& stream) const { throwIfError(cudaGraphLaunch(get(), stream.get())); } -bool operator==(const GraphExec &lhs, const GraphExec &rhs) { return rhs.get() == lhs.get(); } +bool operator==(const GraphExec& lhs, const GraphExec& rhs) { return rhs.get() == lhs.get(); } -GraphCapture::GraphCaptureScope::GraphCaptureScope(GraphCapture &graphCapture) : graphCapture_{graphCapture} { +GraphCapture::GraphCaptureScope::GraphCaptureScope(GraphCapture& graphCapture) : graphCapture_{graphCapture} { throwIfError(cudaStreamBeginCapture(graphCapture_.stream_.get(), cudaStreamCaptureModeThreadLocal)); } @@ -81,7 +81,7 @@ GraphCapture::GraphCaptureScope::~GraphCaptureScope() { graphCapture_.capturedError_ = cudaStreamEndCapture(graphCapture_.stream_.get(), &graphCapture_.cudaGraph_); } -GraphCapture::GraphCapture(const Stream &capturedStream) : 
+GraphCapture::GraphCapture(const Stream& capturedStream) : stream_ { capturedStream } { } @@ -100,12 +100,12 @@ const Graph& GraphCapture::getGraph() { return graph_.value(); } -CaptureInfo::CaptureInfo(const Stream &capturedStream) : stream_{capturedStream} { +CaptureInfo::CaptureInfo(const Stream& capturedStream) : stream_{capturedStream} { throwIfError(cudaStreamGetCaptureInfo_v2(capturedStream.get(), &captureStatus_, nullptr, &capturingGraph_, &deps_, &depCount_)); } -UploadNode CaptureInfo::addUploadNode(DevicePointer dst, const void *src, std::size_t size) { +UploadNode CaptureInfo::addUploadNode(DevicePointer dst, const void* src, std::size_t size) { cudaGraphNode_t newNode; throwIfError(cudaGraphAddMemcpyNode1D(&newNode, capturingGraph_, deps_, depCount_, dst.get(), src, size, cudaMemcpyHostToDevice)); @@ -113,7 +113,7 @@ UploadNode CaptureInfo::addUploadNode(DevicePointer dst, const void *src, return UploadNode{newNode, dst, src, size}; } -DownloadNode CaptureInfo::addDownloadNode(void *dst, DevicePointer src, +DownloadNode CaptureInfo::addDownloadNode(void* dst, DevicePointer src, std::size_t size) { cudaGraphNode_t newNode; throwIfError(cudaGraphAddMemcpyNode1D(&newNode, capturingGraph_, deps_, depCount_, @@ -122,7 +122,17 @@ DownloadNode CaptureInfo::addDownloadNode(void *dst, DevicePointer return DownloadNode{newNode, dst, src, size}; } -void UploadNode::update_src(const GraphExec& exec, const void *src) { +TransferNode CaptureInfo::addTransferNode(CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size) { + cudaGraphNode_t newNode; + throwIfError(cudaGraphAddMemcpyNode1D( + &newNode, capturingGraph_, deps_, depCount_, dst.get(), src.get(), size, cudaMemcpyDeviceToDevice)); + throwIfError(cudaStreamUpdateCaptureDependencies(stream_.get(), &newNode, 1, 1)); + return TransferNode{newNode, dst, src, size}; +} + +void UploadNode::update_src(const GraphExec& exec, const void* src) { if (src_ != src) { throwIfError(cudaGraphExecMemcpyNodeSetParams1D(exec.get(), node_, dst_.get(), src, size_, cudaMemcpyHostToDevice)); @@ -130,15 +140,14 @@ void UploadNode::update_src(const GraphExec& exec, const void *src) { } } -UploadNode::UploadNode(cudaGraphNode_t node, DevicePointer dst, const void *src, - std::size_t size) +UploadNode::UploadNode(cudaGraphNode_t node, DevicePointer dst, const void* src, std::size_t size) : node_{node}, dst_{dst}, src_{src}, size_{size} { } -void DownloadNode::update_dst(const GraphExec& exec, void *dst) { +void DownloadNode::update_dst(const GraphExec& exec, void* dst) { if (dst_ != dst) { throwIfError(cudaGraphExecMemcpyNodeSetParams1D(exec.get(), node_, dst, src_.get(), size_, cudaMemcpyDeviceToHost)); @@ -146,20 +155,42 @@ void DownloadNode::update_dst(const GraphExec& exec, void *dst) { } } -DownloadNode::DownloadNode(cudaGraphNode_t node, void *dst, DevicePointer src, - std::size_t size) - : node_{node}, - dst_{dst}, - src_{src}, - size_{size} { +DownloadNode::DownloadNode(cudaGraphNode_t node, void* dst, DevicePointer src, std::size_t size) + : node_{node}, dst_{dst}, src_{src}, size_{size} {} + +void CUDA::TransferNode::update_ptrs(const GraphExec& exec, + CUDA::DevicePointer dst, + CUDA::DevicePointer src) { + if (dst_ != dst || src_ != src) { + dst_ = dst; + src_ = src; + throwIfError(cudaGraphExecMemcpyNodeSetParams1D( + exec.get(), node_, dst_.get(), src_.get(), size_, cudaMemcpyDeviceToDevice)); + } } -bool UploadNode::operator ==(const UploadNode &rhs) const { +CUDA::TransferNode::TransferNode(cudaGraphNode_t node, + 
CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size) + : node_{node}, dst_{dst}, src_{src}, size_{size} {} + +CUDA::KernelNode::KernelNode(cudaGraphNode_t node, CUDA::NodeParams&& params) : node_{node}, node_params_{params} {} + +bool UploadNode::operator==(const UploadNode& rhs) const { return size_ == rhs.size_ && src_ == rhs.src_ && dst_.get() == rhs.dst_.get() && node_ == rhs.node_; } -bool DownloadNode::operator ==(const DownloadNode &rhs) const { +bool DownloadNode::operator==(const DownloadNode& rhs) const { return size_ == rhs.size_ && src_.get() == rhs.src_.get() && dst_ == rhs.dst_ && node_ == rhs.node_; } -} // namespace CUDA +bool CUDA::TransferNode::operator==(const TransferNode& rhs) const { + return size_ == rhs.size_ && src_.get() == rhs.src_.get() && dst_.get() == rhs.dst_.get() && node_ == rhs.node_; +} + +bool KernelNode::operator==(const KernelNode& rhs) const { + return node_ == rhs.node_ && node_params_ == rhs.node_params_; +} + +} // namespace CUDA diff --git a/modules/nvidia_plugin/src/cuda/graph.hpp b/modules/nvidia_plugin/src/cuda/graph.hpp index b014e2131..4360af27b 100644 --- a/modules/nvidia_plugin/src/cuda/graph.hpp +++ b/modules/nvidia_plugin/src/cuda/graph.hpp @@ -6,6 +6,7 @@ #include +#include #include "runtime.hpp" namespace CUDA { @@ -92,6 +93,7 @@ class UploadNode { private: UploadNode(cudaGraphNode_t node, CUDA::DevicePointer dst, const void* src, std::size_t size); + cudaGraphNode_t node_; CUDA::DevicePointer dst_; const void* src_; @@ -107,17 +109,64 @@ class DownloadNode { private: DownloadNode(cudaGraphNode_t node, void* dst, CUDA::DevicePointer src, std::size_t size); + cudaGraphNode_t node_; void* dst_; CUDA::DevicePointer src_; std::size_t size_; }; +class TransferNode { + friend CaptureInfo; + +public: + void update_ptrs(const GraphExec& exec, CUDA::DevicePointer dst, CUDA::DevicePointer src); + bool operator==(const TransferNode& rhs) const; + +private: + TransferNode(cudaGraphNode_t node, + CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size); + + cudaGraphNode_t node_; + CUDA::DevicePointer dst_; + CUDA::DevicePointer src_; + std::size_t size_; +}; + +bool operator==(const cudaKernelNodeParams& lhs, const cudaKernelNodeParams& rhs); + +class KernelNode { + friend CaptureInfo; + +public: + template + void update_params(const GraphExec& exec, Args&&... args) { + node_params_.reset_args(); + node_params_.add_args(std::forward(args)...); + throwIfError(cudaGraphExecKernelNodeSetParams(exec.get(), node_, &node_params_.get_knp())); + } + + bool operator==(const KernelNode& rhs) const; + +private: + KernelNode(cudaGraphNode_t node, CUDA::NodeParams&& params); + + cudaGraphNode_t node_; + CUDA::NodeParams node_params_; +}; + class CaptureInfo { public: CaptureInfo(const Stream& capturedStream); UploadNode addUploadNode(CUDA::DevicePointer dst, const void* src, std::size_t size); DownloadNode addDownloadNode(void* dst, CUDA::DevicePointer src, std::size_t size); + TransferNode addTransferNode(CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size); + template + KernelNode addKernelNode(void* kernel, dim3 gridDim, dim3 blockDim, Args&&... args); private: const Stream& stream_; @@ -127,4 +176,14 @@ class CaptureInfo { size_t depCount_; }; +template +KernelNode CaptureInfo::addKernelNode(void* kernel, dim3 gridDim, dim3 blockDim, Args&&... 
args) { + cudaGraphNode_t newNode; + CUDA::NodeParams params{kernel, gridDim, blockDim}; + params.add_args(std::forward(args)...); + throwIfError(cudaGraphAddKernelNode(&newNode, capturingGraph_, deps_, depCount_, ¶ms.get_knp())); + throwIfError(cudaStreamUpdateCaptureDependencies(stream_.get(), &newNode, 1, 1)); + return KernelNode{newNode, std::move(params)}; +} + } // namespace CUDA diff --git a/modules/nvidia_plugin/src/cuda/node_params.hpp b/modules/nvidia_plugin/src/cuda/node_params.hpp new file mode 100644 index 000000000..aadea48fa --- /dev/null +++ b/modules/nvidia_plugin/src/cuda/node_params.hpp @@ -0,0 +1,50 @@ +// Copyright (C) 2020-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include +#include + +namespace CUDA { + +struct NodeParams { + NodeParams(void* kernel, dim3 gridDim, dim3 blockDim) : knp_{kernel, gridDim, blockDim, 0u, nullptr, nullptr} { + ptrs_.reserve(20); + } + + template + void add_args(const T& value) { + ptrs_.emplace_back(const_cast(&value)); + } + + template + void add_args(const T& arg, Args&&... args) { + add_args(std::forward(arg)); + add_args(std::forward(args)...); + }; + + const cudaKernelNodeParams& get_knp() { + knp_.kernelParams = ptrs_.data(); + return knp_; + } + + void reset_args() { ptrs_.clear(); } + + friend bool operator==(const NodeParams& lhs, const NodeParams& rhs); + +private: + std::vector ptrs_; + cudaKernelNodeParams knp_; +}; + +inline bool operator==(const NodeParams& lhs, const NodeParams& rhs) { + return lhs.ptrs_ == rhs.ptrs_ && rhs.knp_.func == lhs.knp_.func && rhs.knp_.gridDim == lhs.knp_.gridDim && + rhs.knp_.blockDim == lhs.knp_.blockDim && rhs.knp_.sharedMemBytes == lhs.knp_.sharedMemBytes && + rhs.knp_.extra == lhs.knp_.extra; +} + +} // namespace CUDA diff --git a/modules/nvidia_plugin/src/cuda/utils.hpp b/modules/nvidia_plugin/src/cuda/utils.hpp new file mode 100644 index 000000000..1ac504076 --- /dev/null +++ b/modules/nvidia_plugin/src/cuda/utils.hpp @@ -0,0 +1,15 @@ +// Copyright (C) 2020-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace CUDA { + +inline bool operator==(dim3 rhs, dim3 lhs) { return rhs.x == lhs.x && rhs.y == lhs.y && rhs.z == lhs.z; } + +inline bool operator!=(dim3 rhs, dim3 lhs) { return !(rhs == lhs); } + +} // namespace CUDA diff --git a/modules/nvidia_plugin/src/cuda_eager_topology_runner.cpp b/modules/nvidia_plugin/src/cuda_eager_topology_runner.cpp index 383cff255..892d48b49 100644 --- a/modules/nvidia_plugin/src/cuda_eager_topology_runner.cpp +++ b/modules/nvidia_plugin/src/cuda_eager_topology_runner.cpp @@ -10,7 +10,7 @@ namespace nvidia_gpu { EagerTopologyRunner::EagerTopologyRunner(const CreationContext& context, const std::shared_ptr& model) : SubGraph(context, model) {} -void EagerTopologyRunner::Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const { +void EagerTopologyRunner::Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const { Workbuffers workbuffers{}; workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); SubGraph::Execute(context, {}, {}, workbuffers); diff --git a/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp b/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp index 67230df6b..ef03cf251 100644 --- a/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp +++ b/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp @@ -16,7 +16,7 @@ class EagerTopologyRunner final : public SubGraph, 
public ITopologyRunner { EagerTopologyRunner(const CreationContext& context, const std::shared_ptr& model); ~EagerTopologyRunner() override = default; - void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override; + void Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override; void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override{}; const SubGraph& GetSubGraph() const override; }; diff --git a/modules/nvidia_plugin/src/cuda_graph_context.cpp b/modules/nvidia_plugin/src/cuda_graph_context.cpp index e1f9e2487..a4374be63 100644 --- a/modules/nvidia_plugin/src/cuda_graph_context.cpp +++ b/modules/nvidia_plugin/src/cuda_graph_context.cpp @@ -7,14 +7,61 @@ namespace ov { namespace nvidia_gpu { -void CudaGraphContext::reset() { - graphs_.clear(); - currentGraphIndex_ = 0; +void CudaGraphInfo::add_parameter(const std::string& tensorName, + const CUDA::Stream& stream, + CUDA::DevicePointer dst, + const void* src, + std::size_t size) { + CUDA::CaptureInfo captureInfo{stream}; + parameterNodes_.emplace(tensorName, captureInfo.addUploadNode(dst, src, size)); } -void CudaGraphContext::start_next_graph_addition() { - currentGraphIndex_ = graphs_.size(); - graphs_.emplace_back(); +void CudaGraphInfo::add_result(const std::string& tensorName, + const CUDA::Stream& stream, + void* dst, + CUDA::DevicePointer src, + std::size_t size) { + CUDA::CaptureInfo captureInfo{stream}; + resultNodes_.emplace(tensorName, captureInfo.addDownloadNode(dst, src, size)); +} + +void CudaGraphInfo::add_transfer(const CUDA::Stream& stream, + CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size) { + CUDA::CaptureInfo captureInfo{stream}; + transferNodes_.emplace_back(captureInfo.addTransferNode(dst, src, size)); +} + +bool CudaGraphInfo::is_initialized() const { return graph_.has_value() && graphExec_.has_value(); } + +void CudaGraphInfo::update_capture(const TensorMappingContext& context) { + for (auto&& [tensorName, node] : parameterNodes_) { + node.update_src(graphExec_.value(), (context.get_input_tensor(tensorName)->data())); + } + for (auto&& [tensorName, node] : resultNodes_) { + node.update_dst(graphExec_.value(), context.get_output_tensor(tensorName)->data()); + } +} + +void CudaGraphInfo::set_graph(const CUDA::Graph& graph) { + graph_.emplace(graph); + graphExec_.emplace(graph); +} + +void CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); } + +bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs) { + return lhs.graph_ == rhs.graph_ && lhs.graphExec_ == rhs.graphExec_ && lhs.parameterNodes_ == rhs.parameterNodes_ && + lhs.resultNodes_ == rhs.resultNodes_ && lhs.transferNodes_ == rhs.transferNodes_ && + lhs.kernelNodes_ == rhs.kernelNodes_; +} + +bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs) { return !(lhs == rhs); } + +void CudaGraphContext::reset() { + graph_infos_.clear(); + currentGraphIndex_ = 0; } void CudaGraphContext::add_parameter(const std::string& tensorName, @@ -22,8 +69,8 @@ void CudaGraphContext::add_parameter(const std::string& tensorName, CUDA::DevicePointer dst, const void* src, std::size_t size) { - OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency"); - graphs_[currentGraphIndex_].add_parameter(tensorName, stream, dst, src, size); + OPENVINO_ASSERT(currentGraphIndex_ < graph_infos_.size(), "Graph index/vector size incosistency"); + 
graph_infos_[currentGraphIndex_].add_parameter(tensorName, stream, dst, src, size); } void CudaGraphContext::add_result(const std::string& tensorName, @@ -31,35 +78,43 @@ void CudaGraphContext::add_result(const std::string& tensorName, void* dst, CUDA::DevicePointer src, std::size_t size) { - OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency"); - graphs_[currentGraphIndex_].add_result(tensorName, stream, dst, src, size); + OPENVINO_ASSERT(currentGraphIndex_ < graph_infos_.size(), "Graph index/vector size incosistency"); + graph_infos_[currentGraphIndex_].add_result(tensorName, stream, dst, src, size); } -void CudaGraphContext::add_graph(const CUDA::Graph& graph) { - OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency"); - graphs_[currentGraphIndex_].set_graph(graph); +void CudaGraphContext::set_current_graph(const CUDA::Graph& graph) { + OPENVINO_ASSERT(currentGraphIndex_ < graph_infos_.size(), "Graph index/vector size incosistency"); + graph_infos_[currentGraphIndex_].set_graph(graph); } bool CudaGraphContext::is_initialized() const { - const auto size = graphs_.size(); - return size != 0 && graphs_[size - 1].is_initialized(); + const auto size = graph_infos_.size(); + return size != 0 && graph_infos_[size - 1].is_initialized(); } void CudaGraphContext::update_capture(const TensorMappingContext& context) { - for (currentGraphIndex_ = 0; currentGraphIndex_ < graphs_.size(); ++currentGraphIndex_) { - graphs_[currentGraphIndex_].update_capture(context); + for (currentGraphIndex_ = 0; currentGraphIndex_ < graph_infos_.size(); ++currentGraphIndex_) { + graph_infos_[currentGraphIndex_].update_capture(context); } } -void CudaGraphContext::launch(std::size_t index, const CUDA::Stream& stream) const { +void CudaGraphContext::add_new_graph_info() { + currentGraphIndex_ = graph_infos_.size(); + graph_infos_.emplace_back(); +} + +const CudaGraphInfo& CudaGraphContext::get_current_graph_info() const { return graph_infos_[currentGraphIndex_]; } + +CudaGraphInfo& CudaGraphContext::get_current_graph_info() { return graph_infos_[currentGraphIndex_]; } + +void CudaGraphContext::select_current_graph(std::size_t index) { + OPENVINO_ASSERT(index < graph_infos_.size(), "Graph index/vector size incosistency"); currentGraphIndex_ = index; - OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency"); - graphs_[currentGraphIndex_].launch(stream); } std::size_t CudaGraphContext::get_params_count() const { std::size_t res = 0; - for (const auto& graph : graphs_) { + for (const auto& graph : graph_infos_) { res += graph.get_params_count(); } return res; @@ -67,64 +122,15 @@ std::size_t CudaGraphContext::get_params_count() const { std::size_t CudaGraphContext::get_results_count() const { std::size_t res = 0; - for (const auto& graph : graphs_) { + for (const auto& graph : graph_infos_) { res += graph.get_results_count(); } return res; } -std::size_t CudaGraphContext::get_graphs_count() const { return graphs_.size(); } - -void CudaGraphContext::CudaGraphInfo::add_parameter(const std::string& tensorName, - const CUDA::Stream& stream, - CUDA::DevicePointer dst, - const void* src, - std::size_t size) { - CUDA::CaptureInfo captureInfo{stream}; - parameterNodes_.emplace(tensorName, captureInfo.addUploadNode(dst, src, size)); -} - -void CudaGraphContext::CudaGraphInfo::add_result(const std::string& tensorName, - const CUDA::Stream& stream, - void* dst, - CUDA::DevicePointer src, - std::size_t size) { - 
CUDA::CaptureInfo captureInfo{stream}; - resultNodes_.emplace(tensorName, captureInfo.addDownloadNode(dst, src, size)); -} - -void CudaGraphContext::CudaGraphInfo::set_graph(const CUDA::Graph& graph) { - graph_.emplace(graph); - graphExec_.emplace(graph); -} - -bool CudaGraphContext::CudaGraphInfo::is_initialized() const { return graph_.has_value() && graphExec_.has_value(); } - -void CudaGraphContext::CudaGraphInfo::update_capture(const TensorMappingContext& context) { - for (auto&& [tensorName, node] : parameterNodes_) { - node.update_src(graphExec_.value(), (context.get_input_tensor(tensorName)->data())); - } - for (auto&& [tensorName, node] : resultNodes_) { - node.update_dst(graphExec_.value(), context.get_output_tensor(tensorName)->data()); - } -} - -void CudaGraphContext::CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); } - -std::size_t CudaGraphContext::CudaGraphInfo::get_params_count() const { return parameterNodes_.size(); } - -std::size_t CudaGraphContext::CudaGraphInfo::get_results_count() const { return resultNodes_.size(); } - -bool operator==(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) { - return lhs.graph_ == rhs.graph_ && lhs.graphExec_ == rhs.graphExec_ && lhs.parameterNodes_ == rhs.parameterNodes_ && - lhs.resultNodes_ == rhs.resultNodes_; -} - -bool operator!=(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) { - return !(lhs == rhs); -} +std::size_t CudaGraphContext::get_graphs_count() const { return graph_infos_.size(); } -bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs) { return lhs.graphs_ == rhs.graphs_; } +bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs) { return lhs.graph_infos_ == rhs.graph_infos_; } bool operator!=(const CudaGraphContext& lhs, const CudaGraphContext& rhs) { return !(lhs == rhs); } diff --git a/modules/nvidia_plugin/src/cuda_graph_context.hpp b/modules/nvidia_plugin/src/cuda_graph_context.hpp index c0ca01e18..295a8fbad 100644 --- a/modules/nvidia_plugin/src/cuda_graph_context.hpp +++ b/modules/nvidia_plugin/src/cuda_graph_context.hpp @@ -11,12 +11,8 @@ namespace ov { namespace nvidia_gpu { -class CudaGraphContext { +class CudaGraphInfo { public: - void reset(); - - void start_next_graph_addition(); - void add_parameter(const std::string& tensorName, const CUDA::Stream& stream, CUDA::DevicePointer dst, @@ -29,67 +25,94 @@ class CudaGraphContext { CUDA::DevicePointer src, std::size_t size); - void add_graph(const CUDA::Graph& graph); + void add_transfer(const CUDA::Stream& stream, + CUDA::DevicePointer dst, + CUDA::DevicePointer src, + std::size_t size); + + template + void add_kernel(const CUDA::Stream& stream, void* kernel, dim3 gridDim, dim3 blockDim, Args&&... args) { + CUDA::CaptureInfo captureInfo{stream}; + kernelNodes_.emplace_back(captureInfo.addKernelNode(kernel, gridDim, blockDim, std::forward(args)...)); + } + + template + void update_kernel(std::size_t index, Args&&... 
args) { + kernelNodes_[index].update_params(graphExec_.value(), std::forward(args)...); + } bool is_initialized() const; void update_capture(const TensorMappingContext& context); - void launch(std::size_t index, const CUDA::Stream& stream) const; + std::size_t get_params_count() const { return parameterNodes_.size(); } + std::size_t get_results_count() const { return resultNodes_.size(); } + std::size_t get_transfers_count() const { return transferNodes_.size(); } + std::size_t get_kernels_count() const { return kernelNodes_.size(); } - std::size_t get_params_count() const; - std::size_t get_results_count() const; - std::size_t get_graphs_count() const; + void set_graph(const CUDA::Graph& graph); - friend bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs); - friend bool operator!=(const CudaGraphContext& lhs, const CudaGraphContext& rhs); + void launch(const CUDA::Stream& stream) const; + + friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); + friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); private: - class CudaGraphInfo { - public: - void add_parameter(const std::string& tensorName, - const CUDA::Stream& stream, - CUDA::DevicePointer dst, - const void* src, - std::size_t size); + std::optional graph_{}; + std::optional graphExec_{}; - void add_result(const std::string& tensorName, - const CUDA::Stream& stream, - void* dst, - CUDA::DevicePointer src, - std::size_t size); + std::map parameterNodes_; + std::map resultNodes_; - void set_graph(const CUDA::Graph& graph); + std::vector transferNodes_; + std::vector kernelNodes_; +}; - bool is_initialized() const; +class CudaGraphContext { +public: + void reset(); - void update_capture(const TensorMappingContext& context); + void add_parameter(const std::string& tensorName, + const CUDA::Stream& stream, + CUDA::DevicePointer dst, + const void* src, + std::size_t size); - void launch(const CUDA::Stream& stream) const; + void add_result(const std::string& tensorName, + const CUDA::Stream& stream, + void* dst, + CUDA::DevicePointer src, + std::size_t size); - std::size_t get_params_count() const; - std::size_t get_results_count() const; + void set_current_graph(const CUDA::Graph& graph); - friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); - friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); + bool is_initialized() const; - private: - std::optional graph_{}; - std::optional graphExec_{}; - std::map parameterNodes_; - std::map resultNodes_; - }; + void update_capture(const TensorMappingContext& context); - friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); - friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); + void add_new_graph_info(); + + const CudaGraphInfo& get_current_graph_info() const; + CudaGraphInfo& get_current_graph_info(); - std::vector graphs_{}; - mutable std::size_t currentGraphIndex_ = 0; + void select_current_graph(std::size_t index); + + std::size_t get_params_count() const; + std::size_t get_results_count() const; + + std::size_t get_graphs_count() const; + + friend bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs); + friend bool operator!=(const CudaGraphContext& lhs, const CudaGraphContext& rhs); + +private: + std::vector graph_infos_{}; + std::size_t currentGraphIndex_ = 0; }; -bool operator==(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs); +bool operator==(const CudaGraphInfo& lhs, const 
CudaGraphInfo& rhs); -bool operator!=(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs); +bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs); diff --git a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp index 1e851ec41..32e5ac57a 100644 --- a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp +++ b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp @@ -5,26 +5,26 @@ #include "cuda_graph_topology_runner.hpp" #include "cuda/event.hpp" +#include "ops/tensor_iterator.hpp" namespace ov { namespace nvidia_gpu { CudaGraphTopologyRunner::CudaGraphTopologyRunner(const CreationContext& context, const std::shared_ptr& model) - : orig_subgraph_{context, model}, - cuda_graphs_count_{0} { + : orig_subgraph_{context, model}, cuda_graphs_count_{0} { std::vector sequences; SubGraph::ExecSequence currentSequence; const auto& origSequence = orig_subgraph_.getExecSequence(); const auto totalSize = origSequence.size(); OPENVINO_ASSERT(totalSize != 0, "ExecSequence size is 0"); - bool isLastOpCompatible = origSequence[0]->IsCudaGraphCompatible(); + CudaGraphCompatibility lastOpCompatibility = origSequence[0]->GetCudaGraphCompatibility(); currentSequence.push_back(origSequence[0]); for (size_t i = 1; i < totalSize; ++i) { const auto& op = origSequence[i]; - if (op->IsCudaGraphCompatible() != isLastOpCompatible) { - isLastOpCompatible = !isLastOpCompatible; + if (auto c = op->GetCudaGraphCompatibility(); c != lastOpCompatibility) { + lastOpCompatibility = c; sequences.emplace_back(std::move(currentSequence)); currentSequence.clear(); } @@ -35,18 +35,27 @@ CudaGraphTopologyRunner::CudaGraphTopologyRunner(const CreationContext& context, const auto& memoryManager = orig_subgraph_.memoryManager(); for (auto&& sequence : sequences) { subgraphs_.emplace_back(context, model, std::move(sequence), memoryManager); - if (subgraphs_[subgraphs_.size() - 1].IsCudaGraphCompatible()) { + if (subgraphs_.back().GetCudaGraphCompatibility() != CudaGraphCompatibility::NONE) { ++cuda_graphs_count_; } } } -void CudaGraphTopologyRunner::Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const { +void CudaGraphTopologyRunner::Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const { const auto& stream = context.getThreadContext().stream(); + auto& graphContext = context.getCudaGraphContext(); std::size_t graphIndex = 0; for (auto& subgraph : subgraphs_) { - if (subgraph.IsCudaGraphCompatible()) { - context.getCudaGraphContext().launch(graphIndex, stream); + auto compatibility = subgraph.GetCudaGraphCompatibility(); + if (compatibility == CudaGraphCompatibility::FULL) { + graphContext.select_current_graph(graphIndex); + graphContext.get_current_graph_info().launch(stream); + graphIndex++; + } else if (compatibility == CudaGraphCompatibility::SPECIAL) { + Workbuffers workbuffers{}; + workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); + graphContext.select_current_graph(graphIndex); + subgraph.ExecuteGraph(context, {}, {}, workbuffers); graphIndex++; } else { Workbuffers workbuffers{}; @@ -63,21 +72,23 @@ void CudaGraphTopologyRunner::Capture(InferenceRequestContext& context, graphContext.reset(); for (const auto& subgraph : subgraphs_) { - if (subgraph.IsCudaGraphCompatible()) { - graphContext.start_next_graph_addition(); + Workbuffers workbuffers{}; + 
workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); + auto compatibility = subgraph.GetCudaGraphCompatibility(); + if (compatibility == CudaGraphCompatibility::FULL) { + graphContext.add_new_graph_info(); CUDA::GraphCapture capture{stream}; { auto scope = capture.getScope(); - Workbuffers workbuffers{}; - workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); subgraph.Capture(context, {}, {}, workbuffers); } const auto& graph = capture.getGraph(); - graphContext.add_graph(graph); + graphContext.set_current_graph(graph); + } else if (compatibility == CudaGraphCompatibility::SPECIAL) { + graphContext.add_new_graph_info(); + subgraph.Capture(context, {}, {}, workbuffers); } } - OPENVINO_ASSERT(graphContext.get_graphs_count() == GetCudaGraphsCount(), - "CudaGraphTopologyRunner/CudaGraphContext graphs count mismatch"); } const SubGraph& CudaGraphTopologyRunner::GetSubGraph() const { diff --git a/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp b/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp index 8e7cd1b85..408129fc9 100644 --- a/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp +++ b/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp @@ -14,7 +14,7 @@ class CudaGraphTopologyRunner final : public ITopologyRunner { CudaGraphTopologyRunner(const CreationContext& context, const std::shared_ptr& model); ~CudaGraphTopologyRunner() override = default; - void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override; + void Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override; void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override; const SubGraph& GetSubGraph() const override; diff --git a/modules/nvidia_plugin/src/cuda_iexecution_delegator.hpp b/modules/nvidia_plugin/src/cuda_iexecution_delegator.hpp index 7147da7d3..c28bfb7c3 100644 --- a/modules/nvidia_plugin/src/cuda_iexecution_delegator.hpp +++ b/modules/nvidia_plugin/src/cuda_iexecution_delegator.hpp @@ -64,6 +64,18 @@ class IExecutionDelegator { const Workbuffers::mutable_buffer& buffer, InferenceRequestContext& context) = 0; + /** + * Execute CUDA graph sequence from SubGraph class + * @param subGraphPtr Pointer to SubGraph + * @param memoryManager Reference to MemoryManager + * @param buffer Reference to Workbuffers::mutable_buffer + * @param context Reference to InferenceRequestContext + */ + virtual void execute_graph_sequence(const SubGraph* subGraphPtr, + const MemoryManager& memoryManager, + const Workbuffers::mutable_buffer& buffer, + InferenceRequestContext& context) = 0; + /** * Returns performance counters * @return Performance counters diff --git a/modules/nvidia_plugin/src/cuda_itopology_runner.hpp b/modules/nvidia_plugin/src/cuda_itopology_runner.hpp index 04cb61bbc..0e450db79 100644 --- a/modules/nvidia_plugin/src/cuda_itopology_runner.hpp +++ b/modules/nvidia_plugin/src/cuda_itopology_runner.hpp @@ -10,7 +10,7 @@ namespace ov { namespace nvidia_gpu { struct ITopologyRunner { - virtual void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0; + virtual void Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0; virtual void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0; virtual const SubGraph& GetSubGraph() const = 0; virtual ~ITopologyRunner() = default; diff --git a/modules/nvidia_plugin/src/cuda_operation_base.hpp 
b/modules/nvidia_plugin/src/cuda_operation_base.hpp index d95273175..182730422 100644 --- a/modules/nvidia_plugin/src/cuda_operation_base.hpp +++ b/modules/nvidia_plugin/src/cuda_operation_base.hpp @@ -30,6 +30,8 @@ namespace nvidia_gpu { template using DevicePointer = CUDA::DevicePointer; +enum class CudaGraphCompatibility { NONE, FULL, SPECIAL }; + class IOperationExec { public: using Inputs = gsl::span>; @@ -42,11 +44,17 @@ class IOperationExec { Inputs inputTensors, Outputs outputTensors, const Workbuffers& workbuffers) const = 0; + + virtual CudaGraphCompatibility GetCudaGraphCompatibility() const = 0; + virtual void Capture(InferenceRequestContext& context, Inputs inputTensors, Outputs outputTensors, const Workbuffers& workbuffers) const = 0; - virtual bool IsCudaGraphCompatible() const = 0; + virtual void ExecuteGraph(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const = 0; virtual void InitSharedImmutableWorkbuffers(const Buffers&) = 0; virtual WorkbufferRequest GetWorkBufferRequest() const = 0; virtual const WorkbufferIds& GetWorkbufferIds() const = 0; @@ -81,7 +89,19 @@ class OperationBase : public IOperationExec, public IOperationMeta, public std:: IndexCollection&& inputIds, IndexCollection&& outputIds); - bool IsCudaGraphCompatible() const override { return false; } + CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::NONE; } + + void Capture(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const override { + Execute(context, inputTensors, outputTensors, workbuffers); + } + // For operations with CudaGraphCompatibility::SPECIAL, e.g. TI; the vast majority of operations doesn't use this + void ExecuteGraph(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const override {} WorkbufferRequest GetWorkBufferRequest() const override { return {}; // Most operators do not need workbuffers } @@ -109,12 +129,6 @@ class OperationBase : public IOperationExec, public IOperationMeta, public std:: workbuffer_ids_ = workbufferIds; return workbuffer_ids_.immutableIds.empty() ? 
WorkbufferStatus::NoInitNeeded : WorkbufferStatus::InitNeeded; } - void Capture(InferenceRequestContext& context, - Inputs inputTensors, - Outputs outputTensors, - const Workbuffers& workbuffers) const override { - Execute(context, inputTensors, outputTensors, workbuffers); - } protected: std::string node_name_; diff --git a/modules/nvidia_plugin/src/cuda_profiler.cpp b/modules/nvidia_plugin/src/cuda_profiler.cpp index be8a3a61c..4c8f96ce7 100644 --- a/modules/nvidia_plugin/src/cuda_profiler.cpp +++ b/modules/nvidia_plugin/src/cuda_profiler.cpp @@ -147,6 +147,18 @@ void Profiler::capture_sequence(const SubGraph* subGraphPtr, } } +void Profiler::execute_graph_sequence(const SubGraph* subGraphPtr, + const MemoryManager& memoryManager, + const Workbuffers::mutable_buffer& buffer, + InferenceRequestContext& context) { + for (const auto& op : create_exec_sequence(subGraphPtr)) { + const auto& inTensors = memoryManager.inputTensorPointers(*op, buffer); + const auto& outTensors = memoryManager.outputTensorPointers(*op, buffer); + const auto& workBuffers = memoryManager.workBuffers(*op, buffer); + op->execute_graph(context, inTensors, outTensors, workBuffers); + } +} + Profiler::ProfilerSequence Profiler::create_exec_sequence(const SubGraph* subGraphPtr) { OPENVINO_ASSERT(active_stream_); ++infer_count_; diff --git a/modules/nvidia_plugin/src/cuda_profiler.hpp b/modules/nvidia_plugin/src/cuda_profiler.hpp index cea8b53c7..b4e078e7b 100644 --- a/modules/nvidia_plugin/src/cuda_profiler.hpp +++ b/modules/nvidia_plugin/src/cuda_profiler.hpp @@ -70,6 +70,18 @@ class Profiler : public IExecutionDelegator { const Workbuffers::mutable_buffer& buffer, InferenceRequestContext& context) override; + /** + * Execute CUDA graph sequence from SubGraph class + * @param subGraphPtr Pointer to SubGraph + * @param memoryManager Reference to MemoryManager + * @param buffer Reference to Workbuffers::mutable_buffer + * @param context Reference to InferenceRequestContext + */ + virtual void execute_graph_sequence(const SubGraph* subGraphPtr, + const MemoryManager& memoryManager, + const Workbuffers::mutable_buffer& buffer, + InferenceRequestContext& context) override; + /** * Returns performance counters * @return Performance counters @@ -140,6 +152,13 @@ class Profiler::ProfileExecStep { timing_.setStop(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_); } + template + void execute_graph(TArgs&&... 
args) const { + timing_.setStart(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_); + exec_step_.ExecuteGraph(std::forward(args)...); + timing_.setStop(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_); + } + /** * Adapter method for pointer of operation * @return Reference to ProfileExecStep diff --git a/modules/nvidia_plugin/src/cuda_simple_execution_delegator.hpp b/modules/nvidia_plugin/src/cuda_simple_execution_delegator.hpp index 97d174b76..1c7371476 100644 --- a/modules/nvidia_plugin/src/cuda_simple_execution_delegator.hpp +++ b/modules/nvidia_plugin/src/cuda_simple_execution_delegator.hpp @@ -76,6 +76,25 @@ class SimpleExecutionDelegator : public IExecutionDelegator { } }; + /** + * Call ExecuteGraph for all operations from SubGraph class + * @param subGraphPtr Pointer to SubGraph + * @param memoryManager Reference to MemoryManager + * @param buffer Reference to Workbuffers::mutable_buffer + * @param context Reference to InferenceRequestContext + */ + virtual void execute_graph_sequence(const SubGraph* subGraphPtr, + const MemoryManager& memoryManager, + const Workbuffers::mutable_buffer& buffer, + InferenceRequestContext& context) override { + for (auto& op : subGraphPtr->getExecSequence()) { + const auto& inputTensors = memoryManager.inputTensorPointers(*op, buffer); + const auto& outputTensors = memoryManager.outputTensorPointers(*op, buffer); + const auto& workBuffers = memoryManager.workBuffers(*op, buffer); + op->ExecuteGraph(context, inputTensors, outputTensors, workBuffers); + } + }; + /** * Dummy get_performance_counts implementation */ diff --git a/modules/nvidia_plugin/src/kernels/insert.cu b/modules/nvidia_plugin/src/kernels/insert.cu index a521d60bc..da96e0a41 100644 --- a/modules/nvidia_plugin/src/kernels/insert.cu +++ b/modules/nvidia_plugin/src/kernels/insert.cu @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -70,11 +70,47 @@ void Insert::operator()(const cudaStream_t stream, const void* src, void* dst, c case Type_t::u64: return call(stream, src, dst, start); default: - throw_ov_exception(fmt::format("Input element type = {} is not supported by Split operation !!", + throw_ov_exception(fmt::format("Input element type = {} is not supported by Insert operation !!", static_cast(element_type_))); } } +void* Insert::getKernel() const { + switch (element_type_) { + case Type_t::boolean: + return reinterpret_cast(&insert_part); +#ifdef CUDA_HAS_BF16_TYPE + case Type_t::bf16: + return reinterpret_cast(&insert_part<__nv_bfloat16>); +#endif + case Type_t::f16: + return reinterpret_cast(&insert_part<__half>); + case Type_t::f32: + return reinterpret_cast(&insert_part); + case Type_t::f64: + return reinterpret_cast(&insert_part); + case Type_t::i8: + return reinterpret_cast(&insert_part); + case Type_t::i16: + return reinterpret_cast(&insert_part); + case Type_t::i32: + return reinterpret_cast(&insert_part); + case Type_t::i64: + return reinterpret_cast(&insert_part); + case Type_t::u8: + return reinterpret_cast(&insert_part); + case Type_t::u16: + return reinterpret_cast(&insert_part); + case Type_t::u32: + return reinterpret_cast(&insert_part); + case Type_t::u64: + return reinterpret_cast(&insert_part); + default: + throw_ov_exception(fmt::format("Input element type = {} is not supported by Insert operation !!", + static_cast(element_type_))); + } +} + template void Insert::call(const cudaStream_t stream, const void* src, void* dst, 
const size_t start) const { assertThrow(props_ptr_, "props_ptr_ == nullptr"); diff --git a/modules/nvidia_plugin/src/kernels/insert.hpp b/modules/nvidia_plugin/src/kernels/insert.hpp index 111edf8db..c9e09b130 100644 --- a/modules/nvidia_plugin/src/kernels/insert.hpp +++ b/modules/nvidia_plugin/src/kernels/insert.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -31,6 +31,12 @@ class Insert { size_t getImmutableWorkbufferSize() const; void setImmutableWorkbuffer(void* immutableBuffer); + void* getKernel() const; + size_t getSize() const { return size_; } + size_t getNumBlocks() const { return num_blocks_; } + size_t getThreadsPerBlock() const { return threads_per_block_; } + const Props* getPropsPtr() const { return static_cast(props_ptr_); } + private: template void call(const cudaStream_t stream, const void* src, void* dst, const size_t start) const; diff --git a/modules/nvidia_plugin/src/kernels/slice.cu b/modules/nvidia_plugin/src/kernels/slice.cu index 8edb06d62..f358a8db6 100644 --- a/modules/nvidia_plugin/src/kernels/slice.cu +++ b/modules/nvidia_plugin/src/kernels/slice.cu @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -16,7 +16,7 @@ namespace nvidia_gpu { namespace kernel { template -static __global__ void slice_part(const Slice::Props *props, const size_t start, const size_t size, const T *x, T *y) { +static __global__ void slice_part(const Slice::Props* props, const size_t start, const size_t size, const T* x, T* y) { const unsigned i = blockIdx.x * blockDim.x + threadIdx.x; if (i < size) { const size_t old_rank = rank(props->old_shape); @@ -32,13 +32,13 @@ static __global__ void slice_part(const Slice::Props *props, const size_t start, } } -Slice::Slice(const Type_t element_type, const Props &props, const size_t max_threads_per_block) +Slice::Slice(const Type_t element_type, const Props& props, const size_t max_threads_per_block) : element_type_{element_type}, props_{props}, size_{shape_size(props.new_shape)} { TypeValidator::check(element_type_); std::tie(num_blocks_, threads_per_block_) = calculateElementwiseGrid(size_, max_threads_per_block); } -void Slice::operator()(cudaStream_t stream, const void *src, void *dst, const size_t start) const { +void Slice::operator()(cudaStream_t stream, const void* src, void* dst, const size_t start) const { switch (element_type_) { case Type_t::boolean: return call(stream, src, dst, start); @@ -76,11 +76,47 @@ void Slice::operator()(cudaStream_t stream, const void *src, void *dst, const si } } +void* Slice::getKernel() const { + switch (element_type_) { + case Type_t::boolean: + return reinterpret_cast(&slice_part); +#ifdef CUDA_HAS_BF16_TYPE + case Type_t::bf16: + return reinterpret_cast(&slice_part<__nv_bfloat16>); +#endif + case Type_t::f16: + return reinterpret_cast(&slice_part<__half>); + case Type_t::f32: + return reinterpret_cast(&slice_part); + case Type_t::f64: + return reinterpret_cast(&slice_part); + case Type_t::i8: + return reinterpret_cast(&slice_part); + case Type_t::i16: + return reinterpret_cast(&slice_part); + case Type_t::i32: + return reinterpret_cast(&slice_part); + case Type_t::i64: + return reinterpret_cast(&slice_part); + case Type_t::u8: + return reinterpret_cast(&slice_part); + case Type_t::u16: + return reinterpret_cast(&slice_part); + case Type_t::u32: + return reinterpret_cast(&slice_part); + case 
Type_t::u64: + return reinterpret_cast(&slice_part); + default: + throw_ov_exception(fmt::format("Input element type = {} is not supported by Split operation !!", + static_cast(element_type_))); + } +} + template -void Slice::call(cudaStream_t stream, const void *src, void *dst, size_t start) const { +void Slice::call(cudaStream_t stream, const void* src, void* dst, size_t start) const { assertThrow(props_ptr_, "props_ptr_ == nullptr"); slice_part<<>>( - static_cast(props_ptr_), start, size_, static_cast(src), static_cast(dst)); + static_cast(props_ptr_), start, size_, static_cast(src), static_cast(dst)); } } // namespace kernel diff --git a/modules/nvidia_plugin/src/kernels/slice.hpp b/modules/nvidia_plugin/src/kernels/slice.hpp index 64c58434c..e54dd4686 100644 --- a/modules/nvidia_plugin/src/kernels/slice.hpp +++ b/modules/nvidia_plugin/src/kernels/slice.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -29,6 +29,12 @@ class Slice { size_t getImmutableWorkbufferSize() const; void setImmutableWorkbuffer(void* immutableBuffer); + void* getKernel() const; + size_t getSize() const { return size_; } + size_t getNumBlocks() const { return num_blocks_; } + size_t getThreadsPerBlock() const { return threads_per_block_; } + const Props* getPropsPtr() const { return static_cast(props_ptr_); } + private: template void call(cudaStream_t stream, const void* src, void* dst, size_t start) const; diff --git a/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.cpp b/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.cpp index 20a681cbf..8f752f01e 100644 --- a/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.cpp +++ b/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.cpp @@ -59,7 +59,9 @@ void ActivationForwardCuDnnOpBase::Execute(const InferenceRequestContext& contex outputTensors[0].get()); } -bool ActivationForwardCuDnnOpBase::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ActivationForwardCuDnnOpBase::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::FULL; +} } // namespace nvidia_gpu } // namespace ov diff --git a/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.hpp b/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.hpp index 05e5c5598..de065efa7 100644 --- a/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.hpp +++ b/modules/nvidia_plugin/src/ops/activation_forward_cudnn_base.hpp @@ -31,7 +31,7 @@ class ActivationForwardCuDnnOpBase : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; protected: std::unique_ptr op_desc_; diff --git a/modules/nvidia_plugin/src/ops/avgpool.cpp b/modules/nvidia_plugin/src/ops/avgpool.cpp index 6131ef5a5..858c50a06 100644 --- a/modules/nvidia_plugin/src/ops/avgpool.cpp +++ b/modules/nvidia_plugin/src/ops/avgpool.cpp @@ -30,7 +30,7 @@ void AvgPoolOp::Execute(const InferenceRequestContext& context, outputs[PoolingImpl::output_index].get()); } -bool AvgPoolOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility AvgPoolOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(AvgPoolOp, AvgPool); diff --git a/modules/nvidia_plugin/src/ops/avgpool.hpp b/modules/nvidia_plugin/src/ops/avgpool.hpp index e22e66fa0..184669f37 100644 --- 
a/modules/nvidia_plugin/src/ops/avgpool.hpp +++ b/modules/nvidia_plugin/src/ops/avgpool.hpp @@ -23,7 +23,7 @@ class AvgPoolOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: PoolingImpl impl_; diff --git a/modules/nvidia_plugin/src/ops/broadcast.cpp b/modules/nvidia_plugin/src/ops/broadcast.cpp index 5f97adf6c..3b6939a33 100644 --- a/modules/nvidia_plugin/src/ops/broadcast.cpp +++ b/modules/nvidia_plugin/src/ops/broadcast.cpp @@ -65,7 +65,7 @@ void BroadcastOp::Execute(const InferenceRequestContext& context, (*kernel_)(stream, inputs[0].get(), broadcast_params_->mapper(workbuffers.immutable_buffers), outputs[0].get()); } -bool BroadcastOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility BroadcastOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest BroadcastOp::GetWorkBufferRequest() const { return {immutable_buffer_sizes_, {}}; } diff --git a/modules/nvidia_plugin/src/ops/broadcast.hpp b/modules/nvidia_plugin/src/ops/broadcast.hpp index e59b1792c..cb89a06ec 100644 --- a/modules/nvidia_plugin/src/ops/broadcast.hpp +++ b/modules/nvidia_plugin/src/ops/broadcast.hpp @@ -27,7 +27,7 @@ class BroadcastOp : public OperationBase { WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::vector immutable_buffer_sizes_; diff --git a/modules/nvidia_plugin/src/ops/clamp_cuda.cpp b/modules/nvidia_plugin/src/ops/clamp_cuda.cpp index bb0b0a87a..9fa5d7e8c 100644 --- a/modules/nvidia_plugin/src/ops/clamp_cuda.cpp +++ b/modules/nvidia_plugin/src/ops/clamp_cuda.cpp @@ -51,7 +51,7 @@ void ClampCudaOp::Execute(const InferenceRequestContext& context, (*kernel_)(context.getThreadContext().stream().get(), inputTensors[0].get(), outputTensors[0].get()); } -bool ClampCudaOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ClampCudaOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } } // namespace nvidia_gpu } // namespace ov diff --git a/modules/nvidia_plugin/src/ops/clamp_cuda.hpp b/modules/nvidia_plugin/src/ops/clamp_cuda.hpp index 61ee4153e..78baa91cc 100644 --- a/modules/nvidia_plugin/src/ops/clamp_cuda.hpp +++ b/modules/nvidia_plugin/src/ops/clamp_cuda.hpp @@ -26,7 +26,7 @@ class ClampCudaOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/clamp_cudnn.cpp b/modules/nvidia_plugin/src/ops/clamp_cudnn.cpp index 854ce8a77..7188bce83 100644 --- a/modules/nvidia_plugin/src/ops/clamp_cudnn.cpp +++ b/modules/nvidia_plugin/src/ops/clamp_cudnn.cpp @@ -97,7 +97,7 @@ void ClampCuDnnOp::Execute(const InferenceRequestContext& context, outputTensors[0].get()); } -bool ClampCuDnnOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ClampCuDnnOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void ClampCuDnnOp::InitSharedImmutableWorkbuffers(const Buffers& buffers) { switch (data_type_) { diff --git a/modules/nvidia_plugin/src/ops/clamp_cudnn.hpp 
b/modules/nvidia_plugin/src/ops/clamp_cudnn.hpp index 544ba081d..2a754af8f 100644 --- a/modules/nvidia_plugin/src/ops/clamp_cudnn.hpp +++ b/modules/nvidia_plugin/src/ops/clamp_cudnn.hpp @@ -33,7 +33,7 @@ class ClampCuDnnOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/comparison.cpp b/modules/nvidia_plugin/src/ops/comparison.cpp index 3f9bf2ea0..e96f9b4f0 100644 --- a/modules/nvidia_plugin/src/ops/comparison.cpp +++ b/modules/nvidia_plugin/src/ops/comparison.cpp @@ -84,7 +84,7 @@ Comparison::Comparison(const CreationContext& context, threads_per_block}; } -bool Comparison::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility Comparison::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void Comparison::Execute(const InferenceRequestContext& context, Inputs inputs, diff --git a/modules/nvidia_plugin/src/ops/comparison.hpp b/modules/nvidia_plugin/src/ops/comparison.hpp index 64b73b0a3..c1de21a91 100644 --- a/modules/nvidia_plugin/src/ops/comparison.hpp +++ b/modules/nvidia_plugin/src/ops/comparison.hpp @@ -18,7 +18,7 @@ class Comparison : public OperationBase { IndexCollection&& outputIds, kernel::Comparison::Op_t operation_type); - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: void calculateOffsets(); diff --git a/modules/nvidia_plugin/src/ops/concat.cpp b/modules/nvidia_plugin/src/ops/concat.cpp index 8b1b1bd2a..f0276b0b4 100644 --- a/modules/nvidia_plugin/src/ops/concat.cpp +++ b/modules/nvidia_plugin/src/ops/concat.cpp @@ -95,7 +95,7 @@ void ConcatOp::Execute(const InferenceRequestContext& context, outputs[0].get()); } -bool ConcatOp::IsCudaGraphCompatible() const { return false; } +CudaGraphCompatibility ConcatOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::NONE; } OPERATION_REGISTER(ConcatOp, Concat); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/concat.hpp b/modules/nvidia_plugin/src/ops/concat.hpp index 223e9e337..566edb231 100644 --- a/modules/nvidia_plugin/src/ops/concat.hpp +++ b/modules/nvidia_plugin/src/ops/concat.hpp @@ -28,7 +28,7 @@ class ConcatOp : public OperationBase { WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers&) override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: size_t immutableWbSize() const { return concat_kernel_.value().immutableWbSize(); } diff --git a/modules/nvidia_plugin/src/ops/convert.cpp b/modules/nvidia_plugin/src/ops/convert.cpp index c27d141a5..502ac9b11 100644 --- a/modules/nvidia_plugin/src/ops/convert.cpp +++ b/modules/nvidia_plugin/src/ops/convert.cpp @@ -55,7 +55,7 @@ void ConvertOp::Execute(const InferenceRequestContext& context, (*convert_kernel_)(stream.get(), outputs[0].get(), inputs[0].get()); } -bool ConvertOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ConvertOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(ConvertOp, Convert); diff --git a/modules/nvidia_plugin/src/ops/convert.hpp b/modules/nvidia_plugin/src/ops/convert.hpp index 
471a27351..c1ac63f14 100644 --- a/modules/nvidia_plugin/src/ops/convert.hpp +++ b/modules/nvidia_plugin/src/ops/convert.hpp @@ -24,7 +24,7 @@ class ConvertOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; using Type_t = ov::element::Type_t; using convert_t = void (*)( diff --git a/modules/nvidia_plugin/src/ops/convert_color_i420.hpp b/modules/nvidia_plugin/src/ops/convert_color_i420.hpp index f9c4e9602..58ff845da 100644 --- a/modules/nvidia_plugin/src/ops/convert_color_i420.hpp +++ b/modules/nvidia_plugin/src/ops/convert_color_i420.hpp @@ -91,7 +91,7 @@ class I420ConvertColorBase : public OperationBase { } } - bool IsCudaGraphCompatible() const override { return true; } + CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::FULL; } private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/convert_color_nv12.hpp b/modules/nvidia_plugin/src/ops/convert_color_nv12.hpp index 20168f6d7..abb5aa5a2 100644 --- a/modules/nvidia_plugin/src/ops/convert_color_nv12.hpp +++ b/modules/nvidia_plugin/src/ops/convert_color_nv12.hpp @@ -90,7 +90,7 @@ class NV12ConvertColorBase : public OperationBase { } } - bool IsCudaGraphCompatible() const override { return true; } + CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::FULL; } private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/convolution_backprop_data.cpp b/modules/nvidia_plugin/src/ops/convolution_backprop_data.cpp index 59d0aa762..f23ff2ed9 100644 --- a/modules/nvidia_plugin/src/ops/convolution_backprop_data.cpp +++ b/modules/nvidia_plugin/src/ops/convolution_backprop_data.cpp @@ -43,8 +43,8 @@ void ConvBackpropDataOp::Execute(const InferenceRequestContext& context, } template -bool ConvBackpropDataOp::IsCudaGraphCompatible() const { - return true; +CudaGraphCompatibility ConvBackpropDataOp::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(ConvolutionBackpropDataOp, ConvolutionBackpropData); diff --git a/modules/nvidia_plugin/src/ops/convolution_backprop_data.hpp b/modules/nvidia_plugin/src/ops/convolution_backprop_data.hpp index 213195d75..e761825bf 100644 --- a/modules/nvidia_plugin/src/ops/convolution_backprop_data.hpp +++ b/modules/nvidia_plugin/src/ops/convolution_backprop_data.hpp @@ -32,7 +32,7 @@ class ConvBackpropDataOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/convolution_cudnn.cpp b/modules/nvidia_plugin/src/ops/convolution_cudnn.cpp index 66d25ab31..a5a7484ec 100644 --- a/modules/nvidia_plugin/src/ops/convolution_cudnn.cpp +++ b/modules/nvidia_plugin/src/ops/convolution_cudnn.cpp @@ -44,7 +44,7 @@ void ConvolutionCuDnn::Execute(const InferenceRequestContext& context, throwIfError(status); } -bool ConvolutionCuDnn::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ConvolutionCuDnn::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest ConvolutionCuDnn::GetWorkBufferRequest() const { if 
(descs_.Algo().memory != 0) diff --git a/modules/nvidia_plugin/src/ops/convolution_cudnn.hpp b/modules/nvidia_plugin/src/ops/convolution_cudnn.hpp index abe858d98..3e0f63519 100644 --- a/modules/nvidia_plugin/src/ops/convolution_cudnn.hpp +++ b/modules/nvidia_plugin/src/ops/convolution_cudnn.hpp @@ -29,7 +29,7 @@ class ConvolutionCuDnn : public OperationCuDnn { WorkbufferRequest GetWorkBufferRequest() const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: Convolution::Details::ConvolutionDescriptorsCuDnn descs_; diff --git a/modules/nvidia_plugin/src/ops/convolution_cudnn_be.cpp b/modules/nvidia_plugin/src/ops/convolution_cudnn_be.cpp index d4557a936..d0ac64568 100644 --- a/modules/nvidia_plugin/src/ops/convolution_cudnn_be.cpp +++ b/modules/nvidia_plugin/src/ops/convolution_cudnn_be.cpp @@ -148,7 +148,7 @@ void ConvolutionCuDnnBE::Execute(const InferenceRequestContext& context, throwIfError(::cudnnBackendExecute(context.getThreadContext().dnnHandle().get(), plan->get(), variantPack->get())); } -bool ConvolutionCuDnnBE::IsCudaGraphCompatible() const { return false; } +CudaGraphCompatibility ConvolutionCuDnnBE::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::NONE; } std::shared_ptr ConvolutionCuDnnBE::MakeTensorDescriptor(int64_t id, cudnnDataType_t element_type, diff --git a/modules/nvidia_plugin/src/ops/convolution_cudnn_be.hpp b/modules/nvidia_plugin/src/ops/convolution_cudnn_be.hpp index 1a68d8560..ac348e28f 100644 --- a/modules/nvidia_plugin/src/ops/convolution_cudnn_be.hpp +++ b/modules/nvidia_plugin/src/ops/convolution_cudnn_be.hpp @@ -33,7 +33,7 @@ class ConvolutionCuDnnBE : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.cpp b/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.cpp index d40e301e5..0903adae1 100644 --- a/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.cpp +++ b/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.cpp @@ -166,7 +166,7 @@ void CuDnnTensorOpBase::Execute(const InferenceRequestContext& context, outputTensors[0].get()); } -bool CuDnnTensorOpBase::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility CuDnnTensorOpBase::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } CuDnnTensorOpBase::IoParams::IoParams(const ov::Node& node, const Type& io_type, int index) : type_(convertDataType(io_type == Type::INPUT ? 
node.get_input_element_type(index) diff --git a/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.hpp b/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.hpp index 0dce5eaf5..f9fc14181 100644 --- a/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.hpp +++ b/modules/nvidia_plugin/src/ops/cudnn_tensor_op_base.hpp @@ -24,7 +24,7 @@ class CuDnnTensorOpBase : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: struct IoParams { diff --git a/modules/nvidia_plugin/src/ops/detection_output.cpp b/modules/nvidia_plugin/src/ops/detection_output.cpp index 20fff51cd..3418b552a 100644 --- a/modules/nvidia_plugin/src/ops/detection_output.cpp +++ b/modules/nvidia_plugin/src/ops/detection_output.cpp @@ -107,7 +107,7 @@ void DetectionOutputOp::Execute(const InferenceRequestContext& context, } } -bool DetectionOutputOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility DetectionOutputOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void DetectionOutputOp::InitSharedImmutableWorkbuffers(const Buffers& buffers) { kernel_.value().initSharedImmutableWorkbuffers(buffers); diff --git a/modules/nvidia_plugin/src/ops/detection_output.hpp b/modules/nvidia_plugin/src/ops/detection_output.hpp index 5a29d95e2..f2e65303f 100644 --- a/modules/nvidia_plugin/src/ops/detection_output.hpp +++ b/modules/nvidia_plugin/src/ops/detection_output.hpp @@ -25,7 +25,7 @@ class DetectionOutputOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/elementwise_binary.hpp b/modules/nvidia_plugin/src/ops/elementwise_binary.hpp index 3ada107a8..e6ed7454e 100644 --- a/modules/nvidia_plugin/src/ops/elementwise_binary.hpp +++ b/modules/nvidia_plugin/src/ops/elementwise_binary.hpp @@ -59,7 +59,7 @@ class ElementwiseBinaryOp : public OperationBase { static_cast(outputTensors[0].get())); } - bool IsCudaGraphCompatible() const override { return true; } + CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::FULL; } void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) override { in0_broadcast_params_->initWorkbuffers(buffers); diff --git a/modules/nvidia_plugin/src/ops/elementwise_unary.hpp b/modules/nvidia_plugin/src/ops/elementwise_unary.hpp index bbdac1cfb..0f99c41ea 100644 --- a/modules/nvidia_plugin/src/ops/elementwise_unary.hpp +++ b/modules/nvidia_plugin/src/ops/elementwise_unary.hpp @@ -46,7 +46,7 @@ class ElementwiseUnaryOp : public OperationBase { (*kernel_)(stream.get(), inputTensors[0].get(), outputTensors[0].get()); } - bool IsCudaGraphCompatible() const override { return true; } + CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::FULL; } private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/elu.cpp b/modules/nvidia_plugin/src/ops/elu.cpp index a747c3eb0..0d87bd6cf 100644 --- a/modules/nvidia_plugin/src/ops/elu.cpp +++ b/modules/nvidia_plugin/src/ops/elu.cpp @@ -45,7 +45,7 @@ void EluOp::Execute(const InferenceRequestContext& context, (*kernel_)(stream.get(), 
inputTensors[0].get(), outputTensors[0].get()); } -bool EluOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility EluOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(EluOp, Elu); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/elu.hpp b/modules/nvidia_plugin/src/ops/elu.hpp index 57e9f1ea2..16ddaac4c 100644 --- a/modules/nvidia_plugin/src/ops/elu.hpp +++ b/modules/nvidia_plugin/src/ops/elu.hpp @@ -23,7 +23,7 @@ class EluOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/fake_quantize.cpp b/modules/nvidia_plugin/src/ops/fake_quantize.cpp index aa1a8bee9..2752fd083 100644 --- a/modules/nvidia_plugin/src/ops/fake_quantize.cpp +++ b/modules/nvidia_plugin/src/ops/fake_quantize.cpp @@ -45,7 +45,7 @@ FakeQuantizeOp::FakeQuantizeOp(const CreationContext &context, convertDataType(element_type), output_size, max_threads_per_block, levels}; } -bool FakeQuantizeOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility FakeQuantizeOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void FakeQuantizeOp::Execute(const InferenceRequestContext &context, Inputs inputTensors, diff --git a/modules/nvidia_plugin/src/ops/fake_quantize.hpp b/modules/nvidia_plugin/src/ops/fake_quantize.hpp index a054f4520..9de32fc45 100644 --- a/modules/nvidia_plugin/src/ops/fake_quantize.hpp +++ b/modules/nvidia_plugin/src/ops/fake_quantize.hpp @@ -20,7 +20,7 @@ class FakeQuantizeOp : public OperationBase { IndexCollection&& inputIds, IndexCollection&& outputIds); - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: void Execute(const InferenceRequestContext& context, diff --git a/modules/nvidia_plugin/src/ops/fully_connected.cpp b/modules/nvidia_plugin/src/ops/fully_connected.cpp index d8fc6d9d1..60adc1fae 100644 --- a/modules/nvidia_plugin/src/ops/fully_connected.cpp +++ b/modules/nvidia_plugin/src/ops/fully_connected.cpp @@ -54,7 +54,7 @@ void FullyConnectedOp::Execute(const InferenceRequestContext& context, matmul_op_.Execute(context, inputs.first(inputs.size() - 1), outputs, workbuffers); } -bool FullyConnectedOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility FullyConnectedOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(FullyConnectedOp, FullyConnected); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/fully_connected.hpp b/modules/nvidia_plugin/src/ops/fully_connected.hpp index c60e7d6ad..72e249e67 100644 --- a/modules/nvidia_plugin/src/ops/fully_connected.hpp +++ b/modules/nvidia_plugin/src/ops/fully_connected.hpp @@ -26,7 +26,7 @@ class FullyConnectedOp : public OperationCuBlas { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: MatMulOp matmul_op_; diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.cpp b/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.cpp index be671ac98..d50d1f499 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.cpp +++ 
b/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.cpp @@ -77,7 +77,9 @@ void FusedConvolutionBackpropDataOp::Execute(const InferenceRequestContext& cont outputs[ArgIndices3Ins::dinput].get())); } -bool FusedConvolutionBackpropDataOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility FusedConvolutionBackpropDataOp::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::FULL; +} void FusedConvolutionBackpropDataOp::InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) { OPENVINO_ASSERT(buffers.size() == 1, "Node name: ", GetName()); diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.hpp b/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.hpp index 8edfd8035..97625cf3d 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.hpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_backprop_data.hpp @@ -26,7 +26,7 @@ class FusedConvolutionBackpropDataOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.cpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.cpp index 10f117d74..86fcebb35 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.cpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.cpp @@ -96,7 +96,7 @@ void FusedConvolutionCuDnn::Execute(const InferenceRequestContext& context, outputs[ArgIndices::output].get())); } -bool FusedConvolutionCuDnn::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility FusedConvolutionCuDnn::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest FusedConvolutionCuDnn::GetWorkBufferRequest() const { if (conv_descs_->Algo().memory != 0) diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.hpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.hpp index ddf7c5d59..b9fc013da 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.hpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn.hpp @@ -35,7 +35,7 @@ class FusedConvolutionCuDnn : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override {} WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.cpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.cpp index 9311177c9..541629fae 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.cpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.cpp @@ -327,7 +327,9 @@ void FusedConvolutionCuDnnBE::Execute(const InferenceRequestContext& context, throwIfError(::cudnnBackendExecute(context.getThreadContext().dnnHandle().get(), plan->get(), variantPack->get())); } -bool FusedConvolutionCuDnnBE::IsCudaGraphCompatible() const { return false; } +CudaGraphCompatibility FusedConvolutionCuDnnBE::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::NONE; +} std::shared_ptr FusedConvolutionCuDnnBE::MakeTensorDescriptor( int64_t id, diff --git 
a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.hpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.hpp index e2ddb74ab..910887819 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.hpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_be.hpp @@ -34,7 +34,7 @@ class FusedConvolutionCuDnnBE : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; private: diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.cpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.cpp index 088a41790..a4efd1ca7 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.cpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.cpp @@ -85,7 +85,9 @@ void FusedConvolutionCuDnnDecomposed::Execute(const InferenceRequestContext& con } } -bool FusedConvolutionCuDnnDecomposed::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility FusedConvolutionCuDnnDecomposed::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::FULL; +} WorkbufferRequest FusedConvolutionCuDnnDecomposed::GetWorkBufferRequest() const { if (conv_descs_->Algo().memory != 0) { diff --git a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.hpp b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.hpp index e92be760a..b426008ed 100644 --- a/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.hpp +++ b/modules/nvidia_plugin/src/ops/fused_convolution_cudnn_decomposed.hpp @@ -37,7 +37,7 @@ class FusedConvolutionCuDnnDecomposed : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override {} WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/gather.cpp b/modules/nvidia_plugin/src/ops/gather.cpp index f666cc35e..b954135d3 100644 --- a/modules/nvidia_plugin/src/ops/gather.cpp +++ b/modules/nvidia_plugin/src/ops/gather.cpp @@ -178,7 +178,7 @@ void GatherOp::Execute(const InferenceRequestContext& context, (*gather_kernel_)(context.getThreadContext().stream().get(), inputs[0].get(), inputs[1].get(), outputs[0].get()); } -bool GatherOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility GatherOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(GatherOp, Gather); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/gather.hpp b/modules/nvidia_plugin/src/ops/gather.hpp index 6adb48be8..9753a8fc2 100644 --- a/modules/nvidia_plugin/src/ops/gather.hpp +++ b/modules/nvidia_plugin/src/ops/gather.hpp @@ -22,7 +22,7 @@ class GatherOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::optional gather_kernel_; diff --git a/modules/nvidia_plugin/src/ops/group_convolution.cpp b/modules/nvidia_plugin/src/ops/group_convolution.cpp index b381bb47f..280fa7337 100644 --- a/modules/nvidia_plugin/src/ops/group_convolution.cpp +++ 
b/modules/nvidia_plugin/src/ops/group_convolution.cpp @@ -25,7 +25,7 @@ void GroupConvolutionOp::Execute(const InferenceRequestContext &context, convolution_.Execute(context, inputTensors, outputTensors, buffers); } -bool GroupConvolutionOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility GroupConvolutionOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest GroupConvolutionOp::GetWorkBufferRequest() const { return convolution_.GetWorkBufferRequest(); } diff --git a/modules/nvidia_plugin/src/ops/group_convolution.hpp b/modules/nvidia_plugin/src/ops/group_convolution.hpp index f44ac6936..5b1bb3d1a 100644 --- a/modules/nvidia_plugin/src/ops/group_convolution.hpp +++ b/modules/nvidia_plugin/src/ops/group_convolution.hpp @@ -27,7 +27,7 @@ class GroupConvolutionOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override final; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override final; private: diff --git a/modules/nvidia_plugin/src/ops/gru_cell.cpp b/modules/nvidia_plugin/src/ops/gru_cell.cpp index b8bf2736d..43d0b0f47 100644 --- a/modules/nvidia_plugin/src/ops/gru_cell.cpp +++ b/modules/nvidia_plugin/src/ops/gru_cell.cpp @@ -61,7 +61,7 @@ void GRUCellOp::Execute(const InferenceRequestContext& context, nullptr); } -bool GRUCellOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility GRUCellOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void GRUCellOp::InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) { OPENVINO_ASSERT(buffers.size() == 1 || buffers.size() == 2, "Node name: ", GetName()); diff --git a/modules/nvidia_plugin/src/ops/gru_cell.hpp b/modules/nvidia_plugin/src/ops/gru_cell.hpp index 6e00cd5b4..2fcebcb0d 100644 --- a/modules/nvidia_plugin/src/ops/gru_cell.hpp +++ b/modules/nvidia_plugin/src/ops/gru_cell.hpp @@ -27,7 +27,7 @@ class GRUCellOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/gru_sequence.cpp b/modules/nvidia_plugin/src/ops/gru_sequence.cpp index 7317aef73..84a6893b4 100644 --- a/modules/nvidia_plugin/src/ops/gru_sequence.cpp +++ b/modules/nvidia_plugin/src/ops/gru_sequence.cpp @@ -19,7 +19,9 @@ GRUSequenceOp::GRUSequenceOp(const CreationContext& context, : OperationCuDnn(context, node, std::move(inputIds), std::move(outputIds)), params_{node}, descs_{context, params_, config()}, - is_cuda_graph_compatible_{RNN::Details::isRNNSequenceCudaGraphCompatible(context.device())} { + graph_compatibility_{RNN::Details::isRNNSequenceCudaGraphCompatible(context.device()) + ? 
CudaGraphCompatibility::FULL + : CudaGraphCompatibility::NONE} { ib_seq_lengths_.addRequest(immut_sizes_, descs_.seqLengthArraySizeBytes()); ib_weight_space_.addRequest(immut_sizes_, descs_.weightSpaceSize()); @@ -71,7 +73,7 @@ void GRUSequenceOp::Execute(const InferenceRequestContext& context, nullptr); } -bool GRUSequenceOp::IsCudaGraphCompatible() const { return is_cuda_graph_compatible_; } +CudaGraphCompatibility GRUSequenceOp::GetCudaGraphCompatibility() const { return graph_compatibility_; } void GRUSequenceOp::InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) { descs_.initDevSeqLengthArray(CUDA::DevicePointer{ib_seq_lengths_.requiredPtr(buffers)}); diff --git a/modules/nvidia_plugin/src/ops/gru_sequence.hpp b/modules/nvidia_plugin/src/ops/gru_sequence.hpp index 6b2335901..00153a193 100644 --- a/modules/nvidia_plugin/src/ops/gru_sequence.hpp +++ b/modules/nvidia_plugin/src/ops/gru_sequence.hpp @@ -32,7 +32,7 @@ class GRUSequenceOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: static Config config(); @@ -50,7 +50,7 @@ class GRUSequenceOp : public OperationCuDnn { WorkbufferDesc ib_weight_space_; WorkbufferDesc mb_work_space_; - bool is_cuda_graph_compatible_; + CudaGraphCompatibility graph_compatibility_; }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/interpolate_cubic.cpp b/modules/nvidia_plugin/src/ops/interpolate_cubic.cpp index 0fc3cfe91..a89764cee 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_cubic.cpp +++ b/modules/nvidia_plugin/src/ops/interpolate_cubic.cpp @@ -68,7 +68,7 @@ void InterpolateCubicOp::Execute(const InferenceRequestContext& context, (*interpolate_)(context.getThreadContext().stream().get(), inputs[0].get(), outputs[0].get()); } -bool InterpolateCubicOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility InterpolateCubicOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest InterpolateCubicOp::GetWorkBufferRequest() const { return {interpolate_->immutableWorkbufferSizes(), {}}; diff --git a/modules/nvidia_plugin/src/ops/interpolate_cubic.hpp b/modules/nvidia_plugin/src/ops/interpolate_cubic.hpp index 32e06cc00..ee5348657 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_cubic.hpp +++ b/modules/nvidia_plugin/src/ops/interpolate_cubic.hpp @@ -25,7 +25,7 @@ class InterpolateCubicOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/interpolate_linear.cpp b/modules/nvidia_plugin/src/ops/interpolate_linear.cpp index b19d1228f..2c70981ed 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_linear.cpp +++ b/modules/nvidia_plugin/src/ops/interpolate_linear.cpp @@ -69,7 +69,7 @@ void InterpolateLinearOp::Execute(const InferenceRequestContext& context, (*interpolate_)(context.getThreadContext().stream().get(), inputs[0].get(), outputs[0].get()); } -bool InterpolateLinearOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility InterpolateLinearOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest 
InterpolateLinearOp::GetWorkBufferRequest() const { return {interpolate_->immutableWorkbufferSizes(), {}}; diff --git a/modules/nvidia_plugin/src/ops/interpolate_linear.hpp b/modules/nvidia_plugin/src/ops/interpolate_linear.hpp index eb5e6539a..22b911040 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_linear.hpp +++ b/modules/nvidia_plugin/src/ops/interpolate_linear.hpp @@ -25,7 +25,7 @@ class InterpolateLinearOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/interpolate_nearest.cpp b/modules/nvidia_plugin/src/ops/interpolate_nearest.cpp index 89caf4b47..6367e38fc 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_nearest.cpp +++ b/modules/nvidia_plugin/src/ops/interpolate_nearest.cpp @@ -158,7 +158,7 @@ void InterpolateNearestOp::Execute(const InferenceRequestContext& context, dst); } -bool InterpolateNearestOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility InterpolateNearestOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } template static auto size_in_bytes(const std::vector& v) noexcept { diff --git a/modules/nvidia_plugin/src/ops/interpolate_nearest.hpp b/modules/nvidia_plugin/src/ops/interpolate_nearest.hpp index 311b09177..487ef3829 100644 --- a/modules/nvidia_plugin/src/ops/interpolate_nearest.hpp +++ b/modules/nvidia_plugin/src/ops/interpolate_nearest.hpp @@ -26,7 +26,7 @@ class InterpolateNearestOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/logical_not.cpp b/modules/nvidia_plugin/src/ops/logical_not.cpp index 97d121e75..456955343 100644 --- a/modules/nvidia_plugin/src/ops/logical_not.cpp +++ b/modules/nvidia_plugin/src/ops/logical_not.cpp @@ -35,7 +35,7 @@ void LogicalNotOp::Execute(const InferenceRequestContext& context, throwIfError(cudaPeekAtLastError()); } -bool LogicalNotOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility LogicalNotOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(LogicalNotOp, LogicalNot); diff --git a/modules/nvidia_plugin/src/ops/logical_not.hpp b/modules/nvidia_plugin/src/ops/logical_not.hpp index 8c95dc415..681737c7d 100644 --- a/modules/nvidia_plugin/src/ops/logical_not.hpp +++ b/modules/nvidia_plugin/src/ops/logical_not.hpp @@ -21,7 +21,7 @@ class LogicalNotOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: kernel::LogicalNot kernel_; diff --git a/modules/nvidia_plugin/src/ops/lstm_cell.cpp b/modules/nvidia_plugin/src/ops/lstm_cell.cpp index b53232793..5d7f45cb4 100644 --- a/modules/nvidia_plugin/src/ops/lstm_cell.cpp +++ b/modules/nvidia_plugin/src/ops/lstm_cell.cpp @@ -57,7 +57,7 @@ void LSTMCellOp::Execute(const InferenceRequestContext& context, nullptr); } -bool 
LSTMCellOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility LSTMCellOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void LSTMCellOp::InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) { OPENVINO_ASSERT(buffers.size() == 1 || buffers.size() == 2, "Node name: ", GetName()); diff --git a/modules/nvidia_plugin/src/ops/lstm_cell.hpp b/modules/nvidia_plugin/src/ops/lstm_cell.hpp index b36a4b36b..348b92116 100644 --- a/modules/nvidia_plugin/src/ops/lstm_cell.hpp +++ b/modules/nvidia_plugin/src/ops/lstm_cell.hpp @@ -27,7 +27,7 @@ class LSTMCellOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/lstm_sequence_base.cpp b/modules/nvidia_plugin/src/ops/lstm_sequence_base.cpp index 5873378cc..ab5e35a5d 100644 --- a/modules/nvidia_plugin/src/ops/lstm_sequence_base.cpp +++ b/modules/nvidia_plugin/src/ops/lstm_sequence_base.cpp @@ -20,7 +20,9 @@ LSTMSequenceOpBase::LSTMSequenceOpBase(const CreationContext& context, : OperationCuDnn(context, node, std::move(inputIds), std::move(outputIds)), params_{params}, descs_{context, params_, config}, - is_cuda_graph_compatible_{RNN::Details::isRNNSequenceCudaGraphCompatible(context.device())} { + graph_compatibility_{RNN::Details::isRNNSequenceCudaGraphCompatible(context.device()) + ? CudaGraphCompatibility::FULL + : CudaGraphCompatibility::NONE} { ib_seq_lengths_.addRequest(immut_sizes_, descs_.seqLengthArraySizeBytes()); ib_weight_space_.addRequest(immut_sizes_, descs_.weightSpaceSize()); @@ -76,7 +78,7 @@ void LSTMSequenceOpBase::Execute(const InferenceRequestContext& context, if (cy_adapter) cy_adapter->execute(context, mb, outputs[ArgIndices::cell_output]); } -bool LSTMSequenceOpBase::IsCudaGraphCompatible() const { return is_cuda_graph_compatible_; } +CudaGraphCompatibility LSTMSequenceOpBase::GetCudaGraphCompatibility() const { return graph_compatibility_; } void LSTMSequenceOpBase::InitSharedImmutableWorkbuffers(const IOperationExec::Buffers& buffers) { descs_.initDevSeqLengthArray(CUDA::DevicePointer{ib_seq_lengths_.requiredPtr(buffers)}); diff --git a/modules/nvidia_plugin/src/ops/lstm_sequence_base.hpp b/modules/nvidia_plugin/src/ops/lstm_sequence_base.hpp index 6046c1753..6a459cf45 100644 --- a/modules/nvidia_plugin/src/ops/lstm_sequence_base.hpp +++ b/modules/nvidia_plugin/src/ops/lstm_sequence_base.hpp @@ -30,7 +30,7 @@ class LSTMSequenceOpBase : public OperationCuDnn { Outputs outputTensors, const Workbuffers&) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const IOperationExec::Buffers&) override; WorkbufferRequest GetWorkBufferRequest() const override; @@ -59,7 +59,7 @@ class LSTMSequenceOpBase : public OperationCuDnn { OutputTensorAdapterPtr cy_adapter; private: - bool is_cuda_graph_compatible_; + CudaGraphCompatibility graph_compatibility_; }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/matmul.cpp b/modules/nvidia_plugin/src/ops/matmul.cpp index 85f97adad..52de989f1 100644 --- a/modules/nvidia_plugin/src/ops/matmul.cpp +++ b/modules/nvidia_plugin/src/ops/matmul.cpp @@ -226,7 +226,7 @@ void 
MatMulOp::Execute(const InferenceRequestContext& context, CUBLAS_GEMM_DEFAULT)); } -bool MatMulOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility MatMulOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(MatMulOp, MatMul); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/matmul.hpp b/modules/nvidia_plugin/src/ops/matmul.hpp index 10dbe2884..d30b46bfa 100644 --- a/modules/nvidia_plugin/src/ops/matmul.hpp +++ b/modules/nvidia_plugin/src/ops/matmul.hpp @@ -29,7 +29,7 @@ class MatMulOp : public OperationCuBlas { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; int GetBatchCount() const { return batch_count_; } diff --git a/modules/nvidia_plugin/src/ops/maxpool.cpp b/modules/nvidia_plugin/src/ops/maxpool.cpp index f6a696112..1b9a5c58d 100644 --- a/modules/nvidia_plugin/src/ops/maxpool.cpp +++ b/modules/nvidia_plugin/src/ops/maxpool.cpp @@ -30,7 +30,7 @@ void MaxPoolOp::Execute(const InferenceRequestContext& context, outputs[PoolingImpl::output_index].get()); } -bool MaxPoolOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility MaxPoolOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(MaxPoolOp, MaxPool); diff --git a/modules/nvidia_plugin/src/ops/maxpool.hpp b/modules/nvidia_plugin/src/ops/maxpool.hpp index a43554c42..b12e39525 100644 --- a/modules/nvidia_plugin/src/ops/maxpool.hpp +++ b/modules/nvidia_plugin/src/ops/maxpool.hpp @@ -23,7 +23,7 @@ class MaxPoolOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: PoolingImpl impl_; diff --git a/modules/nvidia_plugin/src/ops/mvn.cpp b/modules/nvidia_plugin/src/ops/mvn.cpp index 365ef9d69..d05a2f7a5 100644 --- a/modules/nvidia_plugin/src/ops/mvn.cpp +++ b/modules/nvidia_plugin/src/ops/mvn.cpp @@ -99,7 +99,7 @@ void MvnOp::Execute(const InferenceRequestContext& context, {tensor_desc_, outputTensors[0]}); } -bool MvnOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility MvnOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void MvnOp::Context::reduceMean(ConstTensor input, Tensor output) { context.getThreadContext().dnnHandle().reduceTensor(op.reduce_mean_desc_, diff --git a/modules/nvidia_plugin/src/ops/mvn.hpp b/modules/nvidia_plugin/src/ops/mvn.hpp index bb2c3e228..9420a42ea 100644 --- a/modules/nvidia_plugin/src/ops/mvn.hpp +++ b/modules/nvidia_plugin/src/ops/mvn.hpp @@ -25,7 +25,7 @@ class MvnOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; private: diff --git a/modules/nvidia_plugin/src/ops/nop_op.hpp b/modules/nvidia_plugin/src/ops/nop_op.hpp index dcb555fb3..a734d7351 100644 --- a/modules/nvidia_plugin/src/ops/nop_op.hpp +++ b/modules/nvidia_plugin/src/ops/nop_op.hpp @@ -39,7 +39,7 @@ class NopOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override {} - bool IsCudaGraphCompatible() const override { return true; } + CudaGraphCompatibility GetCudaGraphCompatibility() const 
override { return CudaGraphCompatibility::FULL; } }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/pad.cpp b/modules/nvidia_plugin/src/ops/pad.cpp index 6bf760670..da33feaca 100644 --- a/modules/nvidia_plugin/src/ops/pad.cpp +++ b/modules/nvidia_plugin/src/ops/pad.cpp @@ -58,7 +58,7 @@ void PadOp::Execute(const InferenceRequestContext& context, inputTensors[InputIndex::kPadValue].get()); } -bool PadOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility PadOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest PadOp::GetWorkBufferRequest() const { auto rank = src_shape_.size(); diff --git a/modules/nvidia_plugin/src/ops/pad.hpp b/modules/nvidia_plugin/src/ops/pad.hpp index c819118c2..846a65feb 100644 --- a/modules/nvidia_plugin/src/ops/pad.hpp +++ b/modules/nvidia_plugin/src/ops/pad.hpp @@ -23,7 +23,7 @@ class PadOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers&) override; diff --git a/modules/nvidia_plugin/src/ops/parameter.cpp b/modules/nvidia_plugin/src/ops/parameter.cpp index 54c5dbe14..233c8dec0 100644 --- a/modules/nvidia_plugin/src/ops/parameter.cpp +++ b/modules/nvidia_plugin/src/ops/parameter.cpp @@ -32,7 +32,7 @@ void ParameterOp::Execute(const InferenceRequestContext& context, context.getThreadContext().stream().upload(outputs[0], tensor->data(), tensor->get_byte_size()); } -bool ParameterOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ParameterOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } std::string ParameterOp::GetInputTensorName(const ov::Node& node) { return node.get_friendly_name(); } diff --git a/modules/nvidia_plugin/src/ops/parameter.hpp b/modules/nvidia_plugin/src/ops/parameter.hpp index 4cbbe40dc..decd83aff 100644 --- a/modules/nvidia_plugin/src/ops/parameter.hpp +++ b/modules/nvidia_plugin/src/ops/parameter.hpp @@ -27,7 +27,7 @@ class ParameterOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; static std::string GetInputTensorName(const ov::Node& node); private: diff --git a/modules/nvidia_plugin/src/ops/range.cpp b/modules/nvidia_plugin/src/ops/range.cpp index f8df8a09a..df08e04d4 100644 --- a/modules/nvidia_plugin/src/ops/range.cpp +++ b/modules/nvidia_plugin/src/ops/range.cpp @@ -64,7 +64,7 @@ void RangeOp::Execute(const InferenceRequestContext& context, outputs[OUTPUT_INDX].get()); } -bool RangeOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility RangeOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(RangeOp, Range); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/range.hpp b/modules/nvidia_plugin/src/ops/range.hpp index 89da68a6a..67a90f15c 100644 --- a/modules/nvidia_plugin/src/ops/range.hpp +++ b/modules/nvidia_plugin/src/ops/range.hpp @@ -26,7 +26,7 @@ class RangeOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: size_t output_size_; 
diff --git a/modules/nvidia_plugin/src/ops/reduce.cpp b/modules/nvidia_plugin/src/ops/reduce.cpp index d57d75f17..36b7df0ff 100644 --- a/modules/nvidia_plugin/src/ops/reduce.cpp +++ b/modules/nvidia_plugin/src/ops/reduce.cpp @@ -58,7 +58,7 @@ void ReduceOp::Execute(const InferenceRequestContext& context, outputTensors[0]); } -bool ReduceOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ReduceOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } } // namespace nvidia_gpu } // namespace ov diff --git a/modules/nvidia_plugin/src/ops/reduce.hpp b/modules/nvidia_plugin/src/ops/reduce.hpp index 35d9b4822..275eefbd3 100644 --- a/modules/nvidia_plugin/src/ops/reduce.hpp +++ b/modules/nvidia_plugin/src/ops/reduce.hpp @@ -22,7 +22,7 @@ class ReduceOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; static cudnnDataType_t reduceCompType(const ov::Node& node); diff --git a/modules/nvidia_plugin/src/ops/result.cpp b/modules/nvidia_plugin/src/ops/result.cpp index 7f7aaf757..95b0f2672 100644 --- a/modules/nvidia_plugin/src/ops/result.cpp +++ b/modules/nvidia_plugin/src/ops/result.cpp @@ -43,7 +43,7 @@ void ResultOp::Execute(const InferenceRequestContext& context, context.getThreadContext().stream().download(tensor->data(), inputs[0], tensor->get_byte_size()); } -bool ResultOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ResultOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } std::optional ResultOp::GetOutputTensorSubIndex(const ov::Output& node) { const auto& opRegistry = OperationRegistry::getInstance(); diff --git a/modules/nvidia_plugin/src/ops/result.hpp b/modules/nvidia_plugin/src/ops/result.hpp index 7e66794a2..275074583 100644 --- a/modules/nvidia_plugin/src/ops/result.hpp +++ b/modules/nvidia_plugin/src/ops/result.hpp @@ -29,7 +29,7 @@ class ResultOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; static std::vector GetOutputTensorName(const ov::op::v0::Result& node); diff --git a/modules/nvidia_plugin/src/ops/round.cpp b/modules/nvidia_plugin/src/ops/round.cpp index 4edd12e55..9fa98f216 100644 --- a/modules/nvidia_plugin/src/ops/round.cpp +++ b/modules/nvidia_plugin/src/ops/round.cpp @@ -48,7 +48,7 @@ void RoundOp::Execute(const InferenceRequestContext& context, (*kernel_)(context.getThreadContext().stream().get(), inputTensors[0].get(), outputTensors[0].get()); } -bool RoundOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility RoundOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(RoundOp, Round); diff --git a/modules/nvidia_plugin/src/ops/round.hpp b/modules/nvidia_plugin/src/ops/round.hpp index 86762f3cc..baa62db95 100644 --- a/modules/nvidia_plugin/src/ops/round.hpp +++ b/modules/nvidia_plugin/src/ops/round.hpp @@ -25,7 +25,7 @@ class RoundOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::optional kernel_; diff --git 
a/modules/nvidia_plugin/src/ops/scatter_nd_update.cpp b/modules/nvidia_plugin/src/ops/scatter_nd_update.cpp index bd5b72eb4..bb00b459a 100644 --- a/modules/nvidia_plugin/src/ops/scatter_nd_update.cpp +++ b/modules/nvidia_plugin/src/ops/scatter_nd_update.cpp @@ -100,7 +100,7 @@ void ScatterNDUpdateOp::Execute(const InferenceRequestContext& context, outputs[0].get()); } -bool ScatterNDUpdateOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility ScatterNDUpdateOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } template static auto size_in_bytes(const std::vector& v) noexcept { diff --git a/modules/nvidia_plugin/src/ops/scatter_nd_update.hpp b/modules/nvidia_plugin/src/ops/scatter_nd_update.hpp index 394413f0f..778a16c55 100644 --- a/modules/nvidia_plugin/src/ops/scatter_nd_update.hpp +++ b/modules/nvidia_plugin/src/ops/scatter_nd_update.hpp @@ -22,7 +22,7 @@ class ScatterNDUpdateOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/select.cpp b/modules/nvidia_plugin/src/ops/select.cpp index c8dc04f01..708b57c74 100644 --- a/modules/nvidia_plugin/src/ops/select.cpp +++ b/modules/nvidia_plugin/src/ops/select.cpp @@ -90,7 +90,7 @@ void SelectOp::Execute(const InferenceRequestContext& context, outputs[0].get()); } -bool SelectOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility SelectOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } WorkbufferRequest SelectOp::GetWorkBufferRequest() const { return {std::vector(SIZES + 1, kOffsetBufferSize), {}}; diff --git a/modules/nvidia_plugin/src/ops/select.hpp b/modules/nvidia_plugin/src/ops/select.hpp index 61287a269..8eb0e800e 100644 --- a/modules/nvidia_plugin/src/ops/select.hpp +++ b/modules/nvidia_plugin/src/ops/select.hpp @@ -26,7 +26,7 @@ class SelectOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/softmax.cpp b/modules/nvidia_plugin/src/ops/softmax.cpp index e3ecedaa2..0019cd7db 100644 --- a/modules/nvidia_plugin/src/ops/softmax.cpp +++ b/modules/nvidia_plugin/src/ops/softmax.cpp @@ -192,7 +192,7 @@ void SoftmaxOp::Execute(const InferenceRequestContext& context, outputs[0].get())); } -bool SoftmaxOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility SoftmaxOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(SoftmaxOp, Softmax); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/softmax.hpp b/modules/nvidia_plugin/src/ops/softmax.hpp index 608e1e657..abafaed63 100644 --- a/modules/nvidia_plugin/src/ops/softmax.hpp +++ b/modules/nvidia_plugin/src/ops/softmax.hpp @@ -27,7 +27,7 @@ class SoftmaxOp : public OperationCuDnn { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: void 
mapRankAxis(const ov::Shape& shape, int axis); diff --git a/modules/nvidia_plugin/src/ops/split.cpp b/modules/nvidia_plugin/src/ops/split.cpp index b53aeadce..b5f66cbad 100644 --- a/modules/nvidia_plugin/src/ops/split.cpp +++ b/modules/nvidia_plugin/src/ops/split.cpp @@ -89,7 +89,7 @@ void SplitOp::Execute(const InferenceRequestContext& context, (*split_kernel_)(stream.get(), reinterpret_cast(in.get()), reinterpret_cast(outputPtrs.get())); } -bool SplitOp::IsCudaGraphCompatible() const { return false; } +CudaGraphCompatibility SplitOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::NONE; } OPERATION_REGISTER(SplitOp, Split); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/split.hpp b/modules/nvidia_plugin/src/ops/split.hpp index b38be6a0d..f6eda60bc 100644 --- a/modules/nvidia_plugin/src/ops/split.hpp +++ b/modules/nvidia_plugin/src/ops/split.hpp @@ -26,7 +26,7 @@ class SplitOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/strided_slice.cpp b/modules/nvidia_plugin/src/ops/strided_slice.cpp index 992eab97b..5d028ceb6 100644 --- a/modules/nvidia_plugin/src/ops/strided_slice.cpp +++ b/modules/nvidia_plugin/src/ops/strided_slice.cpp @@ -103,7 +103,9 @@ void StridedSliceOp::Execute(const InferenceRequestContext& context, } template -bool StridedSliceOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility StridedSliceOp::GetCudaGraphCompatibility() const { + return CudaGraphCompatibility::FULL; +} template WorkbufferRequest StridedSliceOp::GetWorkBufferRequest() const { diff --git a/modules/nvidia_plugin/src/ops/strided_slice.hpp b/modules/nvidia_plugin/src/ops/strided_slice.hpp index f154e8967..9754cafb3 100644 --- a/modules/nvidia_plugin/src/ops/strided_slice.hpp +++ b/modules/nvidia_plugin/src/ops/strided_slice.hpp @@ -39,7 +39,7 @@ class StridedSliceOp : public OperationBase { Outputs outputs, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; diff --git a/modules/nvidia_plugin/src/ops/subgraph.cpp b/modules/nvidia_plugin/src/ops/subgraph.cpp index 012cbb7a4..acda4f30f 100644 --- a/modules/nvidia_plugin/src/ops/subgraph.cpp +++ b/modules/nvidia_plugin/src/ops/subgraph.cpp @@ -128,43 +128,59 @@ std::vector> SubGraph::getSharedWorkbuffers(const IOperatio return result; } -void SubGraph::Capture(InferenceRequestContext &context, Inputs, Outputs, - const Workbuffers &workbuffers) const { +void SubGraph::Execute(const InferenceRequestContext& context, Inputs, Outputs, const Workbuffers& workbuffers) const { const auto& stream = context.getThreadContext().stream(); const auto& memoryManager = *memory_manager_; auto& mutableBuffer = workbuffers.mutable_buffers.at(0); auto& executionDelegator = context.getExecutionDelegator(); executionDelegator.set_stream(stream); - executionDelegator.capture_sequence(this, memoryManager, mutableBuffer, context); + executionDelegator.execute_sequence(this, memoryManager, mutableBuffer, context); } -WorkbufferRequest SubGraph::GetWorkBufferRequest() const { - const auto memoryBlockSize = 
memory_manager_->mutableTensorsMemoryModel()->deviceMemoryBlockSize(); - return {{}, {memoryBlockSize}}; +CudaGraphCompatibility SubGraph::GetCudaGraphCompatibility() const { + if (!is_compatibility_analyzed_) { + graph_compatibility_ = CudaGraphCompatibility::FULL; + for (const auto& op : exec_sequence_) { + auto opCompatibility = op->GetCudaGraphCompatibility(); + if (opCompatibility == CudaGraphCompatibility::SPECIAL) { + graph_compatibility_ = opCompatibility; + } else if (opCompatibility == CudaGraphCompatibility::NONE) { + graph_compatibility_ = opCompatibility; + break; + } + } + is_compatibility_analyzed_ = true; + } + return graph_compatibility_; } -void SubGraph::Execute(const InferenceRequestContext& context, Inputs, Outputs, const Workbuffers& workbuffers) const { +void SubGraph::Capture(InferenceRequestContext& context, Inputs, Outputs, const Workbuffers& workbuffers) const { const auto& stream = context.getThreadContext().stream(); const auto& memoryManager = *memory_manager_; auto& mutableBuffer = workbuffers.mutable_buffers.at(0); auto& executionDelegator = context.getExecutionDelegator(); executionDelegator.set_stream(stream); - executionDelegator.execute_sequence(this, memoryManager, mutableBuffer, context); + executionDelegator.capture_sequence(this, memoryManager, mutableBuffer, context); } -bool SubGraph::IsCudaGraphCompatible() const { - if (is_cuda_graph_compatible_ == CompatibleState::NOT_INITIALIZED) { - is_cuda_graph_compatible_ = CompatibleState::COMPATIBLE; - for (const auto& op : exec_sequence_) { - if (!op->IsCudaGraphCompatible()) { - is_cuda_graph_compatible_ = CompatibleState::NOT_COMPATIBLE; - break; - } - } - } - return is_cuda_graph_compatible_ == CompatibleState::COMPATIBLE; +void SubGraph::ExecuteGraph(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const { + const auto& stream = context.getThreadContext().stream(); + const auto& memoryManager = *memory_manager_; + auto& mutableBuffer = workbuffers.mutable_buffers.at(0); + + auto& executionDelegator = context.getExecutionDelegator(); + executionDelegator.set_stream(stream); + executionDelegator.execute_graph_sequence(this, memoryManager, mutableBuffer, context); +} + +WorkbufferRequest SubGraph::GetWorkBufferRequest() const { + const auto memoryBlockSize = memory_manager_->mutableTensorsMemoryModel()->deviceMemoryBlockSize(); + return {{}, {memoryBlockSize}}; } } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/subgraph.hpp b/modules/nvidia_plugin/src/ops/subgraph.hpp index 443aeea96..2bf4f5651 100644 --- a/modules/nvidia_plugin/src/ops/subgraph.hpp +++ b/modules/nvidia_plugin/src/ops/subgraph.hpp @@ -32,12 +32,17 @@ class SubGraph : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; + void Capture(InferenceRequestContext& context, Inputs inputTensors, Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + void ExecuteGraph(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const override; inline std::shared_ptr memoryManager() const { return memory_manager_; } @@ -78,8 +83,6 @@ class SubGraph : public OperationBase { ov::Shape shape_{}; }; - enum class CompatibleState { NOT_INITIALIZED = -1, NOT_COMPATIBLE, COMPATIBLE }; - std::shared_ptr memory_manager_; std::vector params_; 
std::vector params_info_; @@ -88,7 +91,8 @@ class SubGraph : public OperationBase { std::vector results_info_; std::shared_ptr model_; - mutable CompatibleState is_cuda_graph_compatible_ = CompatibleState::NOT_INITIALIZED; + mutable CudaGraphCompatibility graph_compatibility_; + mutable bool is_compatibility_analyzed_ = false; }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/swish.cpp b/modules/nvidia_plugin/src/ops/swish.cpp index 61244cef9..7308d7e07 100644 --- a/modules/nvidia_plugin/src/ops/swish.cpp +++ b/modules/nvidia_plugin/src/ops/swish.cpp @@ -69,7 +69,7 @@ void SwishOp::Execute(const InferenceRequestContext& context, (*kernel_)(stream.get(), inputTensors[0].get(), outputTensors[0].get()); } -bool SwishOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility SwishOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } OPERATION_REGISTER(SwishOp, Swish); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/swish.hpp b/modules/nvidia_plugin/src/ops/swish.hpp index 26a353850..8f1ddbb3a 100644 --- a/modules/nvidia_plugin/src/ops/swish.hpp +++ b/modules/nvidia_plugin/src/ops/swish.hpp @@ -24,7 +24,7 @@ class SwishOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: std::optional kernel_; diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp index c92238aec..0d5eee11b 100644 --- a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp +++ b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp @@ -133,6 +133,28 @@ TensorIteratorOp::TensorIteratorOp(const CreationContext& context, } updateExecSequence(); + + // Input mapping of ports + slices_.reserve(portmap_inputs_.size()); + for (const auto& it : portmap_inputs_) { + const auto& inputIdx = it.first; + const auto& paramIdx = inputs_parameters_map_.at(inputIdx); + slices_.emplace_back(*this, inputIdx, paramIdx); + } + + // Back-edge mapping + transfers_.reserve(results_parameters_map_.size()); + for (const auto& [resultIdx, paramIdx] : results_parameters_map_) { + transfers_.emplace_back(*this, resultIdx, paramIdx); + } + + // Output mapping of ports + inserts_.reserve(results_outputs_map_.size()); + for (const auto& [resultIdx, outputIdx] : results_outputs_map_) { + if (portmap_outputs_.count(outputIdx) > 0) { + inserts_.emplace_back(*this, resultIdx, outputIdx); + } + } } void TensorIteratorOp::Execute(const InferenceRequestContext& context, @@ -142,64 +164,212 @@ void TensorIteratorOp::Execute(const InferenceRequestContext& context, const auto& stream = context.getThreadContext().stream(); const auto& memoryManager = *memory_manager_; auto& mutableBuffer = workbuffers.mutable_buffers.at(0); - auto& cancellationToken = context.getCancellationToken(); auto& executionDelegator = context.getExecutionDelegator(); executionDelegator.set_stream(stream); // First iteration for (const auto inputIdx : invariant_inputs_) { const auto paramIdx = inputs_parameters_map_.at(inputIdx); - copyParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); + transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); } for (const auto& [inputIdx, paramIdx] : inputs_parameters_map_) { if (portmap_inputs_.count(inputIdx) == 0) { - copyParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); + transferParam(stream, 
mutableBuffer, inputTensors, 0, inputIdx, paramIdx); } } for (int64_t iter = 0; iter < num_iterations_; ++iter) { - // Input mapping of ports - for (auto& it : portmap_inputs_) { - const auto& inputIdx = it.first; - const auto& paramIdx = inputs_parameters_map_.at(inputIdx); - copyParam(stream, mutableBuffer, inputTensors, iter, inputIdx, paramIdx); + for (const auto& slice : slices_) { + slice(stream, inputTensors, mutableBuffer, iter); } // Inner loop executionDelegator.execute_sequence(this, memoryManager, mutableBuffer, context); // Back-edge mapping - for (auto& [resultIdx, paramIdx] : results_parameters_map_) { - copyBackEdge(stream, mutableBuffer, resultIdx, paramIdx); + for (const auto& transfer : transfers_) { + transfer(stream, mutableBuffer); } // Output mapping of ports - for (const auto& [resultIdx, outputIdx] : results_outputs_map_) { - if (portmap_outputs_.count(outputIdx) > 0) { - copyResult(stream, mutableBuffer, outputTensors, iter, resultIdx, outputIdx); - } + for (const auto& insert : inserts_) { + insert(stream, mutableBuffer, outputTensors, iter); } // Copy data to output if (iterations_results_map_.count(iter) > 0) { for (const auto& resultIdx : iterations_results_map_.at(iter)) { const auto& outputIdx = results_outputs_map_.at(resultIdx); - copyResult(stream, mutableBuffer, outputTensors, iter, resultIdx, outputIdx); + transferResult(stream, mutableBuffer, outputTensors, iter, resultIdx, outputIdx); } } } } -// TODO: Investigate problem with multi-graphs in some networks -// benchmark_app may hang in throughput mode -bool TensorIteratorOp::IsCudaGraphCompatible() const { return false; } +CudaGraphCompatibility TensorIteratorOp::GetCudaGraphCompatibility() const { + // This implementation is CUDA graph compatible only if this is the standard TI with output only of the last + // iteration (which is handled outside of the iterations loop) + if (iterations_results_map_.size() != 1 || iterations_results_map_.count(num_iterations_ - 1) == 0) { + return CudaGraphCompatibility::NONE; + } + return SubGraph::GetCudaGraphCompatibility() == CudaGraphCompatibility::NONE ? 
CudaGraphCompatibility::NONE + : CudaGraphCompatibility::SPECIAL; +} void TensorIteratorOp::Capture(InferenceRequestContext& context, Inputs inputTensors, Outputs outputTensors, const Workbuffers& workbuffers) const { - Execute(context, inputTensors, outputTensors, workbuffers); + const auto& stream = context.getThreadContext().stream(); + const auto& memoryManager = *memory_manager_; + auto& mutableBuffer = workbuffers.mutable_buffers.at(0); + auto& executionDelegator = context.getExecutionDelegator(); + executionDelegator.set_stream(stream); + auto& graphInfo = context.getCudaGraphContext().get_current_graph_info(); + + CUDA::GraphCapture capture{stream}; + { + auto scope = capture.getScope(); + // Input mapping of ports + for (auto& slice : slices_) { + slice.add_kernel_node(graphInfo, stream, mutableBuffer, inputTensors); + } + + // Inner loop + executionDelegator.capture_sequence(this, memoryManager, mutableBuffer, context); + + // Back-edge mapping + for (auto& transfer : transfers_) { + transfer.add_transfer_node(graphInfo, stream, mutableBuffer); + } + + // Output mapping of ports + for (auto& insert : inserts_) { + insert.add_kernel_node(graphInfo, stream, mutableBuffer, outputTensors); + } + } + graphInfo.set_graph(capture.getGraph()); +} + +void TensorIteratorOp::ExecuteGraph(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const { + const auto& stream = context.getThreadContext().stream(); + const auto& memoryManager = *memory_manager_; + const auto& mutableBuffer = workbuffers.mutable_buffers.at(0); + + // First iteration; this part doesn't use CUDA graphs yet + for (const auto inputIdx : invariant_inputs_) { + const auto paramIdx = inputs_parameters_map_.at(inputIdx); + transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); + } + for (const auto& [inputIdx, paramIdx] : inputs_parameters_map_) { + if (portmap_inputs_.count(inputIdx) == 0) { + transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); + } + } + + auto& graphInfo = context.getCudaGraphContext().get_current_graph_info(); + OPENVINO_ASSERT(graphInfo.get_kernels_count() == slices_.size() + inserts_.size(), + "CudaGraphContext/TensorIteratorOp slices or inserts count inconsistency"); + + // TI body loop + for (int64_t iter = 0; iter < num_iterations_; ++iter) { + for (std::size_t i = 0; i < slices_.size(); ++i) { + slices_[i].update_kernel_node(graphInfo, i, mutableBuffer, inputTensors, iter); + } + for (std::size_t i = 0; i < inserts_.size(); ++i) { + inserts_[i].update_kernel_node(graphInfo, i + slices_.size(), mutableBuffer, outputTensors, iter); + } + graphInfo.launch(stream); + } + + // Copy data to output; this part doesn't use CUDA graphs yet + if (iterations_results_map_.count(num_iterations_ - 1) > 0) { + for (const auto& resultIdx : iterations_results_map_.at(num_iterations_ - 1)) { + const auto& outputIdx = results_outputs_map_.at(resultIdx); + transferResult(stream, mutableBuffer, outputTensors, num_iterations_ - 1, resultIdx, outputIdx); + } + } +} + +TensorIteratorOp::SliceLauncher::SliceLauncher(const TensorIteratorOp& ti, uint64_t inputIdx, uint64_t paramIdx) + : input_idx_{inputIdx}, + param_{*ti.params_[paramIdx]}, + memory_manager_{*ti.memory_manager_}, + slice_{ti.kernelmap_inputs_.at(inputIdx)} { + OPENVINO_ASSERT(ti.portmap_inputs_.count(inputIdx) != 0, "Node name: ", ti.GetName()); + const auto& portMap = ti.portmap_inputs_.at(input_idx_); + const auto& inputShape = 
ti.inputs_info_[input_idx_].shape_; + start_ = portMap.start < 0 ? inputShape[portMap.axis] + portMap.start : portMap.start; + stride_ = portMap.stride; +} + +void TensorIteratorOp::SliceLauncher::add_kernel_node(CudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Inputs& inputTensors) { + const auto* src = inputTensors[input_idx_].get(); + auto* dst = memory_manager_.outputTensorPointers(param_, mutableBuffer)[0].get(); + info.add_kernel(stream, + slice_.getKernel(), + slice_.getNumBlocks(), + slice_.getThreadsPerBlock(), + slice_.getPropsPtr(), + start_, + slice_.getSize(), + src, + dst); +} + +TensorIteratorOp::TransferLauncher::TransferLauncher(const TensorIteratorOp& ti, uint64_t resultIdx, uint64_t paramIdx) + : param_{*ti.params_[paramIdx]}, result_{*ti.results_[resultIdx]}, memory_manager_{*ti.memory_manager_} { + param_size_ = ti.params_info_[paramIdx].size_; + const auto resultSize = ti.results_info_[resultIdx].size_; + OPENVINO_ASSERT(param_size_ == resultSize, "Node name: ", ti.GetName()); +} + +void TensorIteratorOp::TransferLauncher::add_transfer_node(CudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer) { + const auto& paramTensors = memory_manager_.outputTensorPointers(param_, mutableBuffer); + auto dst = paramTensors[0]; + const auto& resultTensors = memory_manager_.inputTensorPointers(result_, mutableBuffer); + const auto src = resultTensors[0]; + info.add_transfer(stream, dst, src, param_size_); +} + +TensorIteratorOp::InsertLauncher::InsertLauncher(const TensorIteratorOp& ti, + const std::size_t resultIdx, + const std::size_t outputIdx) + : output_idx_{outputIdx}, + result_{*ti.results_[resultIdx]}, + memory_manager_{*ti.memory_manager_}, + insert_{ti.kernelmap_outputs_.at(outputIdx)} { + OPENVINO_ASSERT(ti.portmap_outputs_.count(outputIdx) != 0, "Node name: ", ti.GetName()); + const auto& portMap = ti.portmap_outputs_.at(output_idx_); + const auto& outputShape = ti.outputs_info_[output_idx_].shape_; + start_ = portMap.start < 0 ? 
outputShape[portMap.axis] + portMap.start : portMap.start; + stride_ = portMap.stride; +} + +void TensorIteratorOp::InsertLauncher::add_kernel_node(CudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Outputs& outputTensors) { + const auto* src = memory_manager_.inputTensorPointers(result_, mutableBuffer)[0].get(); + auto* dst = outputTensors[output_idx_].get(); + info.add_kernel(stream, + insert_.getKernel(), + insert_.getNumBlocks(), + insert_.getThreadsPerBlock(), + insert_.getPropsPtr(), + start_, + insert_.getSize(), + src, + dst); } WorkbufferRequest TensorIteratorOp::GetWorkBufferRequest() const { @@ -227,87 +397,42 @@ void TensorIteratorOp::InitSharedImmutableWorkbuffers(const Buffers& buffers) { } } -void TensorIteratorOp::copyParam(const CUDA::Stream& stream, - const CUDA::DevicePointer mutableBuffer, - const IOperationExec::Inputs& inputTensors, - const std::int64_t iter, - const uint64_t inputIdx, - const uint64_t paramIdx) const { +void TensorIteratorOp::transferParam(const CUDA::Stream& stream, + const CUDA::DevicePointer mutableBuffer, + const IOperationExec::Inputs& inputTensors, + const std::int64_t iter, + const uint64_t inputIdx, + const uint64_t paramIdx) const { + OPENVINO_ASSERT(portmap_inputs_.count(inputIdx) == 0, "Node name: ", GetName()); auto& memoryManager = *memory_manager_; const std::size_t inputSize = inputs_info_[inputIdx].size_; const std::size_t paramSize = params_info_[paramIdx].size_; - if (portmap_inputs_.count(inputIdx) == 0) { - auto& input = inputTensors[inputIdx]; - const auto& param = params_[paramIdx]; - auto outputTensors = memoryManager.outputTensorPointers(*param, mutableBuffer); - OPENVINO_ASSERT(inputSize == paramSize, "Node name: ", GetName()); - stream.transfer(outputTensors[0], input, inputSize); - } else { - const auto& portMap = portmap_inputs_.at(inputIdx); - const auto& param = params_[paramIdx]; - auto outputTensors = memoryManager.outputTensorPointers(*param, mutableBuffer); - const auto inputShape = inputs_info_[inputIdx].shape_; - - const auto& slice = kernelmap_inputs_.at(inputIdx); - std::size_t start; - if (portMap.start < 0) { - start = inputShape[portMap.axis] + portMap.start; - } else { - start = portMap.start; - } - start += iter * portMap.stride; - auto input = inputTensors[inputIdx]; - slice(stream.get(), input.get(), outputTensors[0].get(), start); - } -} -void TensorIteratorOp::copyBackEdge(const CUDA::Stream& stream, - CUDA::DevicePointer mutableBuffer, - const uint64_t resultIdx, - const uint64_t paramIdx) const { - auto& memoryManager = *memory_manager_; - const auto& result = results_[resultIdx]; + auto& input = inputTensors[inputIdx]; const auto& param = params_[paramIdx]; - auto paramTensors = memoryManager.outputTensorPointers(*param, mutableBuffer); - auto resultTensors = memoryManager.inputTensorPointers(*result, mutableBuffer); - const std::size_t paramSize = params_info_[paramIdx].size_; - const std::size_t resultSize = results_info_[resultIdx].size_; - OPENVINO_ASSERT(paramSize == resultSize, "Node name: ", GetName()); - stream.transfer(paramTensors[0], resultTensors[0], paramSize); + auto outputTensors = memoryManager.outputTensorPointers(*param, mutableBuffer); + OPENVINO_ASSERT(inputSize == paramSize, "Node name: ", GetName()); + + stream.transfer(outputTensors[0], input, inputSize); } -void TensorIteratorOp::copyResult(const CUDA::Stream& stream, - CUDA::DevicePointer mutableBuffer, - const IOperationExec::Outputs& outputTensors, - const 
std::int64_t iter, - const std::size_t resultIdx, - const std::size_t outputIdx) const { +void TensorIteratorOp::transferResult(const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Outputs& outputTensors, + const std::int64_t iter, + const std::size_t resultIdx, + const std::size_t outputIdx) const { + OPENVINO_ASSERT(portmap_outputs_.count(outputIdx) == 0, "Node name: ", GetName()); auto& memoryManager = *memory_manager_; const auto resultSize = results_info_[resultIdx].size_; const std::size_t outputSize = outputs_info_[outputIdx].size_; - if (portmap_outputs_.count(outputIdx) == 0) { - const auto result = results_[resultIdx]; - auto inTensors = memoryManager.inputTensorPointers(*result, mutableBuffer); - const auto output = outputTensors[outputIdx]; - OPENVINO_ASSERT(resultSize == outputSize, "Node name: ", GetName()); - stream.transfer(output, inTensors[0], outputSize); - } else { - auto output = outputTensors[outputIdx]; - const auto& result = results_[resultIdx]; - auto inputTensors = memoryManager.inputTensorPointers(*result, mutableBuffer); - const auto portMap = portmap_outputs_.at(outputIdx); - const auto outputShape = outputs_info_[outputIdx].shape_; - - const auto& insert = kernelmap_outputs_.at(outputIdx); - std::size_t start; - if (portMap.start < 0) { - start = outputShape[portMap.axis] + portMap.start; - } else { - start = portMap.start; - } - start += iter * portMap.stride; - insert(stream.get(), inputTensors[0].get(), output.get(), start); - } + + const auto result = results_[resultIdx]; + auto inTensors = memoryManager.inputTensorPointers(*result, mutableBuffer); + const auto output = outputTensors[outputIdx]; + OPENVINO_ASSERT(resultSize == outputSize, "Node name: ", GetName()); + + stream.transfer(output, inTensors[0], outputSize); } void TensorIteratorOp::updateExecSequence() { diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp index d172f360f..d176b742f 100644 --- a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp +++ b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -27,13 +28,18 @@ class TensorIteratorOp : public SubGraph { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void Capture(InferenceRequestContext& context, Inputs inputTensors, Outputs outputTensors, const Workbuffers& workbuffers) const override; + void ExecuteGraph(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const override; + private: struct PortMap { int64_t start{0}; @@ -43,25 +49,119 @@ class TensorIteratorOp : public SubGraph { int64_t axis{0}; }; + class SliceLauncher { + public: + SliceLauncher(const TensorIteratorOp& ti, uint64_t inputIdx, uint64_t paramIdx); + + void operator()(const CUDA::Stream& stream, + const IOperationExec::Inputs& inputTensors, + CUDA::DevicePointer mutableBuffer, + int64_t iter) const { + const auto* src = inputTensors[input_idx_].get(); + auto* dst = memory_manager_.outputTensorPointers(param_, mutableBuffer)[0].get(); + slice_(stream.get(), src, dst, start_ + iter * stride_); + } + + void add_kernel_node(CudaGraphInfo& info, + const 
CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Inputs& inputTensors); + + void update_kernel_node(CudaGraphInfo& info, + std::size_t index, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Inputs& inputTensors, + int64_t iter) { + const auto* src = inputTensors[input_idx_].get(); + auto* dst = memory_manager_.outputTensorPointers(param_, mutableBuffer)[0].get(); + info.update_kernel(index, slice_.getPropsPtr(), start_ + iter * stride_, slice_.getSize(), src, dst); + } + + private: + uint64_t input_idx_; + const OperationBase& param_; + const MemoryManager& memory_manager_; + const kernel::Slice& slice_; + size_t start_; + int64_t stride_; + }; + + class TransferLauncher { + public: + TransferLauncher(const TensorIteratorOp& ti, uint64_t resultIdx, uint64_t paramIdx); + + void operator()(const CUDA::Stream& stream, CUDA::DevicePointer mutableBuffer) const { + const auto& paramTensors = memory_manager_.outputTensorPointers(param_, mutableBuffer); + const auto& resultTensors = memory_manager_.inputTensorPointers(result_, mutableBuffer); + auto* dst = paramTensors[0].get(); + const auto* src = resultTensors[0].get(); + throwIfError(cudaMemcpyAsync(dst, src, param_size_, cudaMemcpyDeviceToDevice, stream.get())); + } + + void add_transfer_node(CudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer); + + private: + const OperationBase& param_; + const OperationBase& result_; + const MemoryManager& memory_manager_; + std::size_t param_size_; + }; + + class InsertLauncher { + public: + InsertLauncher(const TensorIteratorOp& ti, const std::size_t resultIdx, const std::size_t outputIdx); + + void operator()(const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Outputs& outputTensors, + int64_t iter) const { + const auto* src = memory_manager_.inputTensorPointers(result_, mutableBuffer)[0].get(); + auto* dst = outputTensors[output_idx_].get(); + insert_(stream.get(), src, dst, start_ + iter * stride_); + } + + void add_kernel_node(CudaGraphInfo& info, + const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Outputs& outputTensors); + + void update_kernel_node(CudaGraphInfo& info, + std::size_t index, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Outputs& outputTensors, + int64_t iter) { + const auto* src = memory_manager_.inputTensorPointers(result_, mutableBuffer)[0].get(); + auto* dst = outputTensors[output_idx_].get(); + info.update_kernel(index, insert_.getPropsPtr(), start_ + iter * stride_, insert_.getSize(), src, dst); + } + + private: + uint64_t output_idx_; + const OperationBase& result_; + const MemoryManager& memory_manager_; + size_t start_; + int64_t stride_; + const kernel::Insert& insert_; + }; + WorkbufferRequest GetWorkBufferRequest() const override; void InitSharedImmutableWorkbuffers(const Buffers& buffers) override; - void copyParam(const CUDA::Stream& stream, - CUDA::DevicePointer mutableBuffer, - const IOperationExec::Inputs& inputTensors, - std::int64_t iter, - uint64_t inputIdx, - uint64_t paramIdx) const; - void copyBackEdge(const CUDA::Stream& stream, - CUDA::DevicePointer mutableBuffer, - uint64_t resultIdx, - uint64_t paramIdx) const; - void copyResult(const CUDA::Stream& stream, - CUDA::DevicePointer mutableBuffer, - const IOperationExec::Outputs& outputTensors, - int64_t iter, - std::size_t resultIdx, - std::size_t outputIdx) const; + void transferParam(const CUDA::Stream& stream, + CUDA::DevicePointer 
mutableBuffer, + const IOperationExec::Inputs& inputTensors, + std::int64_t iter, + uint64_t inputIdx, + uint64_t paramIdx) const; + + void transferResult(const CUDA::Stream& stream, + CUDA::DevicePointer mutableBuffer, + const IOperationExec::Outputs& outputTensors, + int64_t iter, + std::size_t resultIdx, + std::size_t outputIdx) const; void updateExecSequence(); @@ -78,6 +178,10 @@ class TensorIteratorOp : public SubGraph { std::unordered_map portmap_outputs_; std::unordered_map kernelmap_outputs_; std::unordered_map results_parameters_map_; + + mutable std::vector slices_; + mutable std::vector transfers_; + mutable std::vector inserts_; }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/topk.cpp b/modules/nvidia_plugin/src/ops/topk.cpp index 7b72deaa6..83af29d21 100644 --- a/modules/nvidia_plugin/src/ops/topk.cpp +++ b/modules/nvidia_plugin/src/ops/topk.cpp @@ -172,7 +172,7 @@ void TopKOp::Execute(const InferenceRequestContext& context, static_cast(kernel_param.get())); } -bool TopKOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility TopKOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } void TopKOp::InitSharedImmutableWorkbuffers(const Buffers& buffers) { OPENVINO_ASSERT(buffers.size() == 1, "Node name: ", GetName()); diff --git a/modules/nvidia_plugin/src/ops/topk.hpp b/modules/nvidia_plugin/src/ops/topk.hpp index bf311c1ed..aeb1ea01d 100644 --- a/modules/nvidia_plugin/src/ops/topk.hpp +++ b/modules/nvidia_plugin/src/ops/topk.hpp @@ -25,7 +25,7 @@ class TopKOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; void InitSharedImmutableWorkbuffers(const Buffers&) override; WorkbufferRequest GetWorkBufferRequest() const override; diff --git a/modules/nvidia_plugin/src/ops/transpose.cpp b/modules/nvidia_plugin/src/ops/transpose.cpp index 1ab7085e0..0cbe59947 100644 --- a/modules/nvidia_plugin/src/ops/transpose.cpp +++ b/modules/nvidia_plugin/src/ops/transpose.cpp @@ -113,7 +113,7 @@ void TransposeOp::Execute(const InferenceRequestContext& context, context.getThreadContext().stream().get())); } -bool TransposeOp::IsCudaGraphCompatible() const { return true; } +CudaGraphCompatibility TransposeOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::FULL; } std::vector TransposeOp::extractInputExtents(const ov::Node& node) { std::vector result; diff --git a/modules/nvidia_plugin/src/ops/transpose.hpp b/modules/nvidia_plugin/src/ops/transpose.hpp index cab45c730..bfb9fd099 100644 --- a/modules/nvidia_plugin/src/ops/transpose.hpp +++ b/modules/nvidia_plugin/src/ops/transpose.hpp @@ -24,7 +24,7 @@ class TransposeOp : public OperationCuTensor { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: using ExtentsMap = std::unordered_map; diff --git a/modules/nvidia_plugin/src/ops/variadic_split.cpp b/modules/nvidia_plugin/src/ops/variadic_split.cpp index 807c2bdca..e83ba5ec5 100644 --- a/modules/nvidia_plugin/src/ops/variadic_split.cpp +++ b/modules/nvidia_plugin/src/ops/variadic_split.cpp @@ -199,7 +199,7 @@ void VariadicSplitOp::Execute(const InferenceRequestContext& context, static_cast(axis_offset_sizes.get())); } -bool VariadicSplitOp::IsCudaGraphCompatible() const { return false; } +CudaGraphCompatibility 
VariadicSplitOp::GetCudaGraphCompatibility() const { return CudaGraphCompatibility::NONE; } OPERATION_REGISTER(VariadicSplitOp, VariadicSplit); } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/variadic_split.hpp b/modules/nvidia_plugin/src/ops/variadic_split.hpp index 82580d546..0ede2a7dc 100644 --- a/modules/nvidia_plugin/src/ops/variadic_split.hpp +++ b/modules/nvidia_plugin/src/ops/variadic_split.hpp @@ -28,7 +28,7 @@ class VariadicSplitOp : public OperationBase { Outputs outputTensors, const Workbuffers& workbuffers) const override; - bool IsCudaGraphCompatible() const override; + CudaGraphCompatibility GetCudaGraphCompatibility() const override; private: enum { kOutputPtrsMWBIdx = 0, kNumberOfMWBIdx }; diff --git a/modules/nvidia_plugin/tests/unit/is_cuda_graph_compatible.cpp b/modules/nvidia_plugin/tests/unit/cuda_graph_compatibility.cpp similarity index 95% rename from modules/nvidia_plugin/tests/unit/is_cuda_graph_compatible.cpp rename to modules/nvidia_plugin/tests/unit/cuda_graph_compatibility.cpp index 02ab32e2f..08766acdb 100644 --- a/modules/nvidia_plugin/tests/unit/is_cuda_graph_compatible.cpp +++ b/modules/nvidia_plugin/tests/unit/cuda_graph_compatibility.cpp @@ -22,7 +22,7 @@ using namespace ov::nvidia_gpu; using DevPtr = CUDA::DevicePointer; using CDevPtr = CUDA::DevicePointer; -struct IsCudaGraphCompatibleTest : testing::Test { +struct CudaGraphCompatibilityTest : testing::Test { template static void generate(C& c) { std::random_device randDevice; @@ -40,7 +40,7 @@ struct IsCudaGraphCompatibleTest : testing::Test { OperationBase::Outputs outputs, const Workbuffers& workbuffers) { auto& stream = context.getThreadContext().stream(); - if (operation->IsCudaGraphCompatible()) { + if (operation->GetCudaGraphCompatibility() == CudaGraphCompatibility::FULL) { stream.synchronize(); CUDA::GraphCapture capture{stream}; { @@ -59,7 +59,7 @@ struct IsCudaGraphCompatibleTest : testing::Test { } }; -struct ReluIsCudaGraphCompatibleTest : IsCudaGraphCompatibleTest { +struct ReluCudaGraphCompatibilityTest : CudaGraphCompatibilityTest { void run() { using ElementType = float; @@ -130,9 +130,9 @@ struct ReluIsCudaGraphCompatibleTest : IsCudaGraphCompatibleTest { } }; -TEST_F(ReluIsCudaGraphCompatibleTest, Compatibile) { run(); } +TEST_F(ReluCudaGraphCompatibilityTest, Compatibile) { run(); } -struct ConcatIsCudaGraphCompatibleTest : IsCudaGraphCompatibleTest { +struct ConcatCudaGraphCompatibilityTest : CudaGraphCompatibilityTest { void run() { using ElementType = float; @@ -228,6 +228,6 @@ struct ConcatIsCudaGraphCompatibleTest : IsCudaGraphCompatibleTest { } }; -TEST_F(ConcatIsCudaGraphCompatibleTest, NotCompatible) { run(); } +TEST_F(ConcatCudaGraphCompatibilityTest, NotCompatible) { run(); } } // namespace diff --git a/modules/nvidia_plugin/tests/unit/cuda_graph_nodes_test.cpp b/modules/nvidia_plugin/tests/unit/cuda_graph_nodes_test.cpp new file mode 100644 index 000000000..9b990faf3 --- /dev/null +++ b/modules/nvidia_plugin/tests/unit/cuda_graph_nodes_test.cpp @@ -0,0 +1,139 @@ +// Copyright (C) 2020-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include + +using namespace ov::nvidia_gpu; + +class KernelNodeTest : public testing::Test {}; + +TEST_F(KernelNodeTest, InsertKernel) { + constexpr size_t in_size = 2 * 1 * 4; + constexpr size_t out_size = 2 * 3 * 4; + kernel::Type_t element_type = kernel::Type_t::i32; + kernel::Insert::Props props; + props.old_shape[0] = 2; + props.old_shape[1] = 1; + 
props.old_shape[2] = 4; + props.new_shape[0] = 2; + props.new_shape[1] = 3; + props.new_shape[2] = 4; + props.axe = 1; + const size_t start = 1; + auto insert = kernel::Insert(element_type, props, CUDA::Device{}.props().maxThreadsPerBlock); + + CUDA::Stream stream{}; + auto iwb = stream.malloc(insert.getImmutableWorkbufferSize()); + insert.setImmutableWorkbuffer(iwb.get()); + + // Regular kernel + graph with KernelNode + const int32_t in_arr1[2][1][4] = {{{1, 42, 38, 17}}, {{1, 2, 18, 17}}}; + auto src1 = stream.malloc(sizeof(int32_t) * in_size); + auto dst1 = stream.malloc(sizeof(int32_t) * out_size); + auto host_out_arr1 = std::make_unique(out_size); + + stream.upload(src1, in_arr1, sizeof(int32_t) * in_size); + insert(stream.get(), src1.get(), dst1.get(), start); + stream.download(host_out_arr1.get(), dst1, sizeof(int32_t) * out_size); + + auto dst1_graph = stream.malloc(sizeof(int32_t) * out_size); + + std::optional kernel_node; + CUDA::GraphCapture capture{stream}; + { + auto scope = capture.getScope(); + CUDA::CaptureInfo captureInfo{stream}; + kernel_node.emplace(captureInfo.addKernelNode(insert.getKernel(), + insert.getNumBlocks(), + insert.getThreadsPerBlock(), + insert.getPropsPtr(), + start, + insert.getSize(), + src1.get(), + dst1_graph.get())); + } + CUDA::GraphExec graph_exec{capture.getGraph()}; + graph_exec.launch(stream); + + auto host_out_arr1_graph = std::make_unique(out_size); + stream.download(host_out_arr1_graph.get(), dst1_graph, sizeof(int32_t) * out_size); + stream.synchronize(); + + ASSERT_TRUE(std::equal(host_out_arr1.get(), host_out_arr1.get() + out_size, host_out_arr1_graph.get())); + + // Regular kernel + updated graph with KernelNode + const int32_t in_arr2[2][1][4] = {{{31, 2, 8, 10}}, {{20, 12, 1, 7}}}; + + auto src2 = stream.malloc(sizeof(int32_t) * in_size); + auto dst2 = stream.malloc(sizeof(int32_t) * out_size); + auto host_out_arr2 = std::make_unique(out_size); + + stream.upload(src2, in_arr2, sizeof(int32_t) * in_size); + insert(stream.get(), src2.get(), dst2.get(), start); + stream.download(host_out_arr2.get(), dst2, sizeof(int32_t) * out_size); + + auto dst2_graph = stream.malloc(sizeof(int32_t) * out_size); + auto host_out_arr2_graph = std::make_unique(out_size); + + kernel_node.value().update_params( + graph_exec, insert.getPropsPtr(), start, insert.getSize(), src2.get(), dst2_graph.get()); + graph_exec.launch(stream); + stream.download(host_out_arr2_graph.get(), dst2_graph, sizeof(int32_t) * out_size); + stream.synchronize(); + + ASSERT_TRUE(std::equal(host_out_arr2.get(), host_out_arr2.get() + out_size, host_out_arr2_graph.get())); +} + +class TransferNodeTest : public testing::Test {}; + +TEST_F(TransferNodeTest, Transfer) { + constexpr size_t size = 2 * 1 * 4; + const int32_t host_arr1[2][1][4] = {{{1, 42, 38, 17}}, {{1, 2, 18, 17}}}; + CUDA::Stream stream{}; + + // Transfer with graph and TransferNode + auto src1 = stream.malloc(sizeof(int32_t) * size); + auto dst1 = stream.malloc(sizeof(int32_t) * size); + const auto host_out_arr1 = std::make_unique(size); + + stream.upload(src1, host_arr1, sizeof(int32_t) * size); + + std::optional transfer_node; + CUDA::GraphCapture capture{stream}; + { + auto scope = capture.getScope(); + CUDA::CaptureInfo captureInfo{stream}; + transfer_node.emplace(captureInfo.addTransferNode(dst1, src1, sizeof(int32_t) * size)); + } + CUDA::GraphExec graph_exec{capture.getGraph()}; + graph_exec.launch(stream); + + stream.download(host_out_arr1.get(), dst1, sizeof(int32_t) * size); + stream.synchronize(); + + const 
auto* src_ptr1 = static_cast<const int32_t*>(static_cast<const void*>(host_arr1)); + ASSERT_TRUE(std::equal(src_ptr1, src_ptr1 + size, host_out_arr1.get())); + + // Transfer with graph and updated TransferNode + const int32_t host_arr2[2][1][4] = {{{31, 2, 8, 10}}, {{20, 12, 1, 7}}}; + + auto src2 = stream.malloc(sizeof(int32_t) * size); + auto dst2 = stream.malloc(sizeof(int32_t) * size); + auto host_out_arr2 = std::make_unique<int32_t[]>(size); + + stream.upload(src2, host_arr2, sizeof(int32_t) * size); + + transfer_node.value().update_ptrs(graph_exec, dst2, src2); + graph_exec.launch(stream); + + stream.download(host_out_arr2.get(), dst2, sizeof(int32_t) * size); + stream.synchronize(); + + const auto* src_ptr2 = static_cast<const int32_t*>(static_cast<const void*>(host_arr2)); + ASSERT_TRUE(std::equal(src_ptr2, src_ptr2 + size, host_out_arr2.get())); +} diff --git a/modules/nvidia_plugin/tests/unit/cuda_multi_graph_test.cpp b/modules/nvidia_plugin/tests/unit/cuda_multi_graph_test.cpp index 1bbebc52c..12026e6ca 100644 --- a/modules/nvidia_plugin/tests/unit/cuda_multi_graph_test.cpp +++ b/modules/nvidia_plugin/tests/unit/cuda_multi_graph_test.cpp @@ -79,7 +79,7 @@ class AddMul { static void checkSubGraph(const SubGraph& subGraph) { // Original SubGraph for AddMul network should be CUDA Graph compatible - EXPECT_TRUE(subGraph.IsCudaGraphCompatible()); + EXPECT_EQ(subGraph.GetCudaGraphCompatibility(), CudaGraphCompatibility::FULL); } static std::vector> calcRefs( @@ -126,7 +126,7 @@ class AddConcat { static void checkSubGraph(const SubGraph& subGraph) { // Original SubGraph for AddConcat network should not be CUDA Graph compatible - EXPECT_FALSE(subGraph.IsCudaGraphCompatible()); + EXPECT_EQ(subGraph.GetCudaGraphCompatibility(), CudaGraphCompatibility::NONE); } static std::vector> calcRefs(
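As background for the transfer-node test above: the retarget-and-relaunch pattern it exercises can be reproduced with the plain CUDA runtime API alone. The following standalone sketch is illustrative only and is not plugin code; it assumes a CUDA 12.x toolkit (for the three-argument cudaGraphInstantiate) and reduces error handling to asserts.

// Standalone sketch: build a graph with one 1D device-to-device memcpy node,
// launch it, then retarget the node's pointers on the instantiated graph and
// launch again without rebuilding the graph.
#include <cuda_runtime.h>
#include <cassert>
#include <cstdio>
#include <vector>

#define CHECK(call) do { cudaError_t e = (call); assert(e == cudaSuccess); } while (0)

int main() {
    constexpr size_t n = 8;
    constexpr size_t bytes = n * sizeof(int);
    std::vector<int> hostA(n, 1), hostB(n, 2), out(n, 0);

    int *srcA = nullptr, *srcB = nullptr, *dst = nullptr;
    CHECK(cudaMalloc(&srcA, bytes));
    CHECK(cudaMalloc(&srcB, bytes));
    CHECK(cudaMalloc(&dst, bytes));
    CHECK(cudaMemcpy(srcA, hostA.data(), bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(srcB, hostB.data(), bytes, cudaMemcpyHostToDevice));

    // Graph with a single D2D memcpy node: dst <- srcA.
    cudaGraph_t graph = nullptr;
    cudaGraphNode_t copyNode = nullptr;
    CHECK(cudaGraphCreate(&graph, 0));
    CHECK(cudaGraphAddMemcpyNode1D(&copyNode, graph, nullptr, 0,
                                   dst, srcA, bytes, cudaMemcpyDeviceToDevice));

    // Instantiate and launch (CUDA 12.x signature of cudaGraphInstantiate).
    cudaGraphExec_t exec = nullptr;
    CHECK(cudaGraphInstantiate(&exec, graph, 0));

    cudaStream_t stream = nullptr;
    CHECK(cudaStreamCreate(&stream));
    CHECK(cudaGraphLaunch(exec, stream));
    CHECK(cudaStreamSynchronize(stream));
    CHECK(cudaMemcpy(out.data(), dst, bytes, cudaMemcpyDeviceToHost));
    assert(out[0] == 1);  // dst now holds srcA's contents

    // Retarget the same node to copy from srcB, then relaunch.
    CHECK(cudaGraphExecMemcpyNodeSetParams1D(exec, copyNode, dst, srcB, bytes,
                                             cudaMemcpyDeviceToDevice));
    CHECK(cudaGraphLaunch(exec, stream));
    CHECK(cudaStreamSynchronize(stream));
    CHECK(cudaMemcpy(out.data(), dst, bytes, cudaMemcpyDeviceToHost));
    assert(out[0] == 2);  // dst now holds srcB's contents

    std::printf("memcpy node update OK\n");

    CHECK(cudaGraphExecDestroy(exec));
    CHECK(cudaGraphDestroy(graph));
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(srcA));
    CHECK(cudaFree(srcB));
    CHECK(cudaFree(dst));
    return 0;
}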