diff --git a/modules/nvidia_plugin/src/cuda_graph_context.cpp b/modules/nvidia_plugin/src/cuda_graph_context.cpp
index d9078946b..91bb02411 100644
--- a/modules/nvidia_plugin/src/cuda_graph_context.cpp
+++ b/modules/nvidia_plugin/src/cuda_graph_context.cpp
@@ -7,6 +7,50 @@
 namespace ov {
 namespace nvidia_gpu {
 
+void TiCudaGraphInfo::add_transfer(const CUDA::Stream& stream,
+                                   CUDA::DevicePointer<void*> dst,
+                                   CUDA::DevicePointer<const void*> src,
+                                   std::size_t size) {
+    CUDA::CaptureInfo captureInfo{stream};
+    transferNodes_.emplace_back(captureInfo.addTransferNode(dst, src, size));
+}
+
+void TiCudaGraphInfo::add_kernel(const CUDA::Stream& stream, const cudaKernelNodeParams& knp) {
+    CUDA::CaptureInfo captureInfo{stream};
+    kernelNodes_.emplace_back(captureInfo.addKernelNode(knp));
+}
+
+void TiCudaGraphInfo::set_params_graph(const CUDA::Graph& graph) {
+    paramsGraph_.emplace(graph);
+    paramsGraphExec_.emplace(graph);
+}
+
+void TiCudaGraphInfo::set_body_graph(const CUDA::Graph& graph) {
+    bodyGraph_.emplace(graph);
+    bodyGraphExec_.emplace(graph);
+}
+
+void TiCudaGraphInfo::set_results_graph(const CUDA::Graph& graph) {
+    resultsGraph_.emplace(graph);
+    resultsGraphExec_.emplace(graph);
+}
+
+// bool TiCudaGraphInfo::is_initialized() const { return graph_.has_value() && graphExec_.has_value(); }
+
+void TiCudaGraphInfo::update_kernel(std::size_t index, const cudaKernelNodeParams& knp) {
+    kernelNodes_[index].update_params(bodyGraphExec_.value(), knp);
+}
+
+void TiCudaGraphInfo::launch_params_graph(const CUDA::Stream& stream) const { paramsGraphExec_.value().launch(stream); }
+
+void TiCudaGraphInfo::launch_body_graph(const CUDA::Stream& stream) const { bodyGraphExec_.value().launch(stream); }
+
+void TiCudaGraphInfo::launch_results_graph(const CUDA::Stream& stream) const { resultsGraphExec_.value().launch(stream); }
+
+std::size_t TiCudaGraphInfo::get_transfers_count() const { return transferNodes_.size(); }
+
+std::size_t TiCudaGraphInfo::get_kernels_count() const { return kernelNodes_.size(); }
+
 void CudaGraphContext::reset() {
     graphs_.clear();
     currentGraphIndex_ = 0;
@@ -17,9 +61,9 @@ void CudaGraphContext::start_next_graph_addition() {
     graphs_.emplace_back();
 }
 
-void CudaGraphContext::start_ti_graph_addition(const std::string& ti_op_name) {
-    ti_graphs_[ti_op_name] = {};
-}
+// void CudaGraphContext::start_ti_graph_addition(const std::string& ti_op_name) {
+//     ti_graphs_[ti_op_name] = {};
+// }
 
 void CudaGraphContext::add_parameter(const std::string& tensorName,
                                      const CUDA::Stream& stream,
@@ -39,27 +83,35 @@ void CudaGraphContext::add_result(const std::string& tensorName,
     graphs_[currentGraphIndex_].add_result(tensorName, stream, dst, src, size);
 }
 
-void CudaGraphContext::add_transfer(const std::string& ti_op_name,
-                                    const CUDA::Stream& stream,
-                                    CUDA::DevicePointer<void*> dst,
-                                    CUDA::DevicePointer<const void*> src,
-                                    std::size_t size) {
-    ti_graphs_.at(ti_op_name).add_transfer(stream, dst, src, size);
-}
+// void CudaGraphContext::add_transfer(const std::string& ti_op_name,
+//                                     const CUDA::Stream& stream,
+//                                     CUDA::DevicePointer<void*> dst,
+//                                     CUDA::DevicePointer<const void*> src,
+//                                     std::size_t size) {
+//     ti_graphs_.at(ti_op_name).add_transfer(stream, dst, src, size);
+// }
 
-void CudaGraphContext::add_kernel(const std::string& ti_op_name,
-                                  const CUDA::Stream& stream,
-                                  const cudaKernelNodeParams& knp) {
-    ti_graphs_.at(ti_op_name).add_kernel(stream, knp);
-}
+// void CudaGraphContext::add_kernel(const std::string& ti_op_name,
+//                                   const CUDA::Stream& stream,
+//                                   const cudaKernelNodeParams& knp) {
+//     ti_graphs_.at(ti_op_name).add_kernel(stream, knp);
+// }
 
 void CudaGraphContext::add_graph(const CUDA::Graph& graph) {
     OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size inconsistency");
     graphs_[currentGraphIndex_].set_graph(graph);
 }
 
-void CudaGraphContext::add_ti_graph(const std::string& ti_op_name, const CUDA::Graph& graph) {
-    ti_graphs_.at(ti_op_name).set_graph(graph);
+// void CudaGraphContext::add_ti_graph(const std::string& ti_op_name, const CUDA::Graph& graph) {
+//     ti_graphs_.at(ti_op_name).set_graph(graph);
+// }
+
+// const TiCudaGraphInfo& CudaGraphContext::get_ti_graph(const std::string& ti_op_name) const {
+//     return ti_graphs_.at(ti_op_name);
+// }
+
+TiCudaGraphInfo& CudaGraphContext::get_ti_graph(const std::string& ti_op_name) const {
+    return ti_graphs_[ti_op_name];
 }
 
 bool CudaGraphContext::is_initialized() const {
@@ -73,12 +125,12 @@ void CudaGraphContext::update_capture(const TensorMappingContext& context) {
     }
 }
 
-void CudaGraphContext::update_kernel(const std::string& ti_op_name,
-                                     std::size_t index,
-                                     const cudaKernelNodeParams& knp) const {
-    OPENVINO_ASSERT(ti_graphs_.at(ti_op_name).is_initialized(), "TI graph not initialized");
-    ti_graphs_.at(ti_op_name).update_kernel(index, knp);
-}
+// void CudaGraphContext::update_kernel(const std::string& ti_op_name,
+//                                      std::size_t index,
+//                                      const cudaKernelNodeParams& knp) const {
+//     OPENVINO_ASSERT(ti_graphs_.at(ti_op_name).is_initialized(), "TI graph not initialized");
+//     ti_graphs_.at(ti_op_name).update_kernel(index, knp);
+// }
 
 void CudaGraphContext::launch(std::size_t index, const CUDA::Stream& stream) const {
     currentGraphIndex_ = index;
@@ -86,10 +138,10 @@ void CudaGraphContext::launch(std::size_t index, const CUDA::Stream& stream) const {
     graphs_[currentGraphIndex_].launch(stream);
 }
 
-void CudaGraphContext::launch_ti_graph(const std::string& ti_op_name, const CUDA::Stream& stream) const {
-    OPENVINO_ASSERT(ti_graphs_.at(ti_op_name).is_initialized(), "TI graph not initialized");
-    ti_graphs_.at(ti_op_name).launch(stream);
-}
+// void CudaGraphContext::launch_ti_graph(const std::string& ti_op_name, const CUDA::Stream& stream) const {
+//     OPENVINO_ASSERT(ti_graphs_.at(ti_op_name).is_initialized(), "TI graph not initialized");
+//     ti_graphs_.at(ti_op_name).launch(stream);
+// }
 
 std::size_t CudaGraphContext::get_params_count() const {
     std::size_t res = 0;
@@ -107,13 +159,13 @@ std::size_t CudaGraphContext::get_results_count() const {
     return res;
 }
 
-std::size_t CudaGraphContext::get_transfers_count(const std::string& ti_op_name) const {
-    return ti_graphs_.at(ti_op_name).get_transfers_count();
-}
+// std::size_t CudaGraphContext::get_transfers_count(const std::string& ti_op_name) const {
+//     return ti_graphs_.at(ti_op_name).get_transfers_count();
+// }
 
-std::size_t CudaGraphContext::get_kernels_count(const std::string& ti_op_name) const {
-    return ti_graphs_.at(ti_op_name).get_kernels_count();
-}
+// std::size_t CudaGraphContext::get_kernels_count(const std::string& ti_op_name) const {
+//     return ti_graphs_.at(ti_op_name).get_kernels_count();
+// }
 
 std::size_t CudaGraphContext::get_graphs_count() const {
     return graphs_.size();
@@ -137,18 +189,18 @@ void CudaGraphContext::CudaGraphInfo::add_result(const std::string& tensorName,
     resultNodes_.emplace(tensorName, captureInfo.addDownloadNode(dst, src, size));
 }
 
-void CudaGraphContext::CudaGraphInfo::add_transfer(const CUDA::Stream& stream,
-                                                   CUDA::DevicePointer<void*> dst,
-                                                   CUDA::DevicePointer<const void*> src,
-                                                   std::size_t size) {
-    CUDA::CaptureInfo captureInfo{stream};
-    transferNodes_.emplace_back(captureInfo.addTransferNode(dst, src, size));
-}
+// void CudaGraphContext::CudaGraphInfo::add_transfer(const CUDA::Stream& stream,
+//                                                    CUDA::DevicePointer<void*> dst,
+//                                                    CUDA::DevicePointer<const void*> src,
+//                                                    std::size_t size) {
+//     CUDA::CaptureInfo captureInfo{stream};
+//     transferNodes_.emplace_back(captureInfo.addTransferNode(dst, src, size));
+// }
 
-void CudaGraphContext::CudaGraphInfo::add_kernel(const CUDA::Stream& stream, const cudaKernelNodeParams& knp) {
-    CUDA::CaptureInfo captureInfo{stream};
-    kernelNodes_.emplace_back(captureInfo.addKernelNode(knp));
-}
+// void CudaGraphContext::CudaGraphInfo::add_kernel(const CUDA::Stream& stream, const cudaKernelNodeParams& knp) {
+//     CUDA::CaptureInfo captureInfo{stream};
+//     kernelNodes_.emplace_back(captureInfo.addKernelNode(knp));
+// }
 
 void CudaGraphContext::CudaGraphInfo::set_graph(const CUDA::Graph& graph) {
     graph_.emplace(graph);
@@ -166,9 +218,9 @@ void CudaGraphContext::CudaGraphInfo::update_capture(const TensorMappingContext& context) {
     }
 }
 
-void CudaGraphContext::CudaGraphInfo::update_kernel(std::size_t index, const cudaKernelNodeParams& knp) {
-    kernelNodes_[index].update_params(graphExec_.value(), knp);
-}
+// void CudaGraphContext::CudaGraphInfo::update_kernel(std::size_t index, const cudaKernelNodeParams& knp) {
+//     kernelNodes_[index].update_params(graphExec_.value(), knp);
+// }
 
 void CudaGraphContext::CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); }
 
@@ -176,9 +228,9 @@ std::size_t CudaGraphContext::CudaGraphInfo::get_params_count() const { return parameterNodes_.size(); }
 std::size_t CudaGraphContext::CudaGraphInfo::get_results_count() const { return resultNodes_.size(); }
 
-std::size_t CudaGraphContext::CudaGraphInfo::get_transfers_count() const { return transferNodes_.size(); }
+// std::size_t CudaGraphContext::CudaGraphInfo::get_transfers_count() const { return transferNodes_.size(); }
 
-std::size_t CudaGraphContext::CudaGraphInfo::get_kernels_count() const { return kernelNodes_.size(); }
+// std::size_t CudaGraphContext::CudaGraphInfo::get_kernels_count() const { return kernelNodes_.size(); }
 
 bool operator==(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) {
     return lhs.graph_ == rhs.graph_ && lhs.graphExec_ == rhs.graphExec_ && lhs.parameterNodes_ == rhs.parameterNodes_ &&
diff --git a/modules/nvidia_plugin/src/cuda_graph_context.hpp b/modules/nvidia_plugin/src/cuda_graph_context.hpp
index c030bdfbd..23497def2 100644
--- a/modules/nvidia_plugin/src/cuda_graph_context.hpp
+++ b/modules/nvidia_plugin/src/cuda_graph_context.hpp
@@ -11,13 +11,55 @@
 namespace ov {
 namespace nvidia_gpu {
 
+class TiCudaGraphInfo {
+public:
+    void add_transfer(const CUDA::Stream& stream,
+                      CUDA::DevicePointer<void*> dst,
+                      CUDA::DevicePointer<const void*> src,
+                      std::size_t size);
+
+    void add_kernel(const CUDA::Stream& stream, const cudaKernelNodeParams& knp);
+
+    void set_params_graph(const CUDA::Graph& graph);
+    void set_body_graph(const CUDA::Graph& graph);
+    void set_results_graph(const CUDA::Graph& graph);
+
+    // bool is_initialized() const;
+
+    // void update_capture(const TensorMappingContext& context);
+    void update_kernel(std::size_t index, const cudaKernelNodeParams& knp);
+
+    void launch_params_graph(const CUDA::Stream& stream) const;
+    void launch_body_graph(const CUDA::Stream& stream) const;
+    void launch_results_graph(const CUDA::Stream& stream) const;
+
+    std::size_t get_transfers_count() const;
+    std::size_t get_kernels_count() const;
+
+    // friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs);
+    // friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs);
+
+private:
+    std::optional<CUDA::Graph> paramsGraph_{};
+    std::optional<CUDA::GraphExec> paramsGraphExec_{};
+
+    std::optional<CUDA::Graph> bodyGraph_{};
+    std::optional<CUDA::GraphExec> bodyGraphExec_{};
+
+    std::optional<CUDA::Graph> resultsGraph_{};
+    std::optional<CUDA::GraphExec> resultsGraphExec_{};
+
+    std::vector<CUDA::TransferNode> transferNodes_;
+    std::vector<CUDA::KernelNode> kernelNodes_;
+};
+
 class CudaGraphContext {
 public:
     void reset();
 
     void start_next_graph_addition();
 
-    void start_ti_graph_addition(const std::string& ti_op_name);
+    // void start_ti_graph_addition(const std::string& ti_op_name);
 
     void add_parameter(const std::string& tensorName,
                        const CUDA::Stream& stream,
@@ -31,34 +73,38 @@ class CudaGraphContext {
                     CUDA::DevicePointer<const void*> src,
                     std::size_t size);
 
-    void add_transfer(const std::string& ti_op_name,
-                      const CUDA::Stream& stream,
-                      CUDA::DevicePointer<void*> dst,
-                      CUDA::DevicePointer<const void*> src,
-                      std::size_t size);
+    // void add_transfer(const std::string& ti_op_name,
+    //                   const CUDA::Stream& stream,
+    //                   CUDA::DevicePointer<void*> dst,
+    //                   CUDA::DevicePointer<const void*> src,
+    //                   std::size_t size);
 
-    void add_kernel(const std::string& ti_op_name,
-                    const CUDA::Stream& stream,
-                    const cudaKernelNodeParams& knp);
+    // void add_kernel(const std::string& ti_op_name,
+    //                 const CUDA::Stream& stream,
+    //                 const cudaKernelNodeParams& knp);
 
     void add_graph(const CUDA::Graph& graph);
 
+    void add_ti_graph(const std::string& ti_op_name, const CUDA::Graph& graph);
+    // const TiCudaGraphInfo& get_ti_graph(const std::string& ti_op_name) const;
+    TiCudaGraphInfo& get_ti_graph(const std::string& ti_op_name) const;
+
     bool is_initialized() const;
 
     void update_capture(const TensorMappingContext& context);
 
-    void update_kernel(const std::string& ti_op_name,
-                       std::size_t index,
-                       const cudaKernelNodeParams& knp) const;
+    // void update_kernel(const std::string& ti_op_name,
+    //                    std::size_t index,
+    //                    const cudaKernelNodeParams& knp) const;
 
     void launch(std::size_t index, const CUDA::Stream& stream) const;
 
-    void launch_ti_graph(const std::string& ti_op_name, const CUDA::Stream& stream) const;
+    // void launch_ti_graph(const std::string& ti_op_name, const CUDA::Stream& stream) const;
 
     std::size_t get_params_count() const;
     std::size_t get_results_count() const;
 
-    std::size_t get_transfers_count(const std::string& ti_op_name) const;
-    std::size_t get_kernels_count(const std::string& ti_op_name) const;
+    // std::size_t get_transfers_count(const std::string& ti_op_name) const;
+    // std::size_t get_kernels_count(const std::string& ti_op_name) const;
 
     std::size_t get_graphs_count() const;
 
@@ -80,27 +126,27 @@ class CudaGraphContext {
                         CUDA::DevicePointer<const void*> src,
                         std::size_t size);
 
-        void add_transfer(const CUDA::Stream& stream,
-                          CUDA::DevicePointer<void*> dst,
-                          CUDA::DevicePointer<const void*> src,
-                          std::size_t size);
+        // void add_transfer(const CUDA::Stream& stream,
+        //                   CUDA::DevicePointer<void*> dst,
+        //                   CUDA::DevicePointer<const void*> src,
+        //                   std::size_t size);
 
-        void add_kernel(const CUDA::Stream& stream, const cudaKernelNodeParams& knp);
+        // void add_kernel(const CUDA::Stream& stream, const cudaKernelNodeParams& knp);
 
         void set_graph(const CUDA::Graph& graph);
 
         bool is_initialized() const;
 
        void update_capture(const TensorMappingContext& context);
-        void update_kernel(std::size_t index, const cudaKernelNodeParams& knp);
+        // void update_kernel(std::size_t index, const cudaKernelNodeParams& knp);
 
         void launch(const CUDA::Stream& stream) const;
 
         std::size_t get_params_count() const;
         std::size_t get_results_count() const;
 
-        std::size_t get_transfers_count() const;
-        std::size_t get_kernels_count() const;
+        // std::size_t get_transfers_count() const;
+        // std::size_t get_kernels_count() const;
 
         friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs);
         friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs);
@@ -111,8 +157,8 @@ class CudaGraphContext {
         std::map<std::string, CUDA::UploadNode> parameterNodes_;
         std::map<std::string, CUDA::DownloadNode> resultNodes_;
 
-        std::vector<CUDA::TransferNode> transferNodes_;
-        std::vector<CUDA::KernelNode> kernelNodes_;
+        // std::vector<CUDA::TransferNode> transferNodes_;
+        // std::vector<CUDA::KernelNode> kernelNodes_;
     };
 
     friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs);
@@ -120,7 +166,8 @@ class CudaGraphContext {
 
 private:
     std::vector<CudaGraphInfo> graphs_{};
-    mutable std::unordered_map<std::string, CudaGraphInfo> ti_graphs_;
+    mutable std::unordered_map<std::string, TiCudaGraphInfo> ti_graphs_;
+    // std::unordered_map<std::string, TiCudaGraphInfo> ti_graphs_;
     mutable std::size_t currentGraphIndex_ = 0;
 };
 
diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp
index 2793fa34b..205a26b94 100644
--- a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp
+++ b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp
@@ -216,41 +216,30 @@ void TensorIteratorOp::ExecuteGraph(const InferenceRequestContext& context,
     const auto& memoryManager = *memory_manager_;
     const auto& mutableBuffer = workbuffers.mutable_buffers.at(0);
 
-    // TODO: refactor
-    // First iteration
-    for (const auto inputIdx : invariant_inputs_) {
-        const auto paramIdx = inputs_parameters_map_.at(inputIdx);
-        transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx);
-    }
-    for (const auto& [inputIdx, paramIdx] : inputs_parameters_map_) {
-        if (portmap_inputs_.count(inputIdx) == 0) {
-            transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx);
-        }
-    }
+    // auto& graphContext = context.getCudaGraphContext();
+    // const auto& opName = GetName();
+    auto& tiGraphInfo = context.getCudaGraphContext().get_ti_graph(GetName());
+
+    tiGraphInfo.launch_params_graph(stream);
 
-    auto& graphContext = context.getCudaGraphContext();
-    const auto& opName = GetName();
-    OPENVINO_ASSERT(graphContext.get_kernels_count(opName) == slices_.size() + inserts_.size(),
+    // OPENVINO_ASSERT(graphContext.get_kernels_count(opName) == slices_.size() + inserts_.size(),
+    OPENVINO_ASSERT(tiGraphInfo.get_kernels_count() == slices_.size() + inserts_.size(),
                     "CudaGraphContext/TensorIteratorOp slices or inserts count inconsistency");
 
     for (int64_t iter = 0; iter < num_iterations_; ++iter) {
         for (std::size_t i = 0; i < slices_.size(); ++i) {
-            graphContext.update_kernel(opName, i, slices_[i].get_knp(stream, mutableBuffer, inputTensors, iter));
+            // graphContext.update_kernel(opName, i, slices_[i].get_knp(stream, mutableBuffer, inputTensors, iter));
+            tiGraphInfo.update_kernel(i, slices_[i].get_knp(stream, mutableBuffer, inputTensors, iter));
         }
         for (std::size_t i = 0; i < inserts_.size(); ++i) {
-            graphContext.update_kernel(opName, i + slices_.size(), inserts_[i].get_knp(stream, mutableBuffer, outputTensors, iter));
+            // graphContext.update_kernel(opName, i + slices_.size(), inserts_[i].get_knp(stream, mutableBuffer, outputTensors, iter));
+            tiGraphInfo.update_kernel(i + slices_.size(), inserts_[i].get_knp(stream, mutableBuffer, outputTensors, iter));
        }
-        graphContext.launch_ti_graph(opName, stream);
+        // graphContext.launch_ti_graph(opName, stream);
+        tiGraphInfo.launch_body_graph(stream);
     }
 
-    // TODO: Handle n-th iteration situation
-    // Copy data to output
-    if (iterations_results_map_.count(num_iterations_ - 1) > 0) {
-        for (const auto& resultIdx : iterations_results_map_.at(num_iterations_ - 1)) {
-            const auto& outputIdx = results_outputs_map_.at(resultIdx);
-            transferResult(stream, mutableBuffer, outputTensors, num_iterations_ - 1, resultIdx, outputIdx);
-        }
-    }
+    tiGraphInfo.launch_results_graph(stream);
 }
 
 bool TensorIteratorOp::IsCudaGraphCompatible() const {
@@ -267,15 +256,36 @@ void TensorIteratorOp::Capture(InferenceRequestContext& context,
     auto& executionDelegator = context.getExecutionDelegator();
     executionDelegator.set_stream(stream);
 
-    auto& graphContext = context.getCudaGraphContext();
-    const auto& opName = GetName();
-    graphContext.start_ti_graph_addition(opName);
+    // auto& graphContext = context.getCudaGraphContext();
+    // const auto& opName = GetName();
+    // graphContext.start_ti_graph_addition(opName);
+    auto& tiGraphInfo = context.getCudaGraphContext().get_ti_graph(GetName());
+
+    // TODO: refactor
+    CUDA::GraphCapture capture{stream};
+    {
+        auto scope = capture.getScope();
+        // First iteration
+        for (const auto inputIdx : invariant_inputs_) {
+            const auto paramIdx = inputs_parameters_map_.at(inputIdx);
+            transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx);
+        }
+        for (const auto& [inputIdx, paramIdx] : inputs_parameters_map_) {
+            if (portmap_inputs_.count(inputIdx) == 0) {
+                transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx);
+            }
+        }
+    }
+    tiGraphInfo.set_params_graph(capture.getGraph());
+
+    // CUDA::GraphCapture bodyCapture{stream};
     {
         auto scope = capture.getScope();
         // Input mapping of ports
         for (auto& slice : slices_) {
-            graphContext.add_kernel(opName, stream, slice.get_knp(stream, mutableBuffer, inputTensors, 0));
+            // graphContext.add_kernel(opName, stream, slice.get_knp(stream, mutableBuffer, inputTensors, 0));
+            tiGraphInfo.add_kernel(stream, slice.get_knp(stream, mutableBuffer, inputTensors, 0));
         }
 
         // Inner loop
@@ -283,20 +293,36 @@ void TensorIteratorOp::Capture(InferenceRequestContext& context,
 
         // Back-edge mapping
         for (auto& transfer : transfers_) {
-            graphContext.add_transfer(opName,
-                                      stream,
-                                      CUDA::DevicePointer<void*>{transfer.get_dst(mutableBuffer)},
-                                      CUDA::DevicePointer<const void*>{transfer.get_src(mutableBuffer)},
-                                      transfer.get_param_size());
+            // graphContext.add_transfer(opName,
+            tiGraphInfo.add_transfer(stream,
+                                     CUDA::DevicePointer<void*>{transfer.get_dst(mutableBuffer)},
+                                     CUDA::DevicePointer<const void*>{transfer.get_src(mutableBuffer)},
+                                     transfer.get_param_size());
         }
 
         // Output mapping of ports
         for (auto& insert : inserts_) {
-            graphContext.add_kernel(opName, stream, insert.get_knp(stream, mutableBuffer, outputTensors, 0));
+            // graphContext.add_kernel(opName, stream, insert.get_knp(stream, mutableBuffer, outputTensors, 0));
+            tiGraphInfo.add_kernel(stream, insert.get_knp(stream, mutableBuffer, outputTensors, 0));
+        }
+    }
+    // const auto& graph = bodyCapture.getGraph();
+    // graphContext.add_ti_graph(opName, graph);
+    // tiGraphInfo.set_body_graph(graph);
+    tiGraphInfo.set_body_graph(capture.getGraph());
+
+    {
+        auto scope = capture.getScope();
+        // TODO: Handle n-th iteration situation
+        // Copy data to output
+        if (iterations_results_map_.count(num_iterations_ - 1) > 0) {
+            for (const auto& resultIdx : iterations_results_map_.at(num_iterations_ - 1)) {
+                const auto& outputIdx = results_outputs_map_.at(resultIdx);
+                transferResult(stream, mutableBuffer, outputTensors, num_iterations_ - 1, resultIdx, outputIdx);
+            }
         }
     }
-    const auto& graph = capture.getGraph();
-    graphContext.add_ti_graph(opName, graph);
+    tiGraphInfo.set_results_graph(capture.getGraph());
 }
 
TensorIteratorOp::SliceLauncher::SliceLauncher(const TensorIteratorOp& ti,
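
Note (illustration, not part of the patch): the capture-once / patch-arguments / relaunch pattern that TiCudaGraphInfo wraps via the plugin's CUDA::Graph, CUDA::GraphExec and CUDA::CaptureInfo classes can be sketched in a self-contained form against the raw CUDA Graph runtime API. The program below is only a minimal analogue of the ExecuteGraph loop above: the recorded kernel node plays the role of one entry in kernelNodes_, cudaGraphExecKernelNodeSetParams roughly corresponds to update_kernel(), and cudaGraphLaunch to launch_body_graph(). The step kernel, buffer size, and iteration count are invented for the example; error checking is omitted, and the 5-argument cudaGraphInstantiate signature is the CUDA 11-era one.

#include <cstdio>
#include <cuda_runtime.h>

// Toy "body" kernel standing in for one TensorIterator slice/insert kernel.
__global__ void step(int* data, int offset) { data[threadIdx.x] += offset; }

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    int* buf = nullptr;
    cudaMalloc(&buf, 32 * sizeof(int));
    cudaMemset(buf, 0, 32 * sizeof(int));

    // Build the body graph once, keeping the kernel node handle so its
    // parameters can be patched before every relaunch.
    int offset = 0;
    void* args[] = {&buf, &offset};
    cudaKernelNodeParams knp{};
    knp.func = reinterpret_cast<void*>(step);
    knp.gridDim = dim3(1);
    knp.blockDim = dim3(32);
    knp.kernelParams = args;

    cudaGraph_t bodyGraph;
    cudaGraphNode_t kernelNode;
    cudaGraphCreate(&bodyGraph, 0);
    cudaGraphAddKernelNode(&kernelNode, bodyGraph, nullptr, 0, &knp);

    cudaGraphExec_t bodyExec;
    cudaGraphInstantiate(&bodyExec, bodyGraph, nullptr, nullptr, 0);

    // Per iteration: patch the kernel arguments in the instantiated graph,
    // then relaunch it -- the analogue of update_kernel() followed by
    // launch_body_graph() inside the num_iterations_ loop.
    for (int iter = 0; iter < 5; ++iter) {
        offset = iter + 1;  // knp.kernelParams still points at &offset
        cudaGraphExecKernelNodeSetParams(bodyExec, kernelNode, &knp);
        cudaGraphLaunch(bodyExec, stream);
    }
    cudaStreamSynchronize(stream);

    int host[32];
    cudaMemcpy(host, buf, sizeof(host), cudaMemcpyDeviceToHost);
    std::printf("buf[0] = %d\n", host[0]);  // 1+2+3+4+5 = 15

    cudaGraphExecDestroy(bodyExec);
    cudaGraphDestroy(bodyGraph);
    cudaFree(buf);
    cudaStreamDestroy(stream);
    return 0;
}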