diff --git a/modules/nvidia_plugin/src/cuda_graph_context.cpp b/modules/nvidia_plugin/src/cuda_graph_context.cpp index 4033b5df5b..a4374be63e 100644 --- a/modules/nvidia_plugin/src/cuda_graph_context.cpp +++ b/modules/nvidia_plugin/src/cuda_graph_context.cpp @@ -49,22 +49,8 @@ void CudaGraphInfo::set_graph(const CUDA::Graph& graph) { graphExec_.emplace(graph); } -void CudaGraphInfo::set_params_graph(const CUDA::Graph& graph) { - paramsGraph_.emplace(graph); - paramsGraphExec_.emplace(graph); -} - -void CudaGraphInfo::set_results_graph(const CUDA::Graph& graph) { - resultsGraph_.emplace(graph); - resultsGraphExec_.emplace(graph); -} - void CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); } -void CudaGraphInfo::launch_params_graph(const CUDA::Stream& stream) const { paramsGraphExec_.value().launch(stream); } - -void CudaGraphInfo::launch_results_graph(const CUDA::Stream& stream) const { resultsGraphExec_.value().launch(stream); } - bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs) { return lhs.graph_ == rhs.graph_ && lhs.graphExec_ == rhs.graphExec_ && lhs.parameterNodes_ == rhs.parameterNodes_ && lhs.resultNodes_ == rhs.resultNodes_ && lhs.transferNodes_ == rhs.transferNodes_ && diff --git a/modules/nvidia_plugin/src/cuda_graph_context.hpp b/modules/nvidia_plugin/src/cuda_graph_context.hpp index 484bc15c58..295a8fbadd 100644 --- a/modules/nvidia_plugin/src/cuda_graph_context.hpp +++ b/modules/nvidia_plugin/src/cuda_graph_context.hpp @@ -51,12 +51,8 @@ class CudaGraphInfo { std::size_t get_kernels_count() const { return kernelNodes_.size(); } void set_graph(const CUDA::Graph& graph); - void set_params_graph(const CUDA::Graph& graph); - void set_results_graph(const CUDA::Graph& graph); void launch(const CUDA::Stream& stream) const; - void launch_params_graph(const CUDA::Stream& stream) const; - void launch_results_graph(const CUDA::Stream& stream) const; friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); @@ -65,12 +61,6 @@ class CudaGraphInfo { std::optional graph_{}; std::optional graphExec_{}; - std::optional paramsGraph_{}; - std::optional paramsGraphExec_{}; - - std::optional resultsGraph_{}; - std::optional resultsGraphExec_{}; - std::map parameterNodes_; std::map resultNodes_; diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp index 4eeb991724..0d5eee11bb 100644 --- a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp +++ b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp @@ -226,23 +226,9 @@ void TensorIteratorOp::Capture(InferenceRequestContext& context, auto& mutableBuffer = workbuffers.mutable_buffers.at(0); auto& executionDelegator = context.getExecutionDelegator(); executionDelegator.set_stream(stream); - auto& graphInfo = context.getCudaGraphContext().get_current_graph_info(); + CUDA::GraphCapture capture{stream}; - { - auto scope = capture.getScope(); - // First iteration - for (const auto inputIdx : invariant_inputs_) { - const auto paramIdx = inputs_parameters_map_.at(inputIdx); - transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); - } - for (const auto& [inputIdx, paramIdx] : inputs_parameters_map_) { - if (portmap_inputs_.count(inputIdx) == 0) { - transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); - } - } - } - graphInfo.set_params_graph(capture.getGraph()); { auto scope = capture.getScope(); // Input mapping of ports @@ -264,17 +250,6 @@ void TensorIteratorOp::Capture(InferenceRequestContext& context, } } graphInfo.set_graph(capture.getGraph()); - { - auto scope = capture.getScope(); - // Copy data to output - if (iterations_results_map_.count(num_iterations_ - 1) > 0) { - for (const auto& resultIdx : iterations_results_map_.at(num_iterations_ - 1)) { - const auto& outputIdx = results_outputs_map_.at(resultIdx); - transferResult(stream, mutableBuffer, outputTensors, num_iterations_ - 1, resultIdx, outputIdx); - } - } - } - graphInfo.set_results_graph(capture.getGraph()); } void TensorIteratorOp::ExecuteGraph(InferenceRequestContext& context, @@ -285,13 +260,22 @@ void TensorIteratorOp::ExecuteGraph(InferenceRequestContext& context, const auto& memoryManager = *memory_manager_; const auto& mutableBuffer = workbuffers.mutable_buffers.at(0); - auto& graphInfo = context.getCudaGraphContext().get_current_graph_info(); - - graphInfo.launch_params_graph(stream); + // First iteration; this part doesn't use CUDA graphs yet + for (const auto inputIdx : invariant_inputs_) { + const auto paramIdx = inputs_parameters_map_.at(inputIdx); + transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); + } + for (const auto& [inputIdx, paramIdx] : inputs_parameters_map_) { + if (portmap_inputs_.count(inputIdx) == 0) { + transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx); + } + } + auto& graphInfo = context.getCudaGraphContext().get_current_graph_info(); OPENVINO_ASSERT(graphInfo.get_kernels_count() == slices_.size() + inserts_.size(), "CudaGraphContext/TensorIteratorOp slices or inserts count incosistency"); + // TI body loop for (int64_t iter = 0; iter < num_iterations_; ++iter) { for (std::size_t i = 0; i < slices_.size(); ++i) { slices_[i].update_kernel_node(graphInfo, i, mutableBuffer, inputTensors, iter); @@ -302,7 +286,13 @@ void TensorIteratorOp::ExecuteGraph(InferenceRequestContext& context, graphInfo.launch(stream); } - graphInfo.launch_results_graph(stream); + // Copy data to output; this part doesn't use CUDA graphs yet + if (iterations_results_map_.count(num_iterations_ - 1) > 0) { + for (const auto& resultIdx : iterations_results_map_.at(num_iterations_ - 1)) { + const auto& outputIdx = results_outputs_map_.at(resultIdx); + transferResult(stream, mutableBuffer, outputTensors, num_iterations_ - 1, resultIdx, outputIdx); + } + } } TensorIteratorOp::SliceLauncher::SliceLauncher(const TensorIteratorOp& ti, uint64_t inputIdx, uint64_t paramIdx)