Skip to content

Commit

Permalink
[NVIDIA] Remove paramsGraph_/resultsGraph_ from CudaGraphInfo
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrii Pavliuk authored and apavliuk55 committed Dec 8, 2023
1 parent 22965f1 commit 69104bd
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 54 deletions.
14 changes: 0 additions & 14 deletions modules/nvidia_plugin/src/cuda_graph_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,22 +49,8 @@ void CudaGraphInfo::set_graph(const CUDA::Graph& graph) {
graphExec_.emplace(graph);
}

// Stores `graph` as the parameters graph.
// Emplaces `graph` into paramsGraph_ and constructs the paramsGraphExec_ entry
// (a CUDA::GraphExec) from the same graph. Both members are std::optional, so
// emplace() replaces any value they previously held.
void CudaGraphInfo::set_params_graph(const CUDA::Graph& graph) {
paramsGraph_.emplace(graph);
paramsGraphExec_.emplace(graph);
}

// Stores `graph` as the results graph.
// Emplaces `graph` into resultsGraph_ and constructs the resultsGraphExec_ entry
// (a CUDA::GraphExec) from the same graph. Both members are std::optional, so
// emplace() replaces any value they previously held.
void CudaGraphInfo::set_results_graph(const CUDA::Graph& graph) {
resultsGraph_.emplace(graph);
resultsGraphExec_.emplace(graph);
}

// Launches the stored graphExec_ on `stream`.
// graphExec_ is a std::optional; value() throws std::bad_optional_access if it is empty.
void CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); }

// Launches the stored paramsGraphExec_ on `stream`.
// paramsGraphExec_ is a std::optional; value() throws std::bad_optional_access if it is empty.
void CudaGraphInfo::launch_params_graph(const CUDA::Stream& stream) const { paramsGraphExec_.value().launch(stream); }

// Launches the stored resultsGraphExec_ on `stream`.
// resultsGraphExec_ is a std::optional; value() throws std::bad_optional_access if it is empty.
void CudaGraphInfo::launch_results_graph(const CUDA::Stream& stream) const { resultsGraphExec_.value().launch(stream); }

bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs) {
return lhs.graph_ == rhs.graph_ && lhs.graphExec_ == rhs.graphExec_ && lhs.parameterNodes_ == rhs.parameterNodes_ &&
lhs.resultNodes_ == rhs.resultNodes_ && lhs.transferNodes_ == rhs.transferNodes_ &&
Expand Down
10 changes: 0 additions & 10 deletions modules/nvidia_plugin/src/cuda_graph_context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,8 @@ class CudaGraphInfo {
std::size_t get_kernels_count() const { return kernelNodes_.size(); }

void set_graph(const CUDA::Graph& graph);
void set_params_graph(const CUDA::Graph& graph);
void set_results_graph(const CUDA::Graph& graph);

void launch(const CUDA::Stream& stream) const;
void launch_params_graph(const CUDA::Stream& stream) const;
void launch_results_graph(const CUDA::Stream& stream) const;

friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs);
friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs);
Expand All @@ -65,12 +61,6 @@ class CudaGraphInfo {
std::optional<CUDA::Graph> graph_{};
std::optional<CUDA::GraphExec> graphExec_{};

std::optional<CUDA::Graph> paramsGraph_{};
std::optional<CUDA::GraphExec> paramsGraphExec_{};

std::optional<CUDA::Graph> resultsGraph_{};
std::optional<CUDA::GraphExec> resultsGraphExec_{};

std::map<std::string, CUDA::UploadNode> parameterNodes_;
std::map<std::string, CUDA::DownloadNode> resultNodes_;

Expand Down
50 changes: 20 additions & 30 deletions modules/nvidia_plugin/src/ops/tensor_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,23 +226,9 @@ void TensorIteratorOp::Capture(InferenceRequestContext& context,
auto& mutableBuffer = workbuffers.mutable_buffers.at(0);
auto& executionDelegator = context.getExecutionDelegator();
executionDelegator.set_stream(stream);

auto& graphInfo = context.getCudaGraphContext().get_current_graph_info();

CUDA::GraphCapture capture{stream};
{
auto scope = capture.getScope();
// First iteration
for (const auto inputIdx : invariant_inputs_) {
const auto paramIdx = inputs_parameters_map_.at(inputIdx);
transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx);
}
for (const auto& [inputIdx, paramIdx] : inputs_parameters_map_) {
if (portmap_inputs_.count(inputIdx) == 0) {
transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx);
}
}
}
graphInfo.set_params_graph(capture.getGraph());
{
auto scope = capture.getScope();
// Input mapping of ports
Expand All @@ -264,17 +250,6 @@ void TensorIteratorOp::Capture(InferenceRequestContext& context,
}
}
graphInfo.set_graph(capture.getGraph());
{
auto scope = capture.getScope();
// Copy data to output
if (iterations_results_map_.count(num_iterations_ - 1) > 0) {
for (const auto& resultIdx : iterations_results_map_.at(num_iterations_ - 1)) {
const auto& outputIdx = results_outputs_map_.at(resultIdx);
transferResult(stream, mutableBuffer, outputTensors, num_iterations_ - 1, resultIdx, outputIdx);
}
}
}
graphInfo.set_results_graph(capture.getGraph());
}

void TensorIteratorOp::ExecuteGraph(InferenceRequestContext& context,
Expand All @@ -285,13 +260,22 @@ void TensorIteratorOp::ExecuteGraph(InferenceRequestContext& context,
const auto& memoryManager = *memory_manager_;
const auto& mutableBuffer = workbuffers.mutable_buffers.at(0);

auto& graphInfo = context.getCudaGraphContext().get_current_graph_info();

graphInfo.launch_params_graph(stream);
// First iteration; this part doesn't use CUDA graphs yet
for (const auto inputIdx : invariant_inputs_) {
const auto paramIdx = inputs_parameters_map_.at(inputIdx);
transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx);
}
for (const auto& [inputIdx, paramIdx] : inputs_parameters_map_) {
if (portmap_inputs_.count(inputIdx) == 0) {
transferParam(stream, mutableBuffer, inputTensors, 0, inputIdx, paramIdx);
}
}

auto& graphInfo = context.getCudaGraphContext().get_current_graph_info();
OPENVINO_ASSERT(graphInfo.get_kernels_count() == slices_.size() + inserts_.size(),
"CudaGraphContext/TensorIteratorOp slices or inserts count incosistency");

// TI body loop
for (int64_t iter = 0; iter < num_iterations_; ++iter) {
for (std::size_t i = 0; i < slices_.size(); ++i) {
slices_[i].update_kernel_node(graphInfo, i, mutableBuffer, inputTensors, iter);
Expand All @@ -302,7 +286,13 @@ void TensorIteratorOp::ExecuteGraph(InferenceRequestContext& context,
graphInfo.launch(stream);
}

graphInfo.launch_results_graph(stream);
// Copy data to output; this part doesn't use CUDA graphs yet
if (iterations_results_map_.count(num_iterations_ - 1) > 0) {
for (const auto& resultIdx : iterations_results_map_.at(num_iterations_ - 1)) {
const auto& outputIdx = results_outputs_map_.at(resultIdx);
transferResult(stream, mutableBuffer, outputTensors, num_iterations_ - 1, resultIdx, outputIdx);
}
}
}

TensorIteratorOp::SliceLauncher::SliceLauncher(const TensorIteratorOp& ti, uint64_t inputIdx, uint64_t paramIdx)
Expand Down

0 comments on commit 69104bd

Please sign in to comment.