Skip to content

Commit

Permalink
[NVIDIA] Add ExecuteGraph() to IOperationExec/OperationBase
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrii Pavliuk authored and apavliuk55 committed Dec 8, 2023
1 parent 158144a commit 22965f1
Show file tree
Hide file tree
Showing 10 changed files with 151 additions and 77 deletions.
23 changes: 3 additions & 20 deletions modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,6 @@
namespace ov {
namespace nvidia_gpu {

namespace {

// Returns the only operation of `sg` as a TensorIteratorOp.
// Yields nullptr when the exec sequence does not contain exactly one operation,
// or when that single operation is not a TensorIteratorOp (the dynamic cast fails).
std::shared_ptr<TensorIteratorOp> getTI(const SubGraph& sg) {
    const auto& ops = sg.getExecSequence();
    return ops.size() == 1 ? std::dynamic_pointer_cast<TensorIteratorOp>(ops[0]) : nullptr;
}

} // namespace

CudaGraphTopologyRunner::CudaGraphTopologyRunner(const CreationContext& context,
const std::shared_ptr<const ov::Model>& model)
: orig_subgraph_{context, model}, cuda_graphs_count_{0} {
Expand Down Expand Up @@ -64,15 +52,10 @@ void CudaGraphTopologyRunner::Run(InferenceRequestContext& context, const Device
graphContext.get_current_graph_info().launch(stream);
graphIndex++;
} else if (compatibility == CudaGraphCompatibility::SPECIAL) {
// TODO: remove
auto ti = getTI(subgraph);
CUDA::DevicePointer<void*> mutableBuffer{memoryBlock.view().data()};
const auto& memoryManager = *subgraph.memoryManager();
const auto& inputTensors = memoryManager.inputTensorPointers(*ti, mutableBuffer);
const auto& outputTensors = memoryManager.outputTensorPointers(*ti, mutableBuffer);
const auto& workBuffers = memoryManager.workBuffers(*ti, mutableBuffer);
Workbuffers workbuffers{};
workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data());
graphContext.select_current_graph(graphIndex);
ti->ExecuteGraph(context, inputTensors, outputTensors, workBuffers);
subgraph.ExecuteGraph(context, {}, {}, workbuffers);
graphIndex++;
} else {
Workbuffers workbuffers{};
Expand Down
12 changes: 12 additions & 0 deletions modules/nvidia_plugin/src/cuda_iexecution_delegator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,18 @@ class IExecutionDelegator {
const Workbuffers::mutable_buffer& buffer,
InferenceRequestContext& context) = 0;

/**
* Execute CUDA graph sequence from SubGraph class
* @param subGraphPtr Pointer to SubGraph
* @param memoryManager Reference to MemoryManager
* @param buffer Reference to Workbuffers::mutable_buffer
* @param context Reference to InferenceRequestContext
*/
virtual void execute_graph_sequence(const SubGraph* subGraphPtr,
const MemoryManager& memoryManager,
const Workbuffers::mutable_buffer& buffer,
InferenceRequestContext& context) = 0;

/**
* Returns performance counters
* @return Performance counters
Expand Down
26 changes: 19 additions & 7 deletions modules/nvidia_plugin/src/cuda_operation_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,17 @@ class IOperationExec {
Inputs inputTensors,
Outputs outputTensors,
const Workbuffers& workbuffers) const = 0;

virtual CudaGraphCompatibility GetCudaGraphCompatibility() const = 0;

// Capture hook of the operation; takes the same context, tensors and workbuffers as ExecuteGraph().
// NOTE(review): presumably invoked while a CUDA graph is being recorded - confirm against callers.
virtual void Capture(InferenceRequestContext& context,
Inputs inputTensors,
Outputs outputTensors,
const Workbuffers& workbuffers) const = 0;
virtual CudaGraphCompatibility GetCudaGraphCompatibility() const = 0;
// Graph-execution hook of the operation. OperationBase supplies an empty default implementation;
// it is meant for operations with CudaGraphCompatibility::SPECIAL (e.g. TI).
virtual void ExecuteGraph(InferenceRequestContext& context,
Inputs inputTensors,
Outputs outputTensors,
const Workbuffers& workbuffers) const = 0;
virtual void InitSharedImmutableWorkbuffers(const Buffers&) = 0;
virtual WorkbufferRequest GetWorkBufferRequest() const = 0;
virtual const WorkbufferIds& GetWorkbufferIds() const = 0;
Expand Down Expand Up @@ -85,6 +91,18 @@ class OperationBase : public IOperationExec, public IOperationMeta, public std::

CudaGraphCompatibility GetCudaGraphCompatibility() const override { return CudaGraphCompatibility::NONE; }

// Default Capture(): forwards to Execute() with the very same arguments.
void Capture(InferenceRequestContext& context,
Inputs inputTensors,
Outputs outputTensors,
const Workbuffers& workbuffers) const override {
Execute(context, inputTensors, outputTensors, workbuffers);
}
// Default implementation is a no-op and ignores all of its arguments.
// For operations with CudaGraphCompatibility::SPECIAL, e.g. TI; the vast majority of operations don't use this
void ExecuteGraph(InferenceRequestContext& context,
Inputs inputTensors,
Outputs outputTensors,
const Workbuffers& workbuffers) const override {}

// Default workbuffer request: an empty (value-initialized) WorkbufferRequest.
WorkbufferRequest GetWorkBufferRequest() const override {
return {}; // Most operators do not need workbuffers
}
Expand All @@ -111,12 +129,6 @@ class OperationBase : public IOperationExec, public IOperationMeta, public std::
workbuffer_ids_ = workbufferIds;
return workbuffer_ids_.immutableIds.empty() ? WorkbufferStatus::NoInitNeeded : WorkbufferStatus::InitNeeded;
}
// Default Capture(): forwards to Execute() with the very same arguments.
void Capture(InferenceRequestContext& context,
Inputs inputTensors,
Outputs outputTensors,
const Workbuffers& workbuffers) const override {
Execute(context, inputTensors, outputTensors, workbuffers);
}

protected:
std::string node_name_;
Expand Down
12 changes: 12 additions & 0 deletions modules/nvidia_plugin/src/cuda_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,18 @@ void Profiler::capture_sequence(const SubGraph* subGraphPtr,
}
}

/**
 * Calls execute_graph() on every step of the exec sequence built for the given SubGraph.
 *
 * For each step the input tensors, output tensors and workbuffers are resolved through the
 * memory manager against `buffer`, then handed to the step together with `context`.
 *
 * @param subGraphPtr Pointer to SubGraph the exec sequence is created from
 * @param memoryManager Reference to MemoryManager used to resolve tensors and workbuffers
 * @param buffer Reference to Workbuffers::mutable_buffer
 * @param context Reference to InferenceRequestContext
 */
void Profiler::execute_graph_sequence(const SubGraph* subGraphPtr,
                                      const MemoryManager& memoryManager,
                                      const Workbuffers::mutable_buffer& buffer,
                                      InferenceRequestContext& context) {
    // Bind the freshly created sequence first; it stays alive for the whole loop.
    auto&& profiledSteps = create_exec_sequence(subGraphPtr);
    for (const auto& step : profiledSteps) {
        const auto& stepInputs = memoryManager.inputTensorPointers(*step, buffer);
        const auto& stepOutputs = memoryManager.outputTensorPointers(*step, buffer);
        const auto& stepWorkbuffers = memoryManager.workBuffers(*step, buffer);
        step->execute_graph(context, stepInputs, stepOutputs, stepWorkbuffers);
    }
}

Profiler::ProfilerSequence Profiler::create_exec_sequence(const SubGraph* subGraphPtr) {
OPENVINO_ASSERT(active_stream_);
++infer_count_;
Expand Down
19 changes: 19 additions & 0 deletions modules/nvidia_plugin/src/cuda_profiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,18 @@ class Profiler : public IExecutionDelegator {
const Workbuffers::mutable_buffer& buffer,
InferenceRequestContext& context) override;

/**
* Execute CUDA graph sequence from SubGraph class
* @param subGraphPtr Pointer to SubGraph
* @param memoryManager Reference to MemoryManager
* @param buffer Reference to Workbuffers::mutable_buffer
* @param context Reference to InferenceRequestContext
*/
virtual void execute_graph_sequence(const SubGraph* subGraphPtr,
const MemoryManager& memoryManager,
const Workbuffers::mutable_buffer& buffer,
InferenceRequestContext& context) override;

/**
* Returns performance counters
* @return Performance counters
Expand Down Expand Up @@ -140,6 +152,13 @@ class Profiler::ProfileExecStep {
timing_.setStop(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_);
}

/**
 * Timed wrapper around ExecuteGraph() of the wrapped exec step.
 * Marks the start on the profiler's active stream, perfectly forwards all arguments to
 * exec_step_.ExecuteGraph(), then marks the stop on the profiler's active stream.
 * @param args Arguments forwarded unchanged to ExecuteGraph()
 */
template <typename... TArgs>
void execute_graph(TArgs&&... args) const {
timing_.setStart(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_);
exec_step_.ExecuteGraph(std::forward<TArgs>(args)...);
// NOTE(review): active_stream_ is read again here instead of being cached above; ExecuteGraph() may
// reach set_stream() on the execution delegator - confirm before hoisting the dereference.
timing_.setStop(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_);
}

/**
* Adapter method for pointer of operation
* @return Reference to ProfileExecStep
Expand Down
19 changes: 19 additions & 0 deletions modules/nvidia_plugin/src/cuda_simple_execution_delegator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,25 @@ class SimpleExecutionDelegator : public IExecutionDelegator {
}
};

/**
 * Invokes ExecuteGraph on each operation of the SubGraph exec sequence, in sequence order.
 * Input tensors, output tensors and workbuffers of every operation are resolved
 * through the memory manager against the given mutable buffer.
 * @param subGraphPtr Pointer to SubGraph
 * @param memoryManager Reference to MemoryManager
 * @param buffer Reference to Workbuffers::mutable_buffer
 * @param context Reference to InferenceRequestContext
 */
virtual void execute_graph_sequence(const SubGraph* subGraphPtr,
                                    const MemoryManager& memoryManager,
                                    const Workbuffers::mutable_buffer& buffer,
                                    InferenceRequestContext& context) override {
    const auto& operations = subGraphPtr->getExecSequence();
    for (const auto& operation : operations) {
        const auto& opInputs = memoryManager.inputTensorPointers(*operation, buffer);
        const auto& opOutputs = memoryManager.outputTensorPointers(*operation, buffer);
        const auto& opWorkbuffers = memoryManager.workBuffers(*operation, buffer);
        operation->ExecuteGraph(context, opInputs, opOutputs, opWorkbuffers);
    }
};

/**
* Dummy get_performance_counts implementation
*/
Expand Down
44 changes: 28 additions & 16 deletions modules/nvidia_plugin/src/ops/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,22 +128,6 @@ std::vector<DevicePointer<void*>> SubGraph::getSharedWorkbuffers(const IOperatio
return result;
}

// Captures the subgraph through the execution delegator of the request context.
// The tensor arguments are unused; the first mutable workbuffer (bounds-checked via at(0))
// is the memory block handed to the delegator together with this subgraph's memory manager.
void SubGraph::Capture(InferenceRequestContext &context, Inputs, Outputs,
                       const Workbuffers &workbuffers) const {
    const auto& cudaStream = context.getThreadContext().stream();
    const auto& subgraphMemory = workbuffers.mutable_buffers.at(0);

    auto& delegator = context.getExecutionDelegator();
    delegator.set_stream(cudaStream);
    delegator.capture_sequence(this, *memory_manager_, subgraphMemory, context);
}

// Requests no immutable workbuffers and a single mutable workbuffer whose size is the
// device memory block size reported by the mutable tensors memory model.
WorkbufferRequest SubGraph::GetWorkBufferRequest() const {
    const auto& mutableModel = memory_manager_->mutableTensorsMemoryModel();
    const auto blockSize = mutableModel->deviceMemoryBlockSize();
    return {{}, {blockSize}};
}

void SubGraph::Execute(const InferenceRequestContext& context, Inputs, Outputs, const Workbuffers& workbuffers) const {
const auto& stream = context.getThreadContext().stream();
const auto& memoryManager = *memory_manager_;
Expand Down Expand Up @@ -171,5 +155,33 @@ CudaGraphCompatibility SubGraph::GetCudaGraphCompatibility() const {
return graph_compatibility_;
}

// Captures the subgraph through the execution delegator of the request context.
// The tensor arguments are unused; the first mutable workbuffer (bounds-checked via at(0))
// is the memory block handed to the delegator together with this subgraph's memory manager.
void SubGraph::Capture(InferenceRequestContext& context, Inputs, Outputs, const Workbuffers& workbuffers) const {
    const auto& cudaStream = context.getThreadContext().stream();
    const auto& subgraphMemory = workbuffers.mutable_buffers.at(0);

    auto& delegator = context.getExecutionDelegator();
    delegator.set_stream(cudaStream);
    delegator.capture_sequence(this, *memory_manager_, subgraphMemory, context);
}

// Runs ExecuteGraph for the subgraph through the execution delegator of the request context.
// inputTensors/outputTensors are not read here; the first mutable workbuffer (bounds-checked
// via at(0)) is the memory block handed to the delegator together with this subgraph's memory manager.
void SubGraph::ExecuteGraph(InferenceRequestContext& context,
                            Inputs inputTensors,
                            Outputs outputTensors,
                            const Workbuffers& workbuffers) const {
    const auto& cudaStream = context.getThreadContext().stream();
    const auto& subgraphMemory = workbuffers.mutable_buffers.at(0);

    auto& delegator = context.getExecutionDelegator();
    delegator.set_stream(cudaStream);
    delegator.execute_graph_sequence(this, *memory_manager_, subgraphMemory, context);
}

// Requests no immutable workbuffers and a single mutable workbuffer whose size is the
// device memory block size reported by the mutable tensors memory model.
WorkbufferRequest SubGraph::GetWorkBufferRequest() const {
    const auto& mutableModel = memory_manager_->mutableTensorsMemoryModel();
    const auto blockSize = mutableModel->deviceMemoryBlockSize();
    return {{}, {blockSize}};
}

} // namespace nvidia_gpu
} // namespace ov
7 changes: 6 additions & 1 deletion modules/nvidia_plugin/src/ops/subgraph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,17 @@ class SubGraph : public OperationBase {
Outputs outputTensors,
const Workbuffers& workbuffers) const override;

CudaGraphCompatibility GetCudaGraphCompatibility() const override;

// Hands the subgraph to capture_sequence() of the context's execution delegator,
// using the first mutable workbuffer as the memory block.
void Capture(InferenceRequestContext& context,
Inputs inputTensors,
Outputs outputTensors,
const Workbuffers& workbuffers) const override;

CudaGraphCompatibility GetCudaGraphCompatibility() const override;
// Hands the subgraph to execute_graph_sequence() of the context's execution delegator,
// using the first mutable workbuffer as the memory block.
void ExecuteGraph(InferenceRequestContext& context,
Inputs inputTensors,
Outputs outputTensors,
const Workbuffers& workbuffers) const override;

inline std::shared_ptr<MemoryManager> memoryManager() const { return memory_manager_; }

Expand Down
56 changes: 28 additions & 28 deletions modules/nvidia_plugin/src/ops/tensor_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,34 +207,6 @@ void TensorIteratorOp::Execute(const InferenceRequestContext& context,
}
}

/**
 * Launches the CUDA graphs held by the current graph info for this TensorIterator.
 *
 * Order of work on the stream of the current thread context:
 *   1. launch the params graph;
 *   2. for each of num_iterations_ iterations, update every slice and insert kernel node
 *      for that iteration and launch the graph;
 *   3. launch the results graph.
 *
 * @param context Inference request context providing the stream and the CUDA graph context
 * @param inputTensors Input tensors passed to the slice kernel-node updates
 * @param outputTensors Output tensors passed to the insert kernel-node updates
 * @param workbuffers Workbuffers; mutable_buffers.at(0) is passed to every kernel-node update
 */
void TensorIteratorOp::ExecuteGraph(InferenceRequestContext& context,
                                    Inputs inputTensors,
                                    Outputs outputTensors,
                                    const Workbuffers& workbuffers) {
    const auto& stream = context.getThreadContext().stream();
    const auto& mutableBuffer = workbuffers.mutable_buffers.at(0);

    auto& graphInfo = context.getCudaGraphContext().get_current_graph_info();

    graphInfo.launch_params_graph(stream);

    const auto sliceCount = slices_.size();
    const auto insertCount = inserts_.size();

    // Kernel nodes are indexed slices first, then inserts, so the counts must match exactly.
    OPENVINO_ASSERT(graphInfo.get_kernels_count() == sliceCount + insertCount,
                    "CudaGraphContext/TensorIteratorOp slices or inserts count inconsistency");

    for (int64_t iter = 0; iter < num_iterations_; ++iter) {
        for (std::size_t i = 0; i < sliceCount; ++i) {
            slices_[i].update_kernel_node(graphInfo, i, mutableBuffer, inputTensors, iter);
        }
        for (std::size_t i = 0; i < insertCount; ++i) {
            // Insert nodes follow the slice nodes in the graph info's kernel list.
            inserts_[i].update_kernel_node(graphInfo, i + sliceCount, mutableBuffer, outputTensors, iter);
        }
        graphInfo.launch(stream);
    }

    graphInfo.launch_results_graph(stream);
}

CudaGraphCompatibility TensorIteratorOp::GetCudaGraphCompatibility() const {
// This implementation is CUDA graph compatible only if this is the standard TI with output only of the last
// iteration (which is handled outside of the iterations loop)
Expand Down Expand Up @@ -305,6 +277,34 @@ void TensorIteratorOp::Capture(InferenceRequestContext& context,
graphInfo.set_results_graph(capture.getGraph());
}

/**
 * Launches the CUDA graphs held by the current graph info for this TensorIterator.
 *
 * Order of work on the stream of the current thread context:
 *   1. launch the params graph;
 *   2. for each of num_iterations_ iterations, update every slice and insert kernel node
 *      for that iteration and launch the graph;
 *   3. launch the results graph.
 *
 * @param context Inference request context providing the stream and the CUDA graph context
 * @param inputTensors Input tensors passed to the slice kernel-node updates
 * @param outputTensors Output tensors passed to the insert kernel-node updates
 * @param workbuffers Workbuffers; mutable_buffers.at(0) is passed to every kernel-node update
 */
void TensorIteratorOp::ExecuteGraph(InferenceRequestContext& context,
                                    Inputs inputTensors,
                                    Outputs outputTensors,
                                    const Workbuffers& workbuffers) const {
    const auto& stream = context.getThreadContext().stream();
    const auto& mutableBuffer = workbuffers.mutable_buffers.at(0);

    auto& graphInfo = context.getCudaGraphContext().get_current_graph_info();

    graphInfo.launch_params_graph(stream);

    const auto sliceCount = slices_.size();
    const auto insertCount = inserts_.size();

    // Kernel nodes are indexed slices first, then inserts, so the counts must match exactly.
    OPENVINO_ASSERT(graphInfo.get_kernels_count() == sliceCount + insertCount,
                    "CudaGraphContext/TensorIteratorOp slices or inserts count inconsistency");

    for (int64_t iter = 0; iter < num_iterations_; ++iter) {
        for (std::size_t i = 0; i < sliceCount; ++i) {
            slices_[i].update_kernel_node(graphInfo, i, mutableBuffer, inputTensors, iter);
        }
        for (std::size_t i = 0; i < insertCount; ++i) {
            // Insert nodes follow the slice nodes in the graph info's kernel list.
            inserts_[i].update_kernel_node(graphInfo, i + sliceCount, mutableBuffer, outputTensors, iter);
        }
        graphInfo.launch(stream);
    }

    graphInfo.launch_results_graph(stream);
}

TensorIteratorOp::SliceLauncher::SliceLauncher(const TensorIteratorOp& ti, uint64_t inputIdx, uint64_t paramIdx)
: input_idx_{inputIdx},
param_{*ti.params_[paramIdx]},
Expand Down
10 changes: 5 additions & 5 deletions modules/nvidia_plugin/src/ops/tensor_iterator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,18 @@ class TensorIteratorOp : public SubGraph {
Outputs outputTensors,
const Workbuffers& workbuffers) const override;

// Launches the params graph, then per iteration updates the slice/insert kernel nodes and launches
// the graph, and finally launches the results graph of the current graph info.
void ExecuteGraph(InferenceRequestContext& context,
Inputs inputTensors,
Outputs outputTensors,
const Workbuffers& workbuffers);

CudaGraphCompatibility GetCudaGraphCompatibility() const override;

// Capture override for the TensorIterator.
// NOTE(review): the implementation is only partially visible here; it stores a captured
// results graph in the current graph info - confirm the remaining steps in tensor_iterator.cpp.
void Capture(InferenceRequestContext& context,
Inputs inputTensors,
Outputs outputTensors,
const Workbuffers& workbuffers) const override;

// Launches the params graph, then per iteration updates the slice/insert kernel nodes and launches
// the graph, and finally launches the results graph of the current graph info.
void ExecuteGraph(InferenceRequestContext& context,
Inputs inputTensors,
Outputs outputTensors,
const Workbuffers& workbuffers) const override;

private:
struct PortMap {
int64_t start{0};
Expand Down

0 comments on commit 22965f1

Please sign in to comment.