Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NVIDIA] TensorIterator Body as CUDA Graph #755

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions modules/nvidia_plugin/src/cuda/device_pointers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,16 @@ auto operator-(DevicePointer<T*> l, DevicePointer<T*> r) noexcept {
return static_cast<const char*>(l.get()) - static_cast<const char*>(r);
}

// Address equality for DevicePointers, allowed across different element
// types; compares only the raw device addresses, not pointed-to contents.
template <typename T, typename U>
bool operator==(const DevicePointer<T*>& lhs, const DevicePointer<U*>& rhs) {
    return lhs.get() == rhs.get();
}

// Address inequality for DevicePointers of possibly different element types.
// Defined as the negation of operator== so the two stay consistent.
template <typename T, typename U>
bool operator!=(const DevicePointer<T*>& lhs, const DevicePointer<U*>& rhs) {
    return !(lhs == rhs);
}

template <typename T, std::size_t Extent = gsl::dynamic_extent>
class DeviceBuffer : private gsl::span<T, Extent> {
public:
Expand Down
87 changes: 59 additions & 28 deletions modules/nvidia_plugin/src/cuda/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Graph::Graph(cudaGraph_t graph) :
Handle { createFromNative, cudaGraphDestroy, graph } {
}

cudaError_t Graph::createFromNative(cudaGraph_t *pGraph, const cudaGraph_t anotherGraph) {
// "Creation" shim passed to the Handle base: adopts an already-existing
// native graph handle instead of creating a new one; always succeeds.
cudaError_t Graph::createFromNative(cudaGraph_t* pGraph, const cudaGraph_t anotherGraph) {
    *pGraph = anotherGraph;
    return cudaSuccess;
}
Expand All @@ -28,60 +28,60 @@ cudaGraph_t Graph::createNativeWithFlags(unsigned int flags) {
return g;
}

bool operator==(const Graph &rhs, const Graph &lhs) { return rhs.get() == lhs.get(); }
bool operator==(const Graph& rhs, const Graph& lhs) { return rhs.get() == lhs.get(); }

GraphExec::GraphExec(const Graph &g)
GraphExec::GraphExec(const Graph& g)
#if !defined(NDEBUG) || defined(_DEBUG)
try
#endif
: Handle(cudaGraphInstantiate,
cudaGraphExecDestroy,
g.get(),
static_cast<cudaGraphNode_t *>(nullptr),
static_cast<cudaGraphNode_t*>(nullptr),
#if !defined(NDEBUG) || defined(_DEBUG)
errorMsg_,
kErrorStringLen)
#else
static_cast<char *>(nullptr),
static_cast<char*>(nullptr),
static_cast<size_t>(0ul))
#endif
{
}
#if !defined(NDEBUG) || defined(_DEBUG)
catch (std::exception &e) {
catch (std::exception& e) {
OPENVINO_THROW(e.what(), ": ", errorMsg_);
}
#endif

#if defined(CUDA_VERSION) && CUDA_VERSION >= 12020
cudaGraphExecUpdateResultInfo GraphExec::update(const Graph &g) const {
cudaGraphExecUpdateResultInfo GraphExec::update(const Graph& g) const {
cudaGraphExecUpdateResultInfo res;
throwIfError(cudaGraphExecUpdate(get(), g.get(), &res));
return res;
}
#else
cudaGraphExecUpdateResult GraphExec::update(const Graph &g) const {
cudaGraphExecUpdateResult GraphExec::update(const Graph& g) const {
cudaGraphExecUpdateResult res;
throwIfError(cudaGraphExecUpdate(get(), g.get(), nullptr, &res));
return res;
}
#endif

void GraphExec::launch(const Stream &stream) const {
// Enqueues the instantiated executable graph for execution on the given stream.
void GraphExec::launch(const Stream& stream) const {
    throwIfError(cudaGraphLaunch(get(), stream.get()));
}

bool operator==(const GraphExec &lhs, const GraphExec &rhs) { return rhs.get() == lhs.get(); }
bool operator==(const GraphExec& lhs, const GraphExec& rhs) { return rhs.get() == lhs.get(); }

GraphCapture::GraphCaptureScope::GraphCaptureScope(GraphCapture &graphCapture) : graphCapture_{graphCapture} {
// Begins stream capture for the lifetime of this scope. Thread-local mode is
// used so work submitted by unrelated threads is not pulled into the graph.
GraphCapture::GraphCaptureScope::GraphCaptureScope(GraphCapture& graphCapture) : graphCapture_{graphCapture} {
    throwIfError(cudaStreamBeginCapture(graphCapture_.stream_.get(), cudaStreamCaptureModeThreadLocal));
}

// Ends the capture. The result is stored in capturedError_ rather than thrown,
// since destructors must not throw; presumably the owning GraphCapture checks
// it afterwards (NOTE(review): confirm against GraphCapture::getGraph).
GraphCapture::GraphCaptureScope::~GraphCaptureScope() {
    graphCapture_.capturedError_ = cudaStreamEndCapture(graphCapture_.stream_.get(), &graphCapture_.cudaGraph_);
}

GraphCapture::GraphCapture(const Stream &capturedStream) :
GraphCapture::GraphCapture(const Stream& capturedStream) :
stream_ { capturedStream } {
}

Expand All @@ -100,20 +100,20 @@ const Graph& GraphCapture::getGraph() {
return graph_.value();
}

CaptureInfo::CaptureInfo(const Stream &capturedStream) : stream_{capturedStream} {
// Snapshots the stream's current capture state: capture status, the graph
// being built, and the current leaf dependencies that newly added explicit
// nodes must attach behind.
CaptureInfo::CaptureInfo(const Stream& capturedStream) : stream_{capturedStream} {
    throwIfError(cudaStreamGetCaptureInfo_v2(capturedStream.get(), &captureStatus_, nullptr,
                                             &capturingGraph_, &deps_, &depCount_));
}

UploadNode CaptureInfo::addUploadNode(DevicePointer<void*> dst, const void *src, std::size_t size) {
UploadNode CaptureInfo::addUploadNode(DevicePointer<void*> dst, const void* src, std::size_t size) {
cudaGraphNode_t newNode;
throwIfError(cudaGraphAddMemcpyNode1D(&newNode, capturingGraph_, deps_, depCount_,
dst.get(), src, size, cudaMemcpyHostToDevice));
throwIfError(cudaStreamUpdateCaptureDependencies(stream_.get(), &newNode, 1, 1));
return UploadNode{newNode, dst, src, size};
}

DownloadNode CaptureInfo::addDownloadNode(void *dst, DevicePointer<const void*> src,
DownloadNode CaptureInfo::addDownloadNode(void* dst, DevicePointer<const void*> src,
std::size_t size) {
cudaGraphNode_t newNode;
throwIfError(cudaGraphAddMemcpyNode1D(&newNode, capturingGraph_, deps_, depCount_,
Expand All @@ -122,44 +122,75 @@ DownloadNode CaptureInfo::addDownloadNode(void *dst, DevicePointer<const void*>
return DownloadNode{newNode, dst, src, size};
}

void UploadNode::update_src(const GraphExec& exec, const void *src) {
// Inserts an explicit device-to-device 1D memcpy node into the capturing
// graph, then registers it as the dependency for subsequently captured work.
TransferNode CaptureInfo::addTransferNode(CUDA::DevicePointer<void*> dst,
                                          CUDA::DevicePointer<const void*> src,
                                          std::size_t size) {
    cudaGraphNode_t newNode;
    throwIfError(cudaGraphAddMemcpyNode1D(
        &newNode, capturingGraph_, deps_, depCount_, dst.get(), src.get(), size, cudaMemcpyDeviceToDevice));
    throwIfError(cudaStreamUpdateCaptureDependencies(stream_.get(), &newNode, 1, 1));
    return TransferNode{newNode, dst, src, size};
}

// Re-points the host-to-device copy inside an instantiated graph to a new
// host source; no-op when unchanged. The cached src_ is updated only after
// the CUDA call succeeds, keeping bookkeeping in sync with the exec graph.
void UploadNode::update_src(const GraphExec& exec, const void* src) {
    if (src_ != src) {
        throwIfError(cudaGraphExecMemcpyNodeSetParams1D(exec.get(), node_,
                                                        dst_.get(), src, size_, cudaMemcpyHostToDevice));
        src_ = src;
    }
}

UploadNode::UploadNode(cudaGraphNode_t node, DevicePointer<void*> dst, const void *src,
std::size_t size)
// Private: constructed by CaptureInfo::addUploadNode with the endpoints and
// size the memcpy node was recorded with.
UploadNode::UploadNode(cudaGraphNode_t node, DevicePointer<void*> dst, const void* src, std::size_t size)
    : node_{node},
      dst_{dst},
      src_{src},
      size_{size} {
}

void DownloadNode::update_dst(const GraphExec& exec, void *dst) {
// Re-points the device-to-host copy inside an instantiated graph to a new
// host destination; no-op when unchanged. dst_ is cached only after success.
void DownloadNode::update_dst(const GraphExec& exec, void* dst) {
    if (dst_ != dst) {
        throwIfError(cudaGraphExecMemcpyNodeSetParams1D(exec.get(), node_,
                                                        dst, src_.get(), size_, cudaMemcpyDeviceToHost));
        dst_ = dst;
    }
}

DownloadNode::DownloadNode(cudaGraphNode_t node, void *dst, DevicePointer<const void*> src,
std::size_t size)
: node_{node},
dst_{dst},
src_{src},
size_{size} {
// Private: constructed by CaptureInfo::addDownloadNode.
DownloadNode::DownloadNode(cudaGraphNode_t node, void* dst, DevicePointer<const void*> src, std::size_t size)
    : node_{node}, dst_{dst}, src_{src}, size_{size} {}

// Re-points the device-to-device copy inside an instantiated graph; no-op
// when both endpoints are unchanged. The transfer size is fixed at capture
// time. The cached pointers are updated only AFTER the CUDA call succeeds,
// so a failure cannot leave this node's bookkeeping out of sync with the
// executable graph (matches UploadNode::update_src / DownloadNode::update_dst).
void CUDA::TransferNode::update_ptrs(const GraphExec& exec,
                                     CUDA::DevicePointer<void*> dst,
                                     CUDA::DevicePointer<const void*> src) {
    if (dst_ != dst || src_ != src) {
        throwIfError(cudaGraphExecMemcpyNodeSetParams1D(
            exec.get(), node_, dst.get(), src.get(), size_, cudaMemcpyDeviceToDevice));
        dst_ = dst;
        src_ = src;
    }
}

bool UploadNode::operator ==(const UploadNode &rhs) const {
// Private: constructed by CaptureInfo::addTransferNode.
CUDA::TransferNode::TransferNode(cudaGraphNode_t node,
                                 CUDA::DevicePointer<void*> dst,
                                 CUDA::DevicePointer<const void*> src,
                                 std::size_t size)
    : node_{node}, dst_{dst}, src_{src}, size_{size} {}

// Private: takes ownership of the captured launch parameters. The parameter
// binds an rvalue but is itself an lvalue inside the ctor, so it must be cast
// back to an rvalue reference to actually MOVE the argument-pointer vector
// instead of copying it (the cast is equivalent to std::move without
// requiring <utility> here).
CUDA::KernelNode::KernelNode(cudaGraphNode_t node, CUDA::NodeParams&& params)
    : node_{node}, node_params_{static_cast<CUDA::NodeParams&&>(params)} {}

// Identity comparison: same graph node with the same recorded endpoints/size.
bool UploadNode::operator==(const UploadNode& rhs) const {
    return size_ == rhs.size_ && src_ == rhs.src_ && dst_.get() == rhs.dst_.get() && node_ == rhs.node_;
}

bool DownloadNode::operator ==(const DownloadNode &rhs) const {
// Identity comparison: same graph node with the same recorded endpoints/size.
bool DownloadNode::operator==(const DownloadNode& rhs) const {
    return size_ == rhs.size_ && src_.get() == rhs.src_.get() && dst_ == rhs.dst_ && node_ == rhs.node_;
}

} // namespace CUDA
// Identity comparison: same graph node with the same recorded endpoints/size.
bool CUDA::TransferNode::operator==(const TransferNode& rhs) const {
    return size_ == rhs.size_ && src_.get() == rhs.src_.get() && dst_.get() == rhs.dst_.get() && node_ == rhs.node_;
}

// Equality includes the stored NodeParams (launch configuration plus the
// argument-pointer addresses), not just the node handle.
bool KernelNode::operator==(const KernelNode& rhs) const {
    return node_ == rhs.node_ && node_params_ == rhs.node_params_;
}

} // namespace CUDA
59 changes: 59 additions & 0 deletions modules/nvidia_plugin/src/cuda/graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <optional>

#include <cuda/node_params.hpp>
#include "runtime.hpp"

namespace CUDA {
Expand Down Expand Up @@ -92,6 +93,7 @@ class UploadNode {

private:
UploadNode(cudaGraphNode_t node, CUDA::DevicePointer<void*> dst, const void* src, std::size_t size);

cudaGraphNode_t node_;
CUDA::DevicePointer<void*> dst_;
const void* src_;
Expand All @@ -107,17 +109,64 @@ class DownloadNode {

private:
DownloadNode(cudaGraphNode_t node, void* dst, CUDA::DevicePointer<const void*> src, std::size_t size);

cudaGraphNode_t node_;
void* dst_;
CUDA::DevicePointer<const void*> src_;
std::size_t size_;
};

// Wrapper around a device-to-device memcpy node captured into a CUDA graph.
// Created only by CaptureInfo::addTransferNode(); remembers the node handle
// plus the endpoints/size it was recorded with so redundant updates of the
// instantiated graph can be skipped.
class TransferNode {
    friend CaptureInfo;

public:
    // Re-points the memcpy node inside an instantiated graph exec to new
    // device buffers; the transfer size is fixed at capture time.
    void update_ptrs(const GraphExec& exec, CUDA::DevicePointer<void*> dst, CUDA::DevicePointer<const void*> src);
    bool operator==(const TransferNode& rhs) const;

private:
    TransferNode(cudaGraphNode_t node,
                 CUDA::DevicePointer<void*> dst,
                 CUDA::DevicePointer<const void*> src,
                 std::size_t size);

    cudaGraphNode_t node_;                  // node inside the captured graph
    CUDA::DevicePointer<void*> dst_;        // recorded destination
    CUDA::DevicePointer<const void*> src_;  // recorded source
    std::size_t size_;                      // transfer size in bytes
};

bool operator==(const cudaKernelNodeParams& lhs, const cudaKernelNodeParams& rhs);

// Wrapper around a kernel node captured into a CUDA graph; owns the
// NodeParams used to (re)configure the kernel launch. Created only by
// CaptureInfo::addKernelNode().
class KernelNode {
    friend CaptureInfo;

public:
    // Replaces the kernel arguments of this node inside an instantiated
    // graph exec; the kernel and grid/block dimensions are kept.
    // NOTE(review): add_args stores the ADDRESSES of the passed arguments;
    // they must stay alive until cudaGraphExecKernelNodeSetParams reads
    // them during this call — confirm the CUDA API copies them eagerly.
    template <typename... Args>
    void update_params(const GraphExec& exec, Args&&... args) {
        node_params_.reset_args();
        node_params_.add_args(std::forward<Args>(args)...);
        throwIfError(cudaGraphExecKernelNodeSetParams(exec.get(), node_, &node_params_.get_knp()));
    }

    bool operator==(const KernelNode& rhs) const;

private:
    KernelNode(cudaGraphNode_t node, CUDA::NodeParams&& params);

    cudaGraphNode_t node_;          // node inside the captured graph
    CUDA::NodeParams node_params_;  // stored launch config + argument pointers
};

class CaptureInfo {
public:
CaptureInfo(const Stream& capturedStream);
UploadNode addUploadNode(CUDA::DevicePointer<void*> dst, const void* src, std::size_t size);
DownloadNode addDownloadNode(void* dst, CUDA::DevicePointer<const void*> src, std::size_t size);
TransferNode addTransferNode(CUDA::DevicePointer<void*> dst,
CUDA::DevicePointer<const void*> src,
std::size_t size);
template <typename... Args>
KernelNode addKernelNode(void* kernel, dim3 gridDim, dim3 blockDim, Args&&... args);

private:
const Stream& stream_;
Expand All @@ -127,4 +176,14 @@ class CaptureInfo {
size_t depCount_;
};

// Records a kernel launch as an explicit node in the currently capturing
// graph and registers it as the dependency for subsequently captured work.
// The NodeParams (with the argument pointers) are moved into the returned
// KernelNode so later update_params() calls can reuse them.
template <typename... Args>
KernelNode CaptureInfo::addKernelNode(void* kernel, dim3 gridDim, dim3 blockDim, Args&&... args) {
    cudaGraphNode_t newNode;
    CUDA::NodeParams params{kernel, gridDim, blockDim};
    params.add_args(std::forward<Args>(args)...);
    throwIfError(cudaGraphAddKernelNode(&newNode, capturingGraph_, deps_, depCount_, &params.get_knp()));
    throwIfError(cudaStreamUpdateCaptureDependencies(stream_.get(), &newNode, 1, 1));
    return KernelNode{newNode, std::move(params)};
}

} // namespace CUDA
50 changes: 50 additions & 0 deletions modules/nvidia_plugin/src/cuda/node_params.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Copyright (C) 2020-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <cuda_runtime_api.h>

#include <cuda/utils.hpp>
#include <vector>

namespace CUDA {

// Owns the launch configuration (kernel, grid/block dimensions) and the list
// of kernel-argument pointers needed to build a cudaKernelNodeParams for
// cudaGraphAddKernelNode / cudaGraphExecKernelNodeSetParams.
struct NodeParams {
    NodeParams(void* kernel, dim3 gridDim, dim3 blockDim) : knp_{kernel, gridDim, blockDim, 0u, nullptr, nullptr} {
        ptrs_.reserve(20);  // typical upper bound on argument count; avoids reallocation
    }

    // Registers a single kernel argument. Stores the ADDRESS of `value`, so
    // the caller must keep the argument alive until the params are consumed
    // by the CUDA call. const_cast is needed because kernelParams is void**.
    template <typename T>
    void add_args(const T& value) {
        ptrs_.emplace_back(const_cast<T*>(&value));
    }

    // Registers several kernel arguments, preserving their order.
    // (The first argument is a plain const lvalue reference — no forwarding
    // is involved, so it is passed on directly.)
    template <typename T, typename... Args>
    void add_args(const T& arg, Args&&... args) {
        add_args(arg);
        add_args(std::forward<Args>(args)...);
    }

    // Finalizes and returns the node parameters. kernelParams is re-bound on
    // every call so it stays valid even after the vector reallocated.
    const cudaKernelNodeParams& get_knp() {
        knp_.kernelParams = ptrs_.data();
        return knp_;
    }

    // Drops the registered argument pointers; the launch config is kept.
    void reset_args() { ptrs_.clear(); }

    friend bool operator==(const NodeParams& lhs, const NodeParams& rhs);

private:
    std::vector<void*> ptrs_;    // addresses of the registered kernel arguments
    cudaKernelNodeParams knp_;   // kernelParams is patched in get_knp()
};

// Equality compares the stored argument ADDRESSES (ptrs_), not the argument
// values, plus the launch configuration. knp_.kernelParams is deliberately
// not compared: it is derived from ptrs_ inside get_knp().
inline bool operator==(const NodeParams& lhs, const NodeParams& rhs) {
    return lhs.ptrs_ == rhs.ptrs_ && rhs.knp_.func == lhs.knp_.func && rhs.knp_.gridDim == lhs.knp_.gridDim &&
           rhs.knp_.blockDim == lhs.knp_.blockDim && rhs.knp_.sharedMemBytes == lhs.knp_.sharedMemBytes &&
           rhs.knp_.extra == lhs.knp_.extra;
}

} // namespace CUDA
15 changes: 15 additions & 0 deletions modules/nvidia_plugin/src/cuda/utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Copyright (C) 2020-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <cuda_runtime_api.h>

namespace CUDA {

// Component-wise comparison helpers for CUDA's dim3 (the CUDA runtime does
// not provide these). Parameters are named in the conventional lhs/rhs order
// (the originals confusingly declared `rhs` first).
inline bool operator==(dim3 lhs, dim3 rhs) { return lhs.x == rhs.x && lhs.y == rhs.y && lhs.z == rhs.z; }

// Defined as the negation of operator== so the two always stay consistent.
inline bool operator!=(dim3 lhs, dim3 rhs) { return !(lhs == rhs); }

} // namespace CUDA
2 changes: 1 addition & 1 deletion modules/nvidia_plugin/src/cuda_eager_topology_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ namespace nvidia_gpu {
EagerTopologyRunner::EagerTopologyRunner(const CreationContext& context, const std::shared_ptr<const ov::Model>& model)
: SubGraph(context, model) {}

void EagerTopologyRunner::Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const {
void EagerTopologyRunner::Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const {
Workbuffers workbuffers{};
workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data());
SubGraph::Execute(context, {}, {}, workbuffers);
Expand Down
2 changes: 1 addition & 1 deletion modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class EagerTopologyRunner final : public SubGraph, public ITopologyRunner {
EagerTopologyRunner(const CreationContext& context, const std::shared_ptr<const ov::Model>& model);
~EagerTopologyRunner() override = default;

void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override;
void Run(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override;
void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override{};
const SubGraph& GetSubGraph() const override;
};
Expand Down
Loading
Loading