Skip to content

Commit

Permalink
Replace slice/insert for kernel nodes
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrii Pavliuk committed Nov 7, 2023
1 parent e814b56 commit 6da3886
Show file tree
Hide file tree
Showing 10 changed files with 340 additions and 252 deletions.
60 changes: 35 additions & 25 deletions modules/nvidia_plugin/src/cuda/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,18 +132,25 @@ TransferNode CaptureInfo::addTransferNode(CUDA::DevicePointer<void *> dst,
return TransferNode{newNode, dst, src, size};
}

InsertNode CaptureInfo::addInsertNode(std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams) {
cudaGraphNode_t newNode;
throwIfError(cudaGraphAddKernelNode(&newNode, capturingGraph_, deps_, depCount_, insertParams->getKnp()));
throwIfError(cudaStreamUpdateCaptureDependencies(stream_.get(), &newNode, 1, 1));
return InsertNode{newNode, std::move(insertParams)};
}
// InsertNode CaptureInfo::addInsertNode(std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams) {
// cudaGraphNode_t newNode;
// throwIfError(cudaGraphAddKernelNode(&newNode, capturingGraph_, deps_, depCount_, insertParams->getKnp()));
// throwIfError(cudaStreamUpdateCaptureDependencies(stream_.get(), &newNode, 1, 1));
// return InsertNode{newNode, std::move(insertParams)};
// }

SliceNode CaptureInfo::addSliceNode(std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams) {
// SliceNode CaptureInfo::addSliceNode(std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams) {
// cudaGraphNode_t newNode;
// throwIfError(cudaGraphAddKernelNode(&newNode, capturingGraph_, deps_, depCount_, sliceParams->getKnp()));
// throwIfError(cudaStreamUpdateCaptureDependencies(stream_.get(), &newNode, 1, 1));
// return SliceNode{newNode, std::move(sliceParams)};
// }

KernelNode CaptureInfo::addKernelNode(const cudaKernelNodeParams& knp) {
cudaGraphNode_t newNode;
throwIfError(cudaGraphAddKernelNode(&newNode, capturingGraph_, deps_, depCount_, sliceParams->getKnp()));
throwIfError(cudaGraphAddKernelNode(&newNode, capturingGraph_, deps_, depCount_, &knp));
throwIfError(cudaStreamUpdateCaptureDependencies(stream_.get(), &newNode, 1, 1));
return SliceNode{newNode, std::move(sliceParams)};
return KernelNode{newNode, knp};
}

void UploadNode::update_src(const GraphExec& exec, const void *src) {
Expand Down Expand Up @@ -194,25 +201,28 @@ CUDA::TransferNode::TransferNode(cudaGraphNode_t node, CUDA::DevicePointer<void
size_{size} {
}

void CUDA::InsertNode::update_params(const GraphExec &exec, std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams) {
insert_params_ = std::move(insertParams);
throwIfError(cudaGraphExecKernelNodeSetParams(exec.get(), node_, insert_params_->getKnp()));
}
// void CUDA::InsertNode::update_params(const GraphExec &exec, std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams) {
// insert_params_ = std::move(insertParams);
// throwIfError(cudaGraphExecKernelNodeSetParams(exec.get(), node_, insert_params_->getKnp()));
// }

CUDA::InsertNode::InsertNode(cudaGraphNode_t node, std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams)
: node_{node},
insert_params_{std::move(insertParams)} {
}
// CUDA::InsertNode::InsertNode(cudaGraphNode_t node, std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams)
// : node_{node},
// insert_params_{std::move(insertParams)} {
// }

void CUDA::SliceNode::update_params(const GraphExec &exec, std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams) {
slice_params_ = std::move(sliceParams);
throwIfError(cudaGraphExecKernelNodeSetParams(exec.get(), node_, slice_params_->getKnp()));
}
// void CUDA::SliceNode::update_params(const GraphExec &exec, std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams) {
// slice_params_ = std::move(sliceParams);
// throwIfError(cudaGraphExecKernelNodeSetParams(exec.get(), node_, slice_params_->getKnp()));
// }

CUDA::SliceNode::SliceNode(cudaGraphNode_t node, std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams)
: node_{node},
slice_params_{std::move(sliceParams)} {
}
// CUDA::SliceNode::SliceNode(cudaGraphNode_t node, std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams)
// : node_{node},
// slice_params_{std::move(sliceParams)} {
// }

CUDA::KernelNode::KernelNode(cudaGraphNode_t node, const cudaKernelNodeParams &knp) : node_{node}, knp_{&knp} {}
// CUDA::KernelNode::KernelNode(cudaGraphNode_t node) : node_{node} {}

bool UploadNode::operator==(const UploadNode &rhs) const {
return size_ == rhs.size_ && src_ == rhs.src_ && dst_.get() == rhs.dst_.get() && node_ == rhs.node_;
Expand Down
75 changes: 47 additions & 28 deletions modules/nvidia_plugin/src/cuda/graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,40 +131,58 @@ class TransferNode {
std::size_t size_;
};

class InsertNode {
// class InsertNode {
// friend CaptureInfo;

// public:
// // InsertNode() = default;
// // InsertNode(const InsertNode&) = default;
// // InsertNode(InsertNode&&) = default;
// void update_params(const GraphExec& exec, std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams);
// // bool operator==(const InsertNode& rhs) const;

// private:
// InsertNode(cudaGraphNode_t node, std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> kernelParams);
// cudaGraphNode_t node_;
// std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insert_params_;
// // cudaKernelNodeParams knp_;
// };

// class SliceNode {
// friend CaptureInfo;

// public:
// // SliceNode() = default;
// // SliceNode(const SliceNode&) = default;
// // SliceNode(SliceNode&&) = default;
// void update_params(const GraphExec& exec, std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams);
// // bool operator==(const InsertNode& rhs) const;

// private:
// SliceNode(cudaGraphNode_t node, std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> kernelParams);
// cudaGraphNode_t node_;
// // std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> slice_params_;
// // cudaKernelNodeParams knp_;
// };

class KernelNode {
friend CaptureInfo;

public:
// InsertNode() = default;
// InsertNode(const InsertNode&) = default;
// InsertNode(InsertNode&&) = default;
void update_params(const GraphExec& exec, std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams);
// bool operator==(const InsertNode& rhs) const;

private:
InsertNode(cudaGraphNode_t node, std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> kernelParams);
cudaGraphNode_t node_;
std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insert_params_;
// cudaKernelNodeParams knp_;

};

class SliceNode {
friend CaptureInfo;
// void update_params(const GraphExec& exec);
// void KernelNode::update_params(const GraphExec &exec) {
inline void update_params(const GraphExec& exec, const cudaKernelNodeParams& knp) {
// slice_params_ = std::move(sliceParams);
knp_ = &knp;
throwIfError(cudaGraphExecKernelNodeSetParams(exec.get(), node_, knp_));
}

public:
// SliceNode() = default;
// SliceNode(const SliceNode&) = default;
// SliceNode(SliceNode&&) = default;
void update_params(const GraphExec& exec, std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams);
// bool operator==(const InsertNode& rhs) const;

private:
SliceNode(cudaGraphNode_t node, std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> kernelParams);
KernelNode(cudaGraphNode_t node, const cudaKernelNodeParams& knp);
cudaGraphNode_t node_;
std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> slice_params_;
// cudaKernelNodeParams knp_;

const cudaKernelNodeParams* knp_;
};

class CaptureInfo {
Expand All @@ -173,8 +191,9 @@ class CaptureInfo {
UploadNode addUploadNode(CUDA::DevicePointer<void*> dst, const void* src, std::size_t size);
DownloadNode addDownloadNode(void* dst, CUDA::DevicePointer<const void*> src, std::size_t size);
TransferNode addTransferNode(CUDA::DevicePointer<void*> dst, CUDA::DevicePointer<const void*> src, std::size_t size);
InsertNode addInsertNode(std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams);
SliceNode addSliceNode(std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams);
// InsertNode addInsertNode(std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams);
// SliceNode addSliceNode(std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams);
KernelNode addKernelNode(const cudaKernelNodeParams& knp);

private:
const Stream& stream_;
Expand Down
111 changes: 69 additions & 42 deletions modules/nvidia_plugin/src/cuda_graph_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,20 +54,26 @@ void CudaGraphContext::add_transfer(const std::string& ti_op_name,
ti_graphs_.at(ti_op_name).add_transfer(stream, dst, src, size);
}

void CudaGraphContext::add_slice(const std::string& ti_op_name,
const CUDA::Stream& stream,
std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams) {
// OPENVINO_ASSERT(ti_graphs_.is_initialized(), "TI graph not initialized");
ti_graphs_.at(ti_op_name).add_slice(stream, std::move(sliceParams));
}

void CudaGraphContext::add_insert(const std::string& ti_op_name,
// void CudaGraphContext::add_slice(const std::string& ti_op_name,
// const CUDA::Stream& stream,
// std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams) {
// // OPENVINO_ASSERT(ti_graphs_.is_initialized(), "TI graph not initialized");
// ti_graphs_.at(ti_op_name).add_slice(stream, std::move(sliceParams));
// }

void CudaGraphContext::add_kernel(const std::string& ti_op_name,
const CUDA::Stream& stream,
std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams) {
// OPENVINO_ASSERT(ti_graphs_.is_initialized(), "TI graph not initialized");
ti_graphs_.at(ti_op_name).add_insert(stream, std::move(insertParams));
const cudaKernelNodeParams& knp) {
ti_graphs_.at(ti_op_name).add_kernel(stream, knp);
}

// void CudaGraphContext::add_insert(const std::string& ti_op_name,
// const CUDA::Stream& stream,
// std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams) {
// // OPENVINO_ASSERT(ti_graphs_.is_initialized(), "TI graph not initialized");
// ti_graphs_.at(ti_op_name).add_insert(stream, std::move(insertParams));
// }

void CudaGraphContext::add_graph(const CUDA::Graph& graph) {
OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency");
graphs_[currentGraphIndex_].set_graph(graph);
Expand All @@ -89,19 +95,26 @@ void CudaGraphContext::update_capture(const TensorMappingContext& context) {
}
}

void CudaGraphContext::update_slice(const std::string& ti_op_name,
// void CudaGraphContext::update_slice(const std::string& ti_op_name,
// std::size_t index,
// std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams) const {
// OPENVINO_ASSERT(ti_graphs_.at(ti_op_name).is_initialized(), "TI graph not initialized");
// ti_graphs_.at(ti_op_name).update_slice(index, std::move(sliceParams));
// }

void CudaGraphContext::update_kernel(const std::string& ti_op_name,
std::size_t index,
std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams) const {
const cudaKernelNodeParams& knp) const {
OPENVINO_ASSERT(ti_graphs_.at(ti_op_name).is_initialized(), "TI graph not initialized");
ti_graphs_.at(ti_op_name).update_slice(index, std::move(sliceParams));
ti_graphs_.at(ti_op_name).update_kernel(index, knp);
}

void CudaGraphContext::update_insert(const std::string& ti_op_name,
std::size_t index,
std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams) const {
OPENVINO_ASSERT(ti_graphs_.at(ti_op_name).is_initialized(), "TI graph not initialized");
ti_graphs_.at(ti_op_name).update_insert(index, std::move(insertParams));
}
// void CudaGraphContext::update_insert(const std::string& ti_op_name,
// std::size_t index,
// std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams) const {
// OPENVINO_ASSERT(ti_graphs_.at(ti_op_name).is_initialized(), "TI graph not initialized");
// ti_graphs_.at(ti_op_name).update_insert(index, std::move(insertParams));
// }

void CudaGraphContext::launch(std::size_t index, const CUDA::Stream& stream) const {
currentGraphIndex_ = index;
Expand Down Expand Up @@ -135,14 +148,18 @@ std::size_t CudaGraphContext::get_transfers_count(const std::string& ti_op_name)
}


std::size_t CudaGraphContext::get_slices_count(const std::string& ti_op_name) const {
return ti_graphs_.at(ti_op_name).get_slices_count();
}
// std::size_t CudaGraphContext::get_slices_count(const std::string& ti_op_name) const {
// return ti_graphs_.at(ti_op_name).get_slices_count();
// }

std::size_t CudaGraphContext::get_inserts_count(const std::string& ti_op_name) const {
return ti_graphs_.at(ti_op_name).get_inserts_count();
std::size_t CudaGraphContext::get_kernels_count(const std::string& ti_op_name) const {
return ti_graphs_.at(ti_op_name).get_kernels_count();
}

// std::size_t CudaGraphContext::get_inserts_count(const std::string& ti_op_name) const {
// return ti_graphs_.at(ti_op_name).get_inserts_count();
// }

std::size_t CudaGraphContext::get_graphs_count() const {
return graphs_.size();
}
Expand Down Expand Up @@ -173,18 +190,23 @@ void CudaGraphContext::CudaGraphInfo::add_transfer(const CUDA::Stream& stream,
transferNodes_.emplace_back(captureInfo.addTransferNode(dst, src, size));
}

void CudaGraphContext::CudaGraphInfo::add_slice(const CUDA::Stream& stream,
std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams) {
CUDA::CaptureInfo captureInfo{stream};
sliceNodes_.emplace_back(captureInfo.addSliceNode(std::move(sliceParams)));
}
// void CudaGraphContext::CudaGraphInfo::add_slice(const CUDA::Stream& stream,
// std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams) {
// CUDA::CaptureInfo captureInfo{stream};
// sliceNodes_.emplace_back(captureInfo.addSliceNode(std::move(sliceParams)));
// }

void CudaGraphContext::CudaGraphInfo::add_insert(const CUDA::Stream& stream,
std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams) {
void CudaGraphContext::CudaGraphInfo::add_kernel(const CUDA::Stream& stream, const cudaKernelNodeParams& knp) {
CUDA::CaptureInfo captureInfo{stream};
insertNodes_.emplace_back(captureInfo.addInsertNode(std::move(insertParams)));
kernelNodes_.emplace_back(captureInfo.addKernelNode(knp));
}

// void CudaGraphContext::CudaGraphInfo::add_insert(const CUDA::Stream& stream,
// std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams) {
// CUDA::CaptureInfo captureInfo{stream};
// insertNodes_.emplace_back(captureInfo.addInsertNode(std::move(insertParams)));
// }

void CudaGraphContext::CudaGraphInfo::set_graph(const CUDA::Graph& graph) {
graph_.emplace(graph);
graphExec_.emplace(graph);
Expand All @@ -201,16 +223,20 @@ void CudaGraphContext::CudaGraphInfo::update_capture(const TensorMappingContext&
}
}

void CudaGraphContext::CudaGraphInfo::update_slice(std::size_t index,
std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams) {
sliceNodes_[index].update_params(graphExec_.value(), std::move(sliceParams));
}
// void CudaGraphContext::CudaGraphInfo::update_slice(std::size_t index,
// std::unique_ptr<ov::nvidia_gpu::kernel::Slice::Params> sliceParams) {
// sliceNodes_[index].update_params(graphExec_.value(), std::move(sliceParams));
// }

void CudaGraphContext::CudaGraphInfo::update_insert(
std::size_t index, std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams) {
insertNodes_[index].update_params(graphExec_.value(), std::move(insertParams));
void CudaGraphContext::CudaGraphInfo::update_kernel(std::size_t index, const cudaKernelNodeParams& knp) {
kernelNodes_[index].update_params(graphExec_.value(), knp);
}

// void CudaGraphContext::CudaGraphInfo::update_insert(
// std::size_t index, std::unique_ptr<ov::nvidia_gpu::kernel::Insert::Params> insertParams) {
// insertNodes_[index].update_params(graphExec_.value(), std::move(insertParams));
// }

void CudaGraphContext::CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); }

std::size_t CudaGraphContext::CudaGraphInfo::get_params_count() const { return parameterNodes_.size(); }
Expand All @@ -219,9 +245,10 @@ std::size_t CudaGraphContext::CudaGraphInfo::get_results_count() const { return

std::size_t CudaGraphContext::CudaGraphInfo::get_transfers_count() const { return transferNodes_.size(); }

std::size_t CudaGraphContext::CudaGraphInfo::get_slices_count() const { return sliceNodes_.size(); }
// std::size_t CudaGraphContext::CudaGraphInfo::get_slices_count() const { return sliceNodes_.size(); }
std::size_t CudaGraphContext::CudaGraphInfo::get_kernels_count() const { return kernelNodes_.size(); }

std::size_t CudaGraphContext::CudaGraphInfo::get_inserts_count() const { return insertNodes_.size(); }
// std::size_t CudaGraphContext::CudaGraphInfo::get_inserts_count() const { return insertNodes_.size(); }

bool operator==(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) {
return lhs.graph_ == rhs.graph_ && lhs.graphExec_ == rhs.graphExec_ && lhs.parameterNodes_ == rhs.parameterNodes_ &&
Expand Down
Loading

0 comments on commit 6da3886

Please sign in to comment.