From 6da3886469b4549ec15bbeb42500d32a999e362a Mon Sep 17 00:00:00 2001
From: Andrii Pavliuk
Date: Tue, 7 Nov 2023 16:24:55 +0200
Subject: [PATCH] Replace slice/insert nodes with generic kernel nodes

---
 modules/nvidia_plugin/src/cuda/graph.cpp      |  60 ++++++----
 modules/nvidia_plugin/src/cuda/graph.hpp      |  75 ++++++++-----
 .../nvidia_plugin/src/cuda_graph_context.cpp  | 111 +++++++++++-------
 .../nvidia_plugin/src/cuda_graph_context.hpp  |  52 +++++---
 modules/nvidia_plugin/src/kernels/insert.cu   | 103 ++++++++--------
 modules/nvidia_plugin/src/kernels/insert.hpp  |  16 ++-
 modules/nvidia_plugin/src/kernels/slice.cu    | 115 +++++++++---------
 modules/nvidia_plugin/src/kernels/slice.hpp   |  20 +++-
 .../nvidia_plugin/src/ops/tensor_iterator.cpp |  16 ++-
 .../nvidia_plugin/src/ops/tensor_iterator.hpp |  24 ++--
 10 files changed, 340 insertions(+), 252 deletions(-)

diff --git a/modules/nvidia_plugin/src/cuda/graph.cpp b/modules/nvidia_plugin/src/cuda/graph.cpp
index ef8917197..2bdd3b421 100644
--- a/modules/nvidia_plugin/src/cuda/graph.cpp
+++ b/modules/nvidia_plugin/src/cuda/graph.cpp
@@ -132,18 +132,25 @@ TransferNode CaptureInfo::addTransferNode(CUDA::DevicePointer<void*> dst,
     return TransferNode{newNode, dst, src, size};
 }
 
-InsertNode CaptureInfo::addInsertNode(std::unique_ptr<kernel::Insert::Params> insertParams) {
-    cudaGraphNode_t newNode;
-    throwIfError(cudaGraphAddKernelNode(&newNode, capturingGraph_, deps_, depCount_, insertParams->getKnp()));
-    throwIfError(cudaStreamUpdateCaptureDependencies(stream_.get(), &newNode, 1, 1));
-    return InsertNode{newNode, std::move(insertParams)};
-}
+// InsertNode CaptureInfo::addInsertNode(std::unique_ptr<kernel::Insert::Params> insertParams) {
+//     cudaGraphNode_t newNode;
+//     throwIfError(cudaGraphAddKernelNode(&newNode, capturingGraph_, deps_, depCount_, insertParams->getKnp()));
+//     throwIfError(cudaStreamUpdateCaptureDependencies(stream_.get(), &newNode, 1, 1));
+//     return InsertNode{newNode, std::move(insertParams)};
+// }
 
-SliceNode CaptureInfo::addSliceNode(std::unique_ptr<kernel::Slice::Params> sliceParams) {
+// SliceNode CaptureInfo::addSliceNode(std::unique_ptr<kernel::Slice::Params> sliceParams) {
+//     cudaGraphNode_t newNode;
+//     throwIfError(cudaGraphAddKernelNode(&newNode, capturingGraph_, deps_, depCount_, sliceParams->getKnp()));
+//     throwIfError(cudaStreamUpdateCaptureDependencies(stream_.get(), &newNode, 1, 1));
+//     return SliceNode{newNode, std::move(sliceParams)};
+// }
+
+KernelNode CaptureInfo::addKernelNode(const cudaKernelNodeParams& knp) {
     cudaGraphNode_t newNode;
-    throwIfError(cudaGraphAddKernelNode(&newNode, capturingGraph_, deps_, depCount_, sliceParams->getKnp()));
+    throwIfError(cudaGraphAddKernelNode(&newNode, capturingGraph_, deps_, depCount_, &knp));
     throwIfError(cudaStreamUpdateCaptureDependencies(stream_.get(), &newNode, 1, 1));
-    return SliceNode{newNode, std::move(sliceParams)};
+    return KernelNode{newNode, knp};
 }
 
 void UploadNode::update_src(const GraphExec& exec, const void *src) {
@@ -194,25 +201,28 @@ CUDA::TransferNode::TransferNode(cudaGraphNode_t node,
-void CUDA::InsertNode::update_params(const GraphExec &exec, std::unique_ptr<kernel::Insert::Params> insertParams) {
-    insert_params_ = std::move(insertParams);
-    throwIfError(cudaGraphExecKernelNodeSetParams(exec.get(), node_, insert_params_->getKnp()));
-}
+// void CUDA::InsertNode::update_params(const GraphExec &exec, std::unique_ptr<kernel::Insert::Params> insertParams) {
+//     insert_params_ = std::move(insertParams);
+//     throwIfError(cudaGraphExecKernelNodeSetParams(exec.get(), node_, insert_params_->getKnp()));
+// }
 
-CUDA::InsertNode::InsertNode(cudaGraphNode_t node, std::unique_ptr<kernel::Insert::Params> insertParams)
-    : node_{node},
-      insert_params_{std::move(insertParams)} {
-}
+// CUDA::InsertNode::InsertNode(cudaGraphNode_t node, std::unique_ptr<kernel::Insert::Params> insertParams)
+//     : node_{node},
+//       insert_params_{std::move(insertParams)} {
+// }
 
-void CUDA::SliceNode::update_params(const GraphExec &exec, std::unique_ptr<kernel::Slice::Params> sliceParams) {
-    slice_params_ = std::move(sliceParams);
-    throwIfError(cudaGraphExecKernelNodeSetParams(exec.get(), node_, slice_params_->getKnp()));
-}
+// void CUDA::SliceNode::update_params(const GraphExec &exec, std::unique_ptr<kernel::Slice::Params> sliceParams) {
+//     slice_params_ = std::move(sliceParams);
+//     throwIfError(cudaGraphExecKernelNodeSetParams(exec.get(), node_, slice_params_->getKnp()));
+// }
 
-CUDA::SliceNode::SliceNode(cudaGraphNode_t node, std::unique_ptr<kernel::Slice::Params> sliceParams)
-    : node_{node},
-      slice_params_{std::move(sliceParams)} {
-}
+// CUDA::SliceNode::SliceNode(cudaGraphNode_t node, std::unique_ptr<kernel::Slice::Params> sliceParams)
+//     : node_{node},
+//       slice_params_{std::move(sliceParams)} {
+// }
+
+CUDA::KernelNode::KernelNode(cudaGraphNode_t node, const cudaKernelNodeParams& knp) : node_{node}, knp_{&knp} {}
 
 bool UploadNode::operator==(const UploadNode &rhs) const {
     return size_ == rhs.size_ && src_ == rhs.src_ && dst_.get() == rhs.dst_.get() && node_ == rhs.node_;
diff --git a/modules/nvidia_plugin/src/cuda/graph.hpp b/modules/nvidia_plugin/src/cuda/graph.hpp
index baaaf28c3..92fe33cb9 100644
--- a/modules/nvidia_plugin/src/cuda/graph.hpp
+++ b/modules/nvidia_plugin/src/cuda/graph.hpp
@@ -131,40 +131,58 @@ class TransferNode {
     std::size_t size_;
 };
 
-class InsertNode {
+// class InsertNode {
+//     friend CaptureInfo;
+
+// public:
+//     // InsertNode() = default;
+//     // InsertNode(const InsertNode&) = default;
+//     // InsertNode(InsertNode&&) = default;
+//     void update_params(const GraphExec& exec, std::unique_ptr<kernel::Insert::Params> insertParams);
+//     // bool operator==(const InsertNode& rhs) const;
+
+// private:
+//     InsertNode(cudaGraphNode_t node, std::unique_ptr<kernel::Insert::Params> kernelParams);
+//     cudaGraphNode_t node_;
+//     std::unique_ptr<kernel::Insert::Params> insert_params_;
+// };
+
+// class SliceNode {
+//     friend CaptureInfo;
+
+// public:
+//     // SliceNode() = default;
+//     // SliceNode(const SliceNode&) = default;
+//     // SliceNode(SliceNode&&) = default;
+//     void update_params(const GraphExec& exec, std::unique_ptr<kernel::Slice::Params> sliceParams);
+//     // bool operator==(const SliceNode& rhs) const;
+
+// private:
+//     SliceNode(cudaGraphNode_t node, std::unique_ptr<kernel::Slice::Params> kernelParams);
+//     cudaGraphNode_t node_;
+//     std::unique_ptr<kernel::Slice::Params> slice_params_;
+// };
+
+class KernelNode {
     friend CaptureInfo;
 
 public:
-    // InsertNode() = default;
-    // InsertNode(const InsertNode&) = default;
-    // InsertNode(InsertNode&&) = default;
-    void update_params(const GraphExec& exec, std::unique_ptr<kernel::Insert::Params> insertParams);
-    // bool operator==(const InsertNode& rhs) const;
-
-private:
-    InsertNode(cudaGraphNode_t node, std::unique_ptr<kernel::Insert::Params> kernelParams);
-    cudaGraphNode_t node_;
-    std::unique_ptr<kernel::Insert::Params> insert_params_;
-
-};
-
-class SliceNode {
-    friend CaptureInfo;
+    // Rebinds this node in the instantiated graph to freshly refilled launch params.
+    inline void update_params(const GraphExec& exec, const cudaKernelNodeParams& knp) {
+        knp_ = &knp;
+        throwIfError(cudaGraphExecKernelNodeSetParams(exec.get(), node_, knp_));
+    }
 
-public:
-    // SliceNode() = default;
-    // SliceNode(const SliceNode&) = default;
-    // SliceNode(SliceNode&&) = default;
-    void update_params(const GraphExec& exec, std::unique_ptr<kernel::Slice::Params> sliceParams);
-    // bool operator==(const InsertNode& rhs) const;
+    // bool operator==(const KernelNode& rhs) const;
 
 private:
-    SliceNode(cudaGraphNode_t node, std::unique_ptr<kernel::Slice::Params> kernelParams);
+    KernelNode(cudaGraphNode_t node, const cudaKernelNodeParams& knp);
     cudaGraphNode_t node_;
-    std::unique_ptr<kernel::Slice::Params> slice_params_;
-
+    const cudaKernelNodeParams* knp_;  // non-owning; points at the op's long-lived Params
 };
 
 class CaptureInfo {
@@ -173,8 +191,9 @@ class CaptureInfo {
     UploadNode addUploadNode(CUDA::DevicePointer<void*> dst, const void* src, std::size_t size);
     DownloadNode addDownloadNode(void* dst, CUDA::DevicePointer<const void*> src, std::size_t size);
     TransferNode addTransferNode(CUDA::DevicePointer<void*> dst, CUDA::DevicePointer<const void*> src, std::size_t size);
-    InsertNode addInsertNode(std::unique_ptr<kernel::Insert::Params> insertParams);
-    SliceNode addSliceNode(std::unique_ptr<kernel::Slice::Params> sliceParams);
+    // InsertNode addInsertNode(std::unique_ptr<kernel::Insert::Params> insertParams);
+    // SliceNode addSliceNode(std::unique_ptr<kernel::Slice::Params> sliceParams);
+    KernelNode addKernelNode(const cudaKernelNodeParams& knp);
 
 private:
     const Stream& stream_;
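Reviewer note: the new KernelNode/addKernelNode pair is a thin wrapper over the stock CUDA graph APIs, which is what lets one generic node type serve both Slice and Insert. The sketch below shows the same mechanism standalone; it assumes nothing from the plugin (the toy `scale` kernel, buffer size, and launch shape are made up for illustration, error checks are omitted, and the 5-argument cudaGraphInstantiate overload is the CUDA 11.x one this plugin builds against):

    #include <cuda_runtime.h>

    __global__ void scale(float* data, float factor) { data[threadIdx.x] *= factor; }

    int main() {
        float* buf = nullptr;
        cudaMalloc(&buf, 32 * sizeof(float));
        float factor = 2.0f;
        void* args[] = {&buf, &factor};  // addresses of the argument variables

        cudaKernelNodeParams knp{};
        knp.func = reinterpret_cast<void*>(&scale);
        knp.gridDim = dim3(1);
        knp.blockDim = dim3(32);
        knp.sharedMemBytes = 0;
        knp.kernelParams = args;
        knp.extra = nullptr;

        // Build a one-node graph, as CaptureInfo::addKernelNode does inside a capture.
        cudaGraph_t graph;
        cudaGraphCreate(&graph, 0);
        cudaGraphNode_t node;
        cudaGraphAddKernelNode(&node, graph, nullptr, 0, &knp);

        cudaGraphExec_t exec;
        cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
        cudaGraphLaunch(exec, 0);

        // Re-point the node at new argument values without rebuilding the graph,
        // as KernelNode::update_params does once per TensorIterator iteration.
        factor = 0.5f;
        cudaGraphExecKernelNodeSetParams(exec, node, &knp);
        cudaGraphLaunch(exec, 0);

        cudaDeviceSynchronize();
        cudaGraphExecDestroy(exec);
        cudaGraphDestroy(graph);
        cudaFree(buf);
        return 0;
    }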
diff --git a/modules/nvidia_plugin/src/cuda_graph_context.cpp b/modules/nvidia_plugin/src/cuda_graph_context.cpp
index 03824ffee..68df50bfa 100644
--- a/modules/nvidia_plugin/src/cuda_graph_context.cpp
+++ b/modules/nvidia_plugin/src/cuda_graph_context.cpp
@@ -54,20 +54,26 @@ void CudaGraphContext::add_transfer(const std::string& ti_op_name,
     ti_graphs_.at(ti_op_name).add_transfer(stream, dst, src, size);
 }
 
-void CudaGraphContext::add_slice(const std::string& ti_op_name,
-                                 const CUDA::Stream& stream,
-                                 std::unique_ptr<kernel::Slice::Params> sliceParams) {
-    // OPENVINO_ASSERT(ti_graphs_.is_initialized(), "TI graph not initialized");
-    ti_graphs_.at(ti_op_name).add_slice(stream, std::move(sliceParams));
-}
-
-void CudaGraphContext::add_insert(const std::string& ti_op_name,
+// void CudaGraphContext::add_slice(const std::string& ti_op_name,
+//                                  const CUDA::Stream& stream,
+//                                  std::unique_ptr<kernel::Slice::Params> sliceParams) {
+//     // OPENVINO_ASSERT(ti_graphs_.is_initialized(), "TI graph not initialized");
+//     ti_graphs_.at(ti_op_name).add_slice(stream, std::move(sliceParams));
+// }
+
+void CudaGraphContext::add_kernel(const std::string& ti_op_name,
                                   const CUDA::Stream& stream,
-                                  std::unique_ptr<kernel::Insert::Params> insertParams) {
-    // OPENVINO_ASSERT(ti_graphs_.is_initialized(), "TI graph not initialized");
-    ti_graphs_.at(ti_op_name).add_insert(stream, std::move(insertParams));
+                                  const cudaKernelNodeParams& knp) {
+    ti_graphs_.at(ti_op_name).add_kernel(stream, knp);
 }
 
+// void CudaGraphContext::add_insert(const std::string& ti_op_name,
+//                                   const CUDA::Stream& stream,
+//                                   std::unique_ptr<kernel::Insert::Params> insertParams) {
+//     // OPENVINO_ASSERT(ti_graphs_.is_initialized(), "TI graph not initialized");
+//     ti_graphs_.at(ti_op_name).add_insert(stream, std::move(insertParams));
+// }
+
 void CudaGraphContext::add_graph(const CUDA::Graph& graph) {
     OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size inconsistency");
     graphs_[currentGraphIndex_].set_graph(graph);
@@ -89,19 +95,26 @@ void CudaGraphContext::update_capture(const TensorMappingContext& context) {
     }
 }
 
-void CudaGraphContext::update_slice(const std::string& ti_op_name,
-                                    std::size_t index,
-                                    std::unique_ptr<kernel::Slice::Params> sliceParams) const {
+// void CudaGraphContext::update_slice(const std::string& ti_op_name,
+//                                     std::size_t index,
+//                                     std::unique_ptr<kernel::Slice::Params> sliceParams) const {
+//     OPENVINO_ASSERT(ti_graphs_.at(ti_op_name).is_initialized(), "TI graph not initialized");
+//     ti_graphs_.at(ti_op_name).update_slice(index, std::move(sliceParams));
+// }
+
+void CudaGraphContext::update_kernel(const std::string& ti_op_name,
+                                     std::size_t index,
+                                     const cudaKernelNodeParams& knp) const {
     OPENVINO_ASSERT(ti_graphs_.at(ti_op_name).is_initialized(), "TI graph not initialized");
-    ti_graphs_.at(ti_op_name).update_slice(index, std::move(sliceParams));
+    ti_graphs_.at(ti_op_name).update_kernel(index, knp);
 }
 
-void CudaGraphContext::update_insert(const std::string& ti_op_name,
-                                     std::size_t index,
-                                     std::unique_ptr<kernel::Insert::Params> insertParams) const {
-    OPENVINO_ASSERT(ti_graphs_.at(ti_op_name).is_initialized(), "TI graph not initialized");
-    ti_graphs_.at(ti_op_name).update_insert(index, std::move(insertParams));
-}
+// void CudaGraphContext::update_insert(const std::string& ti_op_name,
+//                                      std::size_t index,
+//                                      std::unique_ptr<kernel::Insert::Params> insertParams) const {
+//     OPENVINO_ASSERT(ti_graphs_.at(ti_op_name).is_initialized(), "TI graph not initialized");
+//     ti_graphs_.at(ti_op_name).update_insert(index, std::move(insertParams));
+// }
 
 void CudaGraphContext::launch(std::size_t index, const CUDA::Stream& stream) const {
     currentGraphIndex_ = index;
@@ -135,14 +148,18 @@ std::size_t CudaGraphContext::get_transfers_count(const std::string& ti_op_name)
 }
 
-std::size_t CudaGraphContext::get_slices_count(const std::string& ti_op_name) const {
-    return ti_graphs_.at(ti_op_name).get_slices_count();
-}
+// std::size_t CudaGraphContext::get_slices_count(const std::string& ti_op_name) const {
+//     return ti_graphs_.at(ti_op_name).get_slices_count();
+// }
 
-std::size_t CudaGraphContext::get_inserts_count(const std::string& ti_op_name) const {
-    return ti_graphs_.at(ti_op_name).get_inserts_count();
+std::size_t CudaGraphContext::get_kernels_count(const std::string& ti_op_name) const {
    return ti_graphs_.at(ti_op_name).get_kernels_count();
 }
 
+// std::size_t CudaGraphContext::get_inserts_count(const std::string& ti_op_name) const {
+//     return ti_graphs_.at(ti_op_name).get_inserts_count();
+// }
+
 std::size_t CudaGraphContext::get_graphs_count() const { return graphs_.size(); }
@@ -173,18 +190,23 @@ void CudaGraphContext::CudaGraphInfo::add_transfer(const CUDA::Stream& stream,
     transferNodes_.emplace_back(captureInfo.addTransferNode(dst, src, size));
 }
 
-void CudaGraphContext::CudaGraphInfo::add_slice(const CUDA::Stream& stream,
-                                                std::unique_ptr<kernel::Slice::Params> sliceParams) {
-    CUDA::CaptureInfo captureInfo{stream};
-    sliceNodes_.emplace_back(captureInfo.addSliceNode(std::move(sliceParams)));
-}
+// void CudaGraphContext::CudaGraphInfo::add_slice(const CUDA::Stream& stream,
+//                                                 std::unique_ptr<kernel::Slice::Params> sliceParams) {
+//     CUDA::CaptureInfo captureInfo{stream};
+//     sliceNodes_.emplace_back(captureInfo.addSliceNode(std::move(sliceParams)));
+// }
 
-void CudaGraphContext::CudaGraphInfo::add_insert(const CUDA::Stream& stream,
-                                                 std::unique_ptr<kernel::Insert::Params> insertParams) {
+void CudaGraphContext::CudaGraphInfo::add_kernel(const CUDA::Stream& stream, const cudaKernelNodeParams& knp) {
     CUDA::CaptureInfo captureInfo{stream};
-    insertNodes_.emplace_back(captureInfo.addInsertNode(std::move(insertParams)));
+    kernelNodes_.emplace_back(captureInfo.addKernelNode(knp));
 }
 
+// void CudaGraphContext::CudaGraphInfo::add_insert(const CUDA::Stream& stream,
+//                                                  std::unique_ptr<kernel::Insert::Params> insertParams) {
+//     CUDA::CaptureInfo captureInfo{stream};
+//     insertNodes_.emplace_back(captureInfo.addInsertNode(std::move(insertParams)));
+// }
+
 void CudaGraphContext::CudaGraphInfo::set_graph(const CUDA::Graph& graph) {
     graph_.emplace(graph);
     graphExec_.emplace(graph);
@@ -201,16 +223,20 @@ void CudaGraphContext::CudaGraphInfo::update_capture(const TensorMappingContext&
     }
 }
 
-void CudaGraphContext::CudaGraphInfo::update_slice(std::size_t index,
-                                                   std::unique_ptr<kernel::Slice::Params> sliceParams) {
-    sliceNodes_[index].update_params(graphExec_.value(), std::move(sliceParams));
-}
+// void CudaGraphContext::CudaGraphInfo::update_slice(std::size_t index,
+//                                                    std::unique_ptr<kernel::Slice::Params> sliceParams) {
+//     sliceNodes_[index].update_params(graphExec_.value(), std::move(sliceParams));
+// }
 
-void CudaGraphContext::CudaGraphInfo::update_insert(
-    std::size_t index, std::unique_ptr<kernel::Insert::Params> insertParams) {
-    insertNodes_[index].update_params(graphExec_.value(), std::move(insertParams));
+void CudaGraphContext::CudaGraphInfo::update_kernel(std::size_t index, const cudaKernelNodeParams& knp) {
+    kernelNodes_[index].update_params(graphExec_.value(), knp);
 }
 
+// void CudaGraphContext::CudaGraphInfo::update_insert(
+//     std::size_t index, std::unique_ptr<kernel::Insert::Params> insertParams) {
+//     insertNodes_[index].update_params(graphExec_.value(), std::move(insertParams));
+// }
+
 void CudaGraphContext::CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); }
 
 std::size_t CudaGraphContext::CudaGraphInfo::get_params_count() const { return parameterNodes_.size(); }
@@ -219,9 +245,10 @@ std::size_t CudaGraphContext::CudaGraphInfo::get_results_count() const { return
 std::size_t CudaGraphContext::CudaGraphInfo::get_transfers_count() const { return transferNodes_.size(); }
 
-std::size_t CudaGraphContext::CudaGraphInfo::get_slices_count() const { return sliceNodes_.size(); }
+// std::size_t CudaGraphContext::CudaGraphInfo::get_slices_count() const { return sliceNodes_.size(); }
+std::size_t CudaGraphContext::CudaGraphInfo::get_kernels_count() const { return kernelNodes_.size(); }
 
-std::size_t CudaGraphContext::CudaGraphInfo::get_inserts_count() const { return insertNodes_.size(); }
+// std::size_t CudaGraphContext::CudaGraphInfo::get_inserts_count() const { return insertNodes_.size(); }
 
 bool operator==(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) {
     return lhs.graph_ == rhs.graph_ && lhs.graphExec_ == rhs.graphExec_ && lhs.parameterNodes_ == rhs.parameterNodes_ &&
diff --git a/modules/nvidia_plugin/src/cuda_graph_context.hpp b/modules/nvidia_plugin/src/cuda_graph_context.hpp
index 3d2ee4bc3..71ac33736 100644
--- a/modules/nvidia_plugin/src/cuda_graph_context.hpp
+++ b/modules/nvidia_plugin/src/cuda_graph_context.hpp
@@ -37,13 +37,17 @@ class CudaGraphContext {
                       CUDA::DevicePointer<const void*> src,
                       std::size_t size);
 
-    void add_slice(const std::string& ti_op_name,
-                   const CUDA::Stream& stream,
-                   std::unique_ptr<kernel::Slice::Params> sliceParams);
+    // void add_slice(const std::string& ti_op_name,
+    //                const CUDA::Stream& stream,
+    //                std::unique_ptr<kernel::Slice::Params> sliceParams);
 
-    void add_insert(const std::string& ti_op_name,
+    void add_kernel(const std::string& ti_op_name,
                     const CUDA::Stream& stream,
-                    std::unique_ptr<kernel::Insert::Params> insertParams);
+                    const cudaKernelNodeParams& knp);
+
+    // void add_insert(const std::string& ti_op_name,
+    //                 const CUDA::Stream& stream,
+    //                 std::unique_ptr<kernel::Insert::Params> insertParams);
 
     void add_graph(const CUDA::Graph& graph);
     void add_ti_graph(const std::string& ti_op_name, const CUDA::Graph& graph);
@@ -51,12 +55,15 @@ class CudaGraphContext {
     bool is_initialized() const;
 
     void update_capture(const TensorMappingContext& context);
-    void update_slice(const std::string& ti_op_name,
-                      std::size_t index,
-                      std::unique_ptr<kernel::Slice::Params> sliceParams) const;
-    void update_insert(const std::string& ti_op_name,
+    // void update_slice(const std::string& ti_op_name,
+    //                   std::size_t index,
+    //                   std::unique_ptr<kernel::Slice::Params> sliceParams) const;
+    void update_kernel(const std::string& ti_op_name,
                       std::size_t index,
-                      std::unique_ptr<kernel::Insert::Params> insertParams) const;
+                      const cudaKernelNodeParams& knp) const;
+    // void update_insert(const std::string& ti_op_name,
+    //                    std::size_t index,
+    //                    std::unique_ptr<kernel::Insert::Params> insertParams) const;
 
     void launch(std::size_t index, const CUDA::Stream& stream) const;
     void launch_ti_graph(const std::string& ti_op_name, const CUDA::Stream& stream) const;
@@ -65,8 +72,9 @@ class CudaGraphContext {
     std::size_t get_results_count() const;
 
     std::size_t get_transfers_count(const std::string& ti_op_name) const;
-    std::size_t get_slices_count(const std::string& ti_op_name) const;
-    std::size_t get_inserts_count(const std::string& ti_op_name) const;
+    // std::size_t get_slices_count(const std::string& ti_op_name) const;
+    std::size_t get_kernels_count(const std::string& ti_op_name) const;
+    // std::size_t get_inserts_count(const std::string& ti_op_name) const;
 
     std::size_t get_graphs_count() const;
 
@@ -98,16 +106,18 @@ class CudaGraphContext {
                           CUDA::DevicePointer<const void*> src,
                           std::size_t size);
 
-        void add_slice(const CUDA::Stream& stream, std::unique_ptr<kernel::Slice::Params> sliceParams);
-        void add_insert(const CUDA::Stream& stream, std::unique_ptr<kernel::Insert::Params> insertParams);
+        // void add_slice(const CUDA::Stream& stream, std::unique_ptr<kernel::Slice::Params> sliceParams);
+        void add_kernel(const CUDA::Stream& stream, const cudaKernelNodeParams& knp);
+        // void add_insert(const CUDA::Stream& stream, std::unique_ptr<kernel::Insert::Params> insertParams);
 
         void set_graph(const CUDA::Graph& graph);
 
         bool is_initialized() const;
 
         void update_capture(const TensorMappingContext& context);
-        void update_slice(std::size_t index, std::unique_ptr<kernel::Slice::Params> sliceParams);
-        void update_insert(std::size_t index, std::unique_ptr<kernel::Insert::Params> insertParams);
+        // void update_slice(std::size_t index, std::unique_ptr<kernel::Slice::Params> sliceParams);
+        void update_kernel(std::size_t index, const cudaKernelNodeParams& knp);
+        // void update_insert(std::size_t index, std::unique_ptr<kernel::Insert::Params> insertParams);
 
         void launch(const CUDA::Stream& stream) const;
 
@@ -115,8 +125,9 @@ class CudaGraphContext {
         std::size_t get_results_count() const;
 
         std::size_t get_transfers_count() const;
-        std::size_t get_slices_count() const;
-        std::size_t get_inserts_count() const;
+        // std::size_t get_slices_count() const;
+        std::size_t get_kernels_count() const;
+        // std::size_t get_inserts_count() const;
 
         friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs);
         friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs);
@@ -128,8 +139,9 @@ class CudaGraphContext {
        std::map<std::string, CUDA::DownloadNode> resultNodes_;
        std::vector<CUDA::TransferNode> transferNodes_;
 
-        std::vector<CUDA::SliceNode> sliceNodes_;
-        std::vector<CUDA::InsertNode> insertNodes_;
+        // std::vector<CUDA::SliceNode> sliceNodes_;
+        std::vector<CUDA::KernelNode> kernelNodes_;
+        // std::vector<CUDA::InsertNode> insertNodes_;
     };
 
     friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs);
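Reviewer note on the bookkeeping above: kernelNodes_ is append-ordered by capture, and update_kernel() addresses nodes by that same index, so callers must replay the capture order exactly. A small stand-in illustrating the implied contract (hypothetical helper, not part of this patch): slices are captured first, so slice i maps to index i and insert j maps to slices + j.

    #include <cstddef>

    // Simplified stand-in for CudaGraphInfo's kernel-node indexing: capture
    // appends nodes, later updates address them by capture position.
    struct TiKernelIndex {
        std::size_t slices{0};   // number of slice kernels captured first
        std::size_t inserts{0};  // number of insert kernels captured after
        std::size_t slice_index(std::size_t i) const { return i; }
        std::size_t insert_index(std::size_t j) const { return slices + j; }
        std::size_t total() const { return slices + inserts; }  // get_kernels_count()
    };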
diff --git a/modules/nvidia_plugin/src/kernels/insert.cu b/modules/nvidia_plugin/src/kernels/insert.cu
index 60118235d..af6d30edf 100644
--- a/modules/nvidia_plugin/src/kernels/insert.cu
+++ b/modules/nvidia_plugin/src/kernels/insert.cu
@@ -37,105 +37,98 @@ Insert::Insert(const Type_t element_type, const Props& props, const size_t max_threads_per_block)
     : element_type_{element_type}, props_{props}, size_{shape_size(props.old_shape)} {
     TypeValidator<AllElementTypesSwitch>::check(element_type_);
     std::tie(num_blocks_, threads_per_block_) = calculateElementwiseGrid(size_, max_threads_per_block);
-}
-
-void Insert::operator()(const cudaStream_t stream, const void* src, void* dst, const size_t start) const {
     switch (element_type_) {
         case Type_t::boolean:
-            return call<bool>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&insert_part<bool>);
+            break;
 #ifdef CUDA_HAS_BF16_TYPE
         case Type_t::bf16:
-            return call<__nv_bfloat16>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&insert_part<__nv_bfloat16>);
+            break;
 #endif
         case Type_t::f16:
-            return call<__half>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&insert_part<__half>);
+            break;
         case Type_t::f32:
-            return call<float>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&insert_part<float>);
+            break;
         case Type_t::f64:
-            return call<double>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&insert_part<double>);
+            break;
         case Type_t::i8:
-            return call<int8_t>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&insert_part<int8_t>);
+            break;
         case Type_t::i16:
-            return call<int16_t>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&insert_part<int16_t>);
+            break;
         case Type_t::i32:
-            return call<int32_t>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&insert_part<int32_t>);
+            break;
         case Type_t::i64:
-            return call<int64_t>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&insert_part<int64_t>);
+            break;
         case Type_t::u8:
-            return call<uint8_t>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&insert_part<uint8_t>);
+            break;
         case Type_t::u16:
-            return call<uint16_t>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&insert_part<uint16_t>);
+            break;
         case Type_t::u32:
-            return call<uint32_t>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&insert_part<uint32_t>);
+            break;
         case Type_t::u64:
-            return call<uint64_t>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&insert_part<uint64_t>);
+            break;
         default:
             throw_ov_exception(fmt::format("Input element type = {} is not supported by Split operation !!",
                                            static_cast<Type_t>(element_type_)));
     }
+    params_.num_blocks = num_blocks_;
+    params_.threads_per_block = threads_per_block_;
+    params_.size = size_;
 }
 
-std::unique_ptr<Insert::Params> Insert::getParams(const void* src, void* dst, const size_t start) const {
-    auto res = std::make_unique<Params>();
-    Params& p = *res;
+void Insert::operator()(const cudaStream_t stream, const void* src, void* dst, const size_t start) const {
     switch (element_type_) {
         case Type_t::boolean:
-            // TODO: get rid of C-cast
-            p.kernel = (void*)&insert_part<bool>;
-            break;
+            return call<bool>(stream, src, dst, start);
 #ifdef CUDA_HAS_BF16_TYPE
         case Type_t::bf16:
-            p.kernel = (void*)&insert_part<__nv_bfloat16>;
-            break;
+            return call<__nv_bfloat16>(stream, src, dst, start);
 #endif
         case Type_t::f16:
-            p.kernel = (void*)&insert_part<__half>;
-            break;
+            return call<__half>(stream, src, dst, start);
         case Type_t::f32:
-            p.kernel = (void*)&insert_part<float>;
-            break;
+            return call<float>(stream, src, dst, start);
         case Type_t::f64:
-            p.kernel = (void*)&insert_part<double>;
-            break;
+            return call<double>(stream, src, dst, start);
         case Type_t::i8:
-            p.kernel = (void*)&insert_part<int8_t>;
-            break;
+            return call<int8_t>(stream, src, dst, start);
         case Type_t::i16:
-            p.kernel = (void*)&insert_part<int16_t>;
-            break;
+            return call<int16_t>(stream, src, dst, start);
         case Type_t::i32:
-            p.kernel = (void*)&insert_part<int32_t>;
-            break;
+            return call<int32_t>(stream, src, dst, start);
         case Type_t::i64:
-            p.kernel = (void*)&insert_part<int64_t>;
-            break;
+            return call<int64_t>(stream, src, dst, start);
         case Type_t::u8:
-            p.kernel = (void*)&insert_part<uint8_t>;
-            break;
+            return call<uint8_t>(stream, src, dst, start);
         case Type_t::u16:
-            p.kernel = (void*)&insert_part<uint16_t>;
-            break;
+            return call<uint16_t>(stream, src, dst, start);
        case Type_t::u32:
-            p.kernel = (void*)&insert_part<uint32_t>;
-            break;
+            return call<uint32_t>(stream, src, dst, start);
         case Type_t::u64:
-            p.kernel = (void*)&insert_part<uint64_t>;
-            break;
+            return call<uint64_t>(stream, src, dst, start);
         default:
             throw_ov_exception(fmt::format("Input element type = {} is not supported by Split operation !!",
                                            static_cast<Type_t>(element_type_)));
     }
-    p.num_blocks = num_blocks_;
-    p.threads_per_block = threads_per_block_;
-    p.props = static_cast<const Props*>(props_ptr_);
-    p.start = start;
-    p.size = size_;
-    p.x = src;
-    p.y = dst;
-    return res;
 }
 
 template <typename T>
 void Insert::call(const cudaStream_t stream, const void* src, void* dst, const size_t start) const {
     assertThrow(props_ptr_, "props_ptr_ == nullptr");
diff --git a/modules/nvidia_plugin/src/kernels/insert.hpp b/modules/nvidia_plugin/src/kernels/insert.hpp
index 9aab8e27b..bad2be40e 100644
--- a/modules/nvidia_plugin/src/kernels/insert.hpp
+++ b/modules/nvidia_plugin/src/kernels/insert.hpp
@@ -23,7 +23,7 @@ class Insert {
     };
 
     struct Params {
-        inline const cudaKernelNodeParams* getKnp() {
+        inline const cudaKernelNodeParams& getKnp() {
             knp_.func = kernel;
             knp_.gridDim = num_blocks;
             knp_.blockDim = threads_per_block;
@@ -35,7 +35,7 @@ class Insert {
             args_[4] = &y;
             knp_.kernelParams = &args_[0];
             knp_.extra = nullptr;
-            return &knp_;
+            return knp_;
         }
 
         // inline operator==(const Params& rhs) {
@@ -70,7 +70,15 @@ class Insert {
 
     void operator()(cudaStream_t stream, const void* src, void* dst, size_t start) const;
 
-    std::unique_ptr<Params> getParams(const void* src, void* dst, const size_t start) const;
+    // std::unique_ptr<Params> getParams(const void* src, void* dst, size_t start) const;
+    inline const cudaKernelNodeParams& getKnp(const void* src, void* dst, size_t start) const {
+        params_.start = start;
+        params_.x = src;
+        params_.y = dst;
+        return params_.getKnp();
+    }
+
     size_t getImmutableWorkbufferSize() const;
     void setImmutableWorkbuffer(void* immutableBuffer);
 
@@ -85,6 +93,7 @@ class Insert {
     size_t num_blocks_{};
     size_t threads_per_block_{};
     void* props_ptr_{};
+    mutable Params params_;
 };
 
 inline size_t Insert::getImmutableWorkbufferSize() const { return sizeof(props_); }
@@ -93,6 +102,7 @@ inline void Insert::setImmutableWorkbuffer(void* immutableBuffer) {
     kernel::throwIfError(
         cudaMemcpyAsync(immutableBuffer, static_cast<const void*>(&props_), sizeof(props_), cudaMemcpyHostToDevice));
     props_ptr_ = immutableBuffer;
+    params_.props = static_cast<const Props*>(props_ptr_);
 }
 
 }  // namespace kernel
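Reviewer note on the Params idiom above: cudaKernelNodeParams::kernelParams stores the *addresses* of the argument variables, not their values, so those variables must outlive every launch or exec-update that reads them. Keeping them as members of the op's long-lived mutable params_ is what makes returning a reference from getKnp() safe, and it is also why a naive copy would be wrong (the copy's knp_ would still point into the source object). A miniature version of the pattern, with names simplified and nothing taken verbatim from the patch:

    #include <cuda_runtime.h>
    #include <cstddef>

    struct MiniParams {
        const void* x{};
        void* y{};
        std::size_t start{};
        void* args[3]{};
        cudaKernelNodeParams knp{};

        const cudaKernelNodeParams& fill(void* kernel, dim3 grid, dim3 block) {
            args[0] = &x;      // addresses of members, not copies of values
            args[1] = &y;
            args[2] = &start;
            knp.func = kernel;
            knp.gridDim = grid;
            knp.blockDim = block;
            knp.sharedMemBytes = 0;
            knp.kernelParams = args;
            knp.extra = nullptr;
            return knp;        // valid for as long as *this lives
        }
    };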
diff --git a/modules/nvidia_plugin/src/kernels/slice.cu b/modules/nvidia_plugin/src/kernels/slice.cu
index 60852e621..f3ee60730 100644
--- a/modules/nvidia_plugin/src/kernels/slice.cu
+++ b/modules/nvidia_plugin/src/kernels/slice.cu
@@ -36,107 +36,102 @@ Slice::Slice(const Type_t element_type, const Props &props, const size_t max_threads_per_block)
     : element_type_{element_type}, props_{props}, size_{shape_size(props.new_shape)} {
     TypeValidator<AllElementTypesSwitch>::check(element_type_);
     std::tie(num_blocks_, threads_per_block_) = calculateElementwiseGrid(size_, max_threads_per_block);
-}
 
-void Slice::operator()(cudaStream_t stream, const void *src, void *dst, const size_t start) const {
     switch (element_type_) {
         case Type_t::boolean:
-            return call<bool>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&slice_part<bool>);
+            break;
 #ifdef CUDA_HAS_BF16_TYPE
         case Type_t::bf16:
-            return call<__nv_bfloat16>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&slice_part<__nv_bfloat16>);
+            break;
 #endif
         case Type_t::f16:
-            return call<__half>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&slice_part<__half>);
+            break;
         case Type_t::f32:
-            return call<float>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&slice_part<float>);
+            break;
         case Type_t::f64:
-            return call<double>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&slice_part<double>);
+            break;
         case Type_t::i8:
-            return call<int8_t>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&slice_part<int8_t>);
+            break;
         case Type_t::i16:
-            return call<int16_t>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&slice_part<int16_t>);
+            break;
         case Type_t::i32:
-            return call<int32_t>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&slice_part<int32_t>);
+            break;
         case Type_t::i64:
-            return call<int64_t>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&slice_part<int64_t>);
+            break;
         case Type_t::u8:
-            return call<uint8_t>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&slice_part<uint8_t>);
+            break;
         case Type_t::u16:
-            return call<uint16_t>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&slice_part<uint16_t>);
+            break;
         case Type_t::u32:
-            return call<uint32_t>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&slice_part<uint32_t>);
+            break;
         case Type_t::u64:
-            return call<uint64_t>(stream, src, dst, start);
+            params_.kernel = reinterpret_cast<void*>(&slice_part<uint64_t>);
+            break;
         default:
-            throw_ov_exception(
-                fmt::format("Input element type = {} is not supported by Slice operation "
-                            "!!",
-                            static_cast<Type_t>(element_type_)));
+            throw_ov_exception(fmt::format("Input element type = {} is not supported by Slice operation !!",
+                                           static_cast<Type_t>(element_type_)));
     }
+    params_.num_blocks = num_blocks_;
+    params_.threads_per_block = threads_per_block_;
+    params_.size = size_;
 }
 
-std::unique_ptr<Slice::Params> Slice::getParams(const void* src, void* dst, const size_t start) const {
-    auto res = std::make_unique<Params>();
-    Params& p = *res;
+void Slice::operator()(cudaStream_t stream, const void *src, void *dst, const size_t start) const {
     switch (element_type_) {
         case Type_t::boolean:
-            // TODO: get rid of C-cast
-            p.kernel = (void*)&slice_part<bool>;
-            break;
+            return call<bool>(stream, src, dst, start);
 #ifdef CUDA_HAS_BF16_TYPE
         case Type_t::bf16:
-            p.kernel = (void*)&slice_part<__nv_bfloat16>;
-            break;
+            return call<__nv_bfloat16>(stream, src, dst, start);
 #endif
         case Type_t::f16:
-            p.kernel = (void*)&slice_part<__half>;
-            break;
+            return call<__half>(stream, src, dst, start);
         case Type_t::f32:
-            p.kernel = (void*)&slice_part<float>;
-            break;
+            return call<float>(stream, src, dst, start);
         case Type_t::f64:
-            p.kernel = (void*)&slice_part<double>;
-            break;
+            return call<double>(stream, src, dst, start);
         case Type_t::i8:
-            p.kernel = (void*)&slice_part<int8_t>;
-            break;
+            return call<int8_t>(stream, src, dst, start);
         case Type_t::i16:
-            p.kernel = (void*)&slice_part<int16_t>;
-            break;
+            return call<int16_t>(stream, src, dst, start);
         case Type_t::i32:
-            p.kernel = (void*)&slice_part<int32_t>;
-            break;
+            return call<int32_t>(stream, src, dst, start);
         case Type_t::i64:
-            p.kernel = (void*)&slice_part<int64_t>;
-            break;
+            return call<int64_t>(stream, src, dst, start);
         case Type_t::u8:
-            p.kernel = (void*)&slice_part<uint8_t>;
-            break;
+            return call<uint8_t>(stream, src, dst, start);
         case Type_t::u16:
-            p.kernel = (void*)&slice_part<uint16_t>;
-            break;
+            return call<uint16_t>(stream, src, dst, start);
         case Type_t::u32:
-            p.kernel = (void*)&slice_part<uint32_t>;
-            break;
+            return call<uint32_t>(stream, src, dst, start);
         case Type_t::u64:
-            p.kernel = (void*)&slice_part<uint64_t>;
-            break;
+            return call<uint64_t>(stream, src, dst, start);
         default:
-            throw_ov_exception(fmt::format("Input element type = {} is not supported by Split operation !!",
-                                           static_cast<Type_t>(element_type_)));
+            throw_ov_exception(
+                fmt::format("Input element type = {} is not supported by Slice operation "
                            "!!",
+                            static_cast<Type_t>(element_type_)));
     }
-    p.num_blocks = num_blocks_;
-    p.threads_per_block = threads_per_block_;
-    p.props = static_cast<const Props*>(props_ptr_);
-    p.start = start;
-    p.size = size_;
-    p.x = src;
-    p.y = dst;
-    return res;
 }
 
 template <typename T>
 void Slice::call(cudaStream_t stream, const void *src, void *dst, size_t start) const {
     assertThrow(props_ptr_, "props_ptr_ == nullptr");
diff --git a/modules/nvidia_plugin/src/kernels/slice.hpp b/modules/nvidia_plugin/src/kernels/slice.hpp
index 1e8025b2a..2860ad817 100644
--- a/modules/nvidia_plugin/src/kernels/slice.hpp
+++ b/modules/nvidia_plugin/src/kernels/slice.hpp
@@ -21,7 +21,13 @@ class Slice {
     };
 
     struct Params {
+        Params() = default;
+        Params(Params&&) = default;
+        Params& operator=(Params&&) = default;
+        Params(const Params&) = delete;
+        Params& operator=(const Params&) = delete;
+
-        inline const cudaKernelNodeParams* getKnp() {
+        inline const cudaKernelNodeParams& getKnp() {
             knp_.func = kernel;
             knp_.gridDim = num_blocks;
             knp_.blockDim = threads_per_block;
@@ -33,7 +39,7 @@ class Slice {
             args_[4] = &y;
             knp_.kernelParams = &args_[0];
             knp_.extra = nullptr;
-            return &knp_;
+            return knp_;
         }
 
         // inline operator==(const Params& rhs) {
@@ -68,7 +74,13 @@ class Slice {
 
     void operator()(cudaStream_t stream, const void* src, void* dst, size_t start) const;
 
-    std::unique_ptr<Params> getParams(const void* src, void* dst, const size_t start) const;
+    // std::unique_ptr<Params> getParams(const void* src, void* dst, size_t start) const;
+    inline const cudaKernelNodeParams& getKnp(const void* src, void* dst, const size_t start) const {
+        params_.x = src;
+        params_.y = dst;
+        params_.start = start;
+        return params_.getKnp();
+    }
 
     size_t getImmutableWorkbufferSize() const;
     void setImmutableWorkbuffer(void* immutableBuffer);
 
@@ -83,6 +95,7 @@ class Slice {
     unsigned num_blocks_{};
     unsigned threads_per_block_{};
     void* props_ptr_{};
+    mutable Params params_;
 };
 
 inline size_t Slice::getImmutableWorkbufferSize() const { return sizeof(props_); }
@@ -91,6 +104,7 @@ inline void Slice::setImmutableWorkbuffer(void* immutableBuffer) {
     kernel::throwIfError(
         cudaMemcpyAsync(immutableBuffer, static_cast<const void*>(&props_), sizeof(props_), cudaMemcpyHostToDevice));
     props_ptr_ = immutableBuffer;
+    params_.props = static_cast<const Props*>(props_ptr_);
 }
 
 }  // namespace kernel
diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp
index 2f388aa1e..5b7eb8299 100644
--- a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp
+++ b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp
@@ -343,8 +343,8 @@ void TensorIteratorOp::ExecuteGraph(const InferenceRequestContext& context,
     auto& graphContext = context.getCudaGraphContext();
     const auto& opName = GetName();
 
-    OPENVINO_ASSERT(graphContext.get_slices_count(opName) == slices_.size() &&
-                        graphContext.get_inserts_count(opName) == inserts_.size(),
+    // OPENVINO_ASSERT(graphContext.get_slices_count(opName) == slices_.size() &&
+    OPENVINO_ASSERT(graphContext.get_kernels_count(opName) == slices_.size() + inserts_.size(),
                     "CudaGraphContext/TensorIteratorOp slices or inserts count inconsistency");
 
     for (int64_t iter = 0; iter < num_iterations_; ++iter) {
@@ -357,10 +357,12 @@ void TensorIteratorOp::ExecuteGraph(const InferenceRequestContext& context,
         // graph_exec_->launch(stream);
         for (std::size_t i = 0; i < slices_.size(); ++i) {
-            graphContext.update_slice(opName, i, slices_[i].get_params(stream, mutableBuffer, inputTensors, iter));
+            // graphContext.update_slice(opName, i, slices_[i].get_params(stream, mutableBuffer, inputTensors, iter));
+            graphContext.update_kernel(opName, i, slices_[i].get_knp(stream, mutableBuffer, inputTensors, iter));
         }
         for (std::size_t i = 0; i < inserts_.size(); ++i) {
-            graphContext.update_insert(opName, i, inserts_[i].get_params(stream, mutableBuffer, outputTensors, iter));
+            // graphContext.update_insert(opName, i, inserts_[i].get_params(stream, mutableBuffer, outputTensors, iter));
+            graphContext.update_kernel(
+                opName, i + slices_.size(), inserts_[i].get_knp(stream, mutableBuffer, outputTensors, iter));
         }
         graphContext.launch_ti_graph(opName, stream);
     }
@@ -445,7 +447,8 @@ void TensorIteratorOp::Capture(InferenceRequestContext& context,
     for (auto& slice : slices_) {
         // slice.capture();
         // slice.capture(stream);
-        graphContext.add_slice(opName, stream, slice.get_params(stream, mutableBuffer, inputTensors, 0));
+        // graphContext.add_slice(opName, stream, slice.get_params(stream, mutableBuffer, inputTensors, 0));
+        graphContext.add_kernel(opName, stream, slice.get_knp(stream, mutableBuffer, inputTensors, 0));
     }
 
     // Inner loop
@@ -466,7 +469,8 @@ void TensorIteratorOp::Capture(InferenceRequestContext& context,
     for (auto& insert : inserts_) {
         // insert.capture();
         // insert.capture(stream);
-        graphContext.add_insert(opName, stream, insert.get_params(stream, mutableBuffer, outputTensors, 0));
+        // graphContext.add_insert(opName, stream, insert.get_params(stream, mutableBuffer, outputTensors, 0));
+        graphContext.add_kernel(opName, stream, insert.get_knp(stream, mutableBuffer, outputTensors, 0));
     }
     // throwIfError(cudaStreamEndCapture(stream.get(), &cudaGraph));
 }
diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp
index b06610002..35fed4575 100644
--- a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp
+++ b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp
@@ -74,14 +74,16 @@ class TensorIteratorOp : public SubGraph {
             slice_(stream.get(), src, dst, start_ + iter * stride_);
         }
 
-        inline std::unique_ptr<kernel::Slice::Params> get_params(const CUDA::Stream& stream,
-                                                                 CUDA::DevicePointer<void*> mutableBuffer,
-                                                                 const IOperationExec::Inputs& inputTensors,
-                                                                 int64_t iter) {
+        // inline std::unique_ptr<kernel::Slice::Params> get_params(const CUDA::Stream& stream,
+        inline const cudaKernelNodeParams& get_knp(const CUDA::Stream& stream,
+                                                   CUDA::DevicePointer<void*> mutableBuffer,
+                                                   const IOperationExec::Inputs& inputTensors,
+                                                   int64_t iter) {
             // slice_node_.emplace(CUDA::CaptureInfo{stream}.addSliceNode(slice_.getParams(src_, dst_, start_)));
             const auto* src = inputTensors[input_idx_].get();
             auto* dst = memory_manager_.outputTensorPointers(param_, mutableBuffer)[0].get();
-            return slice_.getParams(src, dst, start_ + iter * stride_);
+            // return slice_.getParams(src, dst, start_ + iter * stride_);
+            return slice_.getKnp(src, dst, start_ + iter * stride_);
         }
 
         // inline void update_capture(const CUDA::GraphExec& exec, int64_t iter) {
@@ -174,15 +176,17 @@ class TensorIteratorOp : public SubGraph {
             insert_(stream.get(), src, dst, start_ + iter * stride_);
         }
 
-        inline std::unique_ptr<kernel::Insert::Params> get_params(const CUDA::Stream& stream,
-                                                                  CUDA::DevicePointer<void*> mutableBuffer,
-                                                                  const IOperationExec::Outputs& outputTensors,
-                                                                  int64_t iter) {
+        // inline std::unique_ptr<kernel::Insert::Params> get_params(const CUDA::Stream& stream,
+        inline const cudaKernelNodeParams& get_knp(const CUDA::Stream& stream,
+                                                   CUDA::DevicePointer<void*> mutableBuffer,
+                                                   const IOperationExec::Outputs& outputTensors,
+                                                   int64_t iter) {
             // insert_node_.emplace(CUDA::CaptureInfo{stream}.addInsertNode(insert_.getParams(src_, dst_, start_)));
             const auto* src = memory_manager_.inputTensorPointers(result_, mutableBuffer)[0].get();
             auto* dst = outputTensors[output_idx_].get();
-            return insert_.getParams(src, dst, start_ + iter * stride_);
+            // return insert_.getParams(src, dst, start_ + iter * stride_);
+            return insert_.getKnp(src, dst, start_ + iter * stride_);
         }
 
         // inline void update_capture(const CUDA::GraphExec& exec, int64_t iter) {
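Reviewer note: a worked example of the new unified indexing in ExecuteGraph, spelled out for a hypothetical TensorIterator with two slices and one insert (values chosen for illustration only):

    // Capture order: slices_[0], slices_[1], inserts_[0]
    // => get_kernels_count(opName) == 3, and each iteration issues:
    //   update_kernel(opName, 0, slices_[0].get_knp(...));
    //   update_kernel(opName, 1, slices_[1].get_knp(...));
    //   update_kernel(opName, 2, inserts_[0].get_knp(...));  // 2 == i + slices_.size()
    // followed by launch_ti_graph(opName, stream);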