From d04f06c09e5c24d0a842cd9aca2932aed25fd373 Mon Sep 17 00:00:00 2001
From: Marcel Koch
Date: Mon, 24 Jun 2024 10:16:23 +0200
Subject: [PATCH] [batch] split cg compilation (cuda)

---
 cuda/CMakeLists.txt                         |   3 +-
 cuda/solver/batch_cg_kernels.cu             | 239 --------------------
 cuda/solver/batch_cg_kernels.cuh            | 220 ++++++++++++++++++
 cuda/solver/batch_cg_kernels.instantiate.cu | 159 +++++++++++++
 4 files changed, 381 insertions(+), 240 deletions(-)
 delete mode 100644 cuda/solver/batch_cg_kernels.cu
 create mode 100644 cuda/solver/batch_cg_kernels.cuh
 create mode 100644 cuda/solver/batch_cg_kernels.instantiate.cu

diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 1c2c6ee64e4..6077731173b 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -4,6 +4,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 add_instantiation_files(. matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE)
 add_instantiation_files(. solver/batch_bicgstab_kernels.instantiate.cu BATCH_BICGSTAB_INSTANTIATE)
+add_instantiation_files(. solver/batch_cg_kernels.instantiate.cu BATCH_CG_INSTANTIATE)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 target_sources(ginkgo_cuda
@@ -39,7 +40,7 @@ target_sources(ginkgo_cuda
         preconditioner/jacobi_generate_kernels.cu
         preconditioner/jacobi_simple_apply_kernels.cu
         ${BATCH_BICGSTAB_INSTANTIATE}
-        solver/batch_cg_kernels.cu
+        ${BATCH_CG_INSTANTIATE}
         solver/lower_trs_kernels.cu
         solver/upper_trs_kernels.cu
         ${GKO_UNIFIED_COMMON_SOURCES}
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
deleted file mode 100644
index 746be0365e7..00000000000
--- a/cuda/solver/batch_cg_kernels.cu
+++ /dev/null
@@ -1,239 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/solver/batch_cg_kernels.hpp"
-
-#include
-#include
-
-#include
-#include
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "core/solver/batch_dispatch.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-// NOTE: this default block size is not used for the main solver kernel.
-constexpr int default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-
-/**
- * @brief The batch Cg solver namespace.
- *
- * @ingroup batch_cg
- */
-namespace batch_cg {
-
-
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
-#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc"
-
-
-template
-int get_num_threads_per_block(std::shared_ptr exec,
-                              const int num_rows)
-{
-    int num_warps = std::max(num_rows / 4, 2);
-    constexpr int warp_sz = static_cast(config::warp_size);
-    const int min_block_size = 2 * warp_sz;
-    const int device_max_threads =
-        (std::max(num_rows, min_block_size) / warp_sz) * warp_sz;
-    cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(&funcattr,
-                          apply_kernel);
-    const int num_regs_used = funcattr.numRegs;
-    int max_regs_blk = 0;
-    cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
-                           exec->get_device_id());
-    const int max_threads_regs =
-        ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz;
-    int max_threads = std::min(max_threads_regs, device_max_threads);
-    max_threads = max_threads <= 1024 ? max_threads : 1024;
-    return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size);
-}
-
-
-template
-int get_max_dynamic_shared_memory(std::shared_ptr exec)
-{
-    int shmem_per_sm = 0;
-    cudaDeviceGetAttribute(&shmem_per_sm,
-                           cudaDevAttrMaxSharedMemoryPerMultiprocessor,
-                           exec->get_device_id());
-    GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
-        apply_kernel,
-        cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
-    cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(&funcattr,
-                          apply_kernel);
-    return funcattr.maxDynamicSharedSizeBytes;
-}
-
-
-template
-using settings = gko::kernels::batch_cg::settings;
-
-
-template
-class kernel_caller {
-public:
-    using value_type = CuValueType;
-
-    kernel_caller(std::shared_ptr exec,
-                  const settings> settings)
-        : exec_{std::move(exec)}, settings_{settings}
-    {}
-
-    template
-    void launch_apply_kernel(
-        const gko::kernels::batch_cg::storage_config& sconf, LogType& logger,
-        PrecType& prec, const BatchMatrixType& mat,
-        const value_type* const __restrict__ b_values,
-        value_type* const __restrict__ x_values,
-        value_type* const __restrict__ workspace_data, const int& block_size,
-        const size_t& shared_size) const
-    {
-        apply_kernel
-            <<get_stream()>>>(sconf, settings_.max_iterations,
-                              settings_.residual_tol, logger, prec, mat,
-                              b_values, x_values, workspace_data);
-    }
-
-    template
-    void call_kernel(
-        LogType logger, const BatchMatrixType& mat, PrecType prec,
-        const gko::batch::multi_vector::uniform_batch& b,
-        const gko::batch::multi_vector::uniform_batch& x) const
-    {
-        using real_type = gko::remove_complex;
-        const size_type num_batch_items = mat.num_batch_items;
-        constexpr int align_multiple = 8;
-        const int padded_num_rows =
-            ceildiv(mat.num_rows, align_multiple) * align_multiple;
-        const int shmem_per_blk =
-            get_max_dynamic_shared_memory(exec_);
-        const int block_size =
-            get_num_threads_per_block(
-                exec_, mat.num_rows);
-        GKO_ASSERT(block_size >= 2 * config::warp_size);
-
-        const size_t prec_size = PrecType::dynamic_work_size(
-            padded_num_rows, mat.get_single_item_num_nnz());
-        const auto sconf =
-            gko::kernels::batch_cg::compute_shared_storage(
-                shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
-                b.num_rhs);
-        const size_t shared_size =
-            sconf.n_shared * padded_num_rows * sizeof(value_type) +
-            (sconf.prec_shared ? prec_size : 0);
-        auto workspace = gko::array(
-            exec_,
-            sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
-        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0);
-
-        value_type* const workspace_data = workspace.get_data();
-
-        // Template parameters launch_apply_kernel
-        if (sconf.prec_shared) {
-            launch_apply_kernel(
-                sconf, logger, prec, mat, b.values, x.values, workspace_data,
-                block_size, shared_size);
-        } else {
-            switch (sconf.n_shared) {
-            case 0:
-                launch_apply_kernel(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 1:
-                launch_apply_kernel(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 2:
-                launch_apply_kernel(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 3:
-                launch_apply_kernel(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 4:
-                launch_apply_kernel(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            case 5:
-                launch_apply_kernel(
-                    sconf, logger, prec, mat, b.values, x.values,
-                    workspace_data, block_size, shared_size);
-                break;
-            default:
-                GKO_NOT_IMPLEMENTED;
-            }
-        }
-    }
-
-private:
-    std::shared_ptr exec_;
-    const settings> settings_;
-};
-
-
-template
-void apply(std::shared_ptr exec,
-           const settings>& settings,
-           const batch::BatchLinOp* const mat,
-           const batch::BatchLinOp* const precon,
-           const batch::MultiVector* const b,
-           batch::MultiVector* const x,
-           batch::log::detail::log_data>& logdata)
-{
-    using cu_value_type = cuda_type;
-    auto dispatcher = batch::solver::create_dispatcher(
-        kernel_caller(exec, settings), settings, mat, precon);
-    dispatcher.apply(b, x, logdata);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_APPLY_KERNEL);
-
-
-}  // namespace batch_cg
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/solver/batch_cg_kernels.cuh b/cuda/solver/batch_cg_kernels.cuh
new file mode 100644
index 00000000000..425ff080e3e
--- /dev/null
+++ b/cuda/solver/batch_cg_kernels.cuh
@@ -0,0 +1,220 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include
+#include
+
+#include
+#include
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_dispatch.hpp"
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+/**
+ * @brief The batch Cg solver namespace.
+ *
+ * @ingroup batch_cg
+ */
+namespace batch_cg {
+
+
+template
+int get_num_threads_per_block(std::shared_ptr exec,
+                              const int num_rows);
+
+#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_(_vtype, mat_t, log_t, \
+    pre_t, stop_t) \
+    int get_num_threads_per_block>, pre_t<_vtype>, \
+        log_t<_vtype>, mat_t<_vtype>, _vtype>( \
+        std::shared_ptr exec, const int num_rows)
+
+#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_, \
+                          _vtype)
+
+
+template
+int get_max_dynamic_shared_memory(std::shared_ptr exec);
+
+#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_( \
+    _vtype, mat_t, log_t, pre_t, stop_t) \
+    int get_max_dynamic_shared_memory( \
+        std::shared_ptr exec)
+
+#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_, \
+                          _vtype)
+
+
+template
+using settings = gko::kernels::batch_cg::settings;
+
+
+template
+void launch_apply_kernel(
+    std::shared_ptr exec,
+    const gko::kernels::batch_cg::storage_config& sconf,
+    const settings>& settings, LogType& logger,
+    PrecType& prec, const BatchMatrixType& mat,
+    const cuda_type* const __restrict__ b_values,
+    cuda_type* const __restrict__ x_values,
+    cuda_type* const __restrict__ workspace_data,
+    const int& block_size, const size_t& shared_size);
+
+#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \
+    log_t, pre_t, stop_t) \
+    void launch_apply_kernel<_vtype, _n_shared, _prec_shared, \
+        stop_t>>( \
+        std::shared_ptr exec, \
+        const gko::kernels::batch_cg::storage_config& sconf, \
+        const settings>& settings, \
+        log_t>>& logger, \
+        pre_t>& prec, \
+        const mat_t>& mat, \
+        const cuda_type<_vtype>* const __restrict__ b_values, \
+        cuda_type<_vtype>* const __restrict__ x_values, \
+        cuda_type<_vtype>* const __restrict__ workspace_data, \
+        const int& block_size, const size_t& shared_size)
+
+#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false)
+#define GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \
+    GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true)
+
+
+template
+class kernel_caller {
+public:
+    using cuda_value_type = cuda_type;
+
+    kernel_caller(std::shared_ptr exec,
+                  const settings> settings)
+        : exec_{std::move(exec)}, settings_{settings}
+    {}
+
+    template
+    void call_kernel(
+        LogType logger, const BatchMatrixType& mat, PrecType prec,
+        const gko::batch::multi_vector::uniform_batch& b,
+        const gko::batch::multi_vector::uniform_batch& x) const
+    {
+        using real_type = gko::remove_complex;
+        const size_type num_batch_items = mat.num_batch_items;
+        constexpr int align_multiple = 8;
+        const int padded_num_rows =
+            ceildiv(mat.num_rows, align_multiple) * align_multiple;
+        auto shem_guard =
+            gko::kernels::cuda::detail::shared_memory_config_guard<
+                cuda_value_type>();
+        const int shmem_per_blk =
+            get_max_dynamic_shared_memory(
+                exec_);
+        const int block_size =
+            get_num_threads_per_block(
+                exec_, mat.num_rows);
+        GKO_ASSERT(block_size >= 2 * config::warp_size);
+
+        const size_t prec_size = PrecType::dynamic_work_size(
+            padded_num_rows, mat.get_single_item_num_nnz());
+        const auto sconf =
+            gko::kernels::batch_cg::compute_shared_storage(
+                shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
+                b.num_rhs);
+        const size_t shared_size =
+            sconf.n_shared * padded_num_rows * sizeof(cuda_value_type) +
+            (sconf.prec_shared ? prec_size : 0);
+        auto workspace = gko::array(
+            exec_, sconf.gmem_stride_bytes * num_batch_items /
+                       sizeof(cuda_value_type));
+        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(cuda_value_type) == 0);
+
+        cuda_value_type* const workspace_data = workspace.get_data();
+
+        // Template parameters launch_apply_kernel
+        if (sconf.prec_shared) {
+            launch_apply_kernel(
+                exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
+                workspace_data, block_size, shared_size);
+        } else {
+            switch (sconf.n_shared) {
+            case 0:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 5:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            default:
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
+    }
+
+private:
+    std::shared_ptr exec_;
+    const settings> settings_;
+};
+
+
+}  // namespace batch_cg
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/solver/batch_cg_kernels.instantiate.cu b/cuda/solver/batch_cg_kernels.instantiate.cu
new file mode 100644
index 00000000000..e2f13bb32aa
--- /dev/null
+++ b/cuda/solver/batch_cg_kernels.instantiate.cu
@@ -0,0 +1,159 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "cuda/solver/batch_cg_kernels.cuh"
+
+#include
+#include
+
+#include
+#include
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_dispatch.hpp"
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+// NOTE: this default block size is not used for the main solver kernel.
+constexpr int default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+
+/**
+ * @brief The batch Cg solver namespace.
+ *
+ * @ingroup batch_cg
+ */
+namespace batch_cg {
+
+
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc"
+
+
+template
+int get_num_threads_per_block(std::shared_ptr exec,
+                              const int num_rows)
+{
+    int num_warps = std::max(num_rows / 4, 2);
+    constexpr int warp_sz = static_cast(config::warp_size);
+    const int min_block_size = 2 * warp_sz;
+    const int device_max_threads =
+        (std::max(num_rows, min_block_size) / warp_sz) * warp_sz;
+    cudaFuncAttributes funcattr;
+    cudaFuncGetAttributes(&funcattr,
+                          apply_kernel);
+    const int num_regs_used = funcattr.numRegs;
+    int max_regs_blk = 0;
+    cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
+                           exec->get_device_id());
+    const int max_threads_regs =
+        ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz;
+    int max_threads = std::min(max_threads_regs, device_max_threads);
+    max_threads = max_threads <= 1024 ? max_threads : 1024;
+    return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size);
+}
+
+
+template
+int get_max_dynamic_shared_memory(std::shared_ptr exec)
+{
+    int shmem_per_sm = 0;
+    cudaDeviceGetAttribute(&shmem_per_sm,
+                           cudaDevAttrMaxSharedMemoryPerMultiprocessor,
+                           exec->get_device_id());
+    GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
+        apply_kernel,
+        cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
+    cudaFuncAttributes funcattr;
+    cudaFuncGetAttributes(&funcattr,
+                          apply_kernel);
+    return funcattr.maxDynamicSharedSizeBytes;
+}
+
+
+template
+void apply(std::shared_ptr exec,
+           const settings>& settings,
+           const batch::BatchLinOp* const mat,
+           const batch::BatchLinOp* const precon,
+           const batch::MultiVector* const b,
+           batch::MultiVector* const x,
+           batch::log::detail::log_data>& logdata)
+{
+    auto dispatcher = batch::solver::create_dispatcher(
+        kernel_caller(exec, settings), settings, mat, precon);
+    dispatcher.apply(b, x, logdata);
+}
+
+
+template
+void launch_apply_kernel(
+    std::shared_ptr exec,
+    const gko::kernels::batch_cg::storage_config& sconf,
+    const settings>& settings, LogType& logger,
+    PrecType& prec, const BatchMatrixType& mat,
+    const cuda_type* const __restrict__ b_values,
+    cuda_type* const __restrict__ x_values,
+    cuda_type* const __restrict__ workspace_data,
+    const int& block_size, const size_t& shared_size)
+{
+    apply_kernel
+        <<get_stream()>>>(
+            sconf, settings.max_iterations, as_cuda_type(settings.residual_tol),
+            logger, prec, mat, b_values, x_values, workspace_data);
+}
+
+
+// begin
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_APPLY_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE);
+// end
+
+
+}  // namespace batch_cg
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko