diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 3ab1194e8ca..338be3ea288 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -3,6 +3,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_INSTANTIATE) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) add_instantiation_files(. solver/batch_bicgstab_kernels.instantiate.hip.cpp BATCH_BICGSTAB_INSTANTIATE) +add_instantiation_files(. solver/batch_cg_kernels.instantiate.hip.cpp BATCH_CG_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES @@ -36,7 +37,7 @@ set(GINKGO_HIP_SOURCES preconditioner/jacobi_generate_kernels.hip.cpp preconditioner/jacobi_simple_apply_kernels.hip.cpp ${BATCH_BICGSTAB_INSTANTIATE} - solver/batch_cg_kernels.hip.cpp + ${BATCH_CG_INSTANTIATE} solver/lower_trs_kernels.hip.cpp solver/upper_trs_kernels.hip.cpp ${GKO_UNIFIED_COMMON_SOURCES} diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp deleted file mode 100644 index 6102749b988..00000000000 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ /dev/null @@ -1,223 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/solver/batch_cg_kernels.hpp" - -#include -#include - -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "core/solver/batch_dispatch.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -constexpr int default_block_size = 256; -constexpr int sm_oversubscription = 4; - -/** - * @brief The batch Cg solver namespace. - * - * @ingroup batch_cg - */ -namespace batch_cg { - - -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" -#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc" - - -template -int get_num_threads_per_block(std::shared_ptr exec, - const int num_rows) -{ - int num_warps = std::max(num_rows / 4, 2); - constexpr int warp_sz = static_cast(config::warp_size); - const int min_block_size = 2 * warp_sz; - const int device_max_threads = - ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz; - // This value has been taken from ROCm docs. This is the number of registers - // that maximizes the occupancy on an AMD GPU (MI200). HIP does not have an - // API to query the number of registers a function uses. - const int num_regs_used_per_thread = 64; - int max_regs_blk = 0; - GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute( - &max_regs_blk, hipDeviceAttributeMaxRegistersPerBlock, - exec->get_device_id())); - int max_threads_regs = (max_regs_blk / num_regs_used_per_thread); - max_threads_regs = (max_threads_regs / warp_sz) * warp_sz; - int max_threads = std::min(max_threads_regs, device_max_threads); - max_threads = max_threads <= 1024 ? max_threads : 1024; - return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); -} - - -template -using settings = gko::kernels::batch_cg::settings; - - -template -class kernel_caller { -public: - using value_type = HipValueType; - - kernel_caller(std::shared_ptr exec, - const settings> settings) - : exec_{exec}, settings_{settings} - {} - - template - void launch_apply_kernel( - const gko::kernels::batch_cg::storage_config& sconf, LogType& logger, - PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) const - { - apply_kernel - <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); - } - - - template - void call_kernel( - LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const - { - using real_type = gko::remove_complex; - const size_type num_batch_items = mat.num_batch_items; - constexpr int align_multiple = 8; - const int padded_num_rows = - ceildiv(mat.num_rows, align_multiple) * align_multiple; - int shmem_per_blk = 0; - GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute( - &shmem_per_blk, hipDeviceAttributeMaxSharedMemoryPerBlock, - exec_->get_device_id())); - const int block_size = - get_num_threads_per_block(exec_, mat.num_rows); - bool is_block_size_aligned = block_size % config::warp_size == 0; - GKO_ASSERT(block_size >= 2 * config::warp_size); - GKO_ASSERT(is_block_size_aligned); - - // Returns amount required in bytes - const size_t prec_size = PrecType::dynamic_work_size( - padded_num_rows, mat.get_single_item_num_nnz()); - const auto sconf = - gko::kernels::batch_cg::compute_shared_storage( - shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), - b.num_rhs); - const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + - (sconf.prec_shared ? prec_size : 0); - auto workspace = gko::array( - exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - bool is_stride_aligned = - sconf.gmem_stride_bytes % sizeof(value_type) == 0; - GKO_ASSERT(is_stride_aligned); - - value_type* const workspace_data = workspace.get_data(); - - // Template parameters launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - } else { - switch (sconf.n_shared) { - case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 1: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 2: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 3: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 4: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 5: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - default: - GKO_NOT_IMPLEMENTED; - } - } - } - -private: - std::shared_ptr exec_; - const settings> settings_; -}; - - -template -void apply(std::shared_ptr exec, - const settings>& settings, - const batch::BatchLinOp* const mat, - const batch::BatchLinOp* const precon, - const batch::MultiVector* const b, - batch::MultiVector* const x, - batch::log::detail::log_data>& logdata) -{ - using hip_value_type = hip_type; - auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); - dispatcher.apply(b, x, logdata); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_APPLY_KERNEL); - - -} // namespace batch_cg -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/solver/batch_cg_kernels.hip.hpp b/hip/solver/batch_cg_kernels.hip.hpp new file mode 100644 index 00000000000..7274256f89d --- /dev/null +++ b/hip/solver/batch_cg_kernels.hip.hpp @@ -0,0 +1,214 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include +#include + +#include +#include + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_cg_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" +#include "hip/base/batch_struct.hip.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/matrix/batch_struct.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +/** + * @brief The batch Cg solver namespace. + * + * @ingroup batch_cg + */ +namespace batch_cg { + + +template +int get_num_threads_per_block(std::shared_ptr exec, + const int num_rows) +{ + int num_warps = std::max(num_rows / 4, 2); + constexpr int warp_sz = static_cast(config::warp_size); + const int min_block_size = 2 * warp_sz; + const int device_max_threads = + ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz; + // This value has been taken from ROCm docs. This is the number of registers + // that maximizes the occupancy on an AMD GPU (MI200). HIP does not have an + // API to query the number of registers a function uses. + const int num_regs_used_per_thread = 64; + int max_regs_blk = 0; + GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute( + &max_regs_blk, hipDeviceAttributeMaxRegistersPerBlock, + exec->get_device_id())); + int max_threads_regs = (max_regs_blk / num_regs_used_per_thread); + max_threads_regs = (max_threads_regs / warp_sz) * warp_sz; + int max_threads = std::min(max_threads_regs, device_max_threads); + max_threads = max_threads <= 1024 ? max_threads : 1024; + return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); +} + + +template +using settings = gko::kernels::batch_cg::settings; + +template +void launch_apply_kernel(std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, + LogType& logger, PrecType& prec, + const BatchMatrixType& mat, + const hip_type* const __restrict__ b_values, + hip_type* const __restrict__ x_values, + hip_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size); + +#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \ + log_t, pre_t, stop_t) \ + void launch_apply_kernel<_vtype, _n_shared, _prec_shared, \ + stop_t>>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_cg::storage_config& sconf, \ + const settings>& settings, \ + log_t>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const hip_type<_vtype>* const __restrict__ b_values, \ + hip_type<_vtype>* const __restrict__ x_values, \ + hip_type<_vtype>* const __restrict__ workspace_data, \ + const int& block_size, const size_t& shared_size) + +#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true) + + +template +class kernel_caller { +public: + using hip_value_type = hip_type; + + kernel_caller(std::shared_ptr exec, + const settings> settings) + : exec_{exec}, settings_{settings} + {} + + template + void call_kernel( + LogType logger, const BatchMatrixType& mat, PrecType prec, + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const + { + using real_type = gko::remove_complex; + const size_type num_batch_items = mat.num_batch_items; + constexpr int align_multiple = 8; + const int padded_num_rows = + ceildiv(mat.num_rows, align_multiple) * align_multiple; + int shmem_per_blk = 0; + GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute( + &shmem_per_blk, hipDeviceAttributeMaxSharedMemoryPerBlock, + exec_->get_device_id())); + const int block_size = + get_num_threads_per_block(exec_, mat.num_rows); + GKO_ASSERT(block_size >= 2 * config::warp_size); + GKO_ASSERT(block_size % config::warp_size == 0); + + // Returns amount required in bytes + const size_t prec_size = PrecType::dynamic_work_size( + padded_num_rows, mat.get_single_item_num_nnz()); + const auto sconf = + gko::kernels::batch_cg::compute_shared_storage( + shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), + b.num_rhs); + const size_t shared_size = + sconf.n_shared * padded_num_rows * sizeof(hip_value_type) + + (sconf.prec_shared ? prec_size : 0); + auto workspace = gko::array( + exec_, + sconf.gmem_stride_bytes * num_batch_items / sizeof(hip_value_type)); + GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(hip_value_type) == 0); + + hip_value_type* const workspace_data = workspace.get_data(); + + // Template parameters launch_apply_kernel + if (sconf.prec_shared) { + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + } else { + switch (sconf.n_shared) { + case 0: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 1: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 2: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 3: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 4: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 5: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + default: + GKO_NOT_IMPLEMENTED; + } + } + } + +private: + std::shared_ptr exec_; + const settings> settings_; +}; + + +} // namespace batch_cg +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/solver/batch_cg_kernels.instantiate.hip.cpp b/hip/solver/batch_cg_kernels.instantiate.hip.cpp new file mode 100644 index 00000000000..f22596c3cda --- /dev/null +++ b/hip/solver/batch_cg_kernels.instantiate.hip.cpp @@ -0,0 +1,108 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include +#include + +#include +#include + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_dispatch.hpp" +#include "hip/base/batch_struct.hip.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/matrix/batch_struct.hip.hpp" +#include "hip/solver/batch_cg_kernels.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +constexpr int default_block_size = 256; +constexpr int sm_oversubscription = 4; + +/** + * @brief The batch Cg solver namespace. + * + * @ingroup batch_cg + */ +namespace batch_cg { + + +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" +#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc" + + +template +void apply(std::shared_ptr exec, + const settings>& settings, + const batch::BatchLinOp* const mat, + const batch::BatchLinOp* const precon, + const batch::MultiVector* const b, + batch::MultiVector* const x, + batch::log::detail::log_data>& logdata) +{ + using hip_value_type = hip_type; + auto dispatcher = batch::solver::create_dispatcher( + kernel_caller(exec, settings), settings, mat, precon); + dispatcher.apply(b, x, logdata); +} + + +template +void launch_apply_kernel(std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, + LogType& logger, PrecType& prec, + const BatchMatrixType& mat, + const hip_type* const __restrict__ b_values, + hip_type* const __restrict__ x_values, + hip_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size) +{ + apply_kernel + <<get_stream()>>>( + sconf, settings.max_iterations, as_hip_type(settings.residual_tol), + logger, prec, mat, b_values, x_values, workspace_data); +} + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_APPLY_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE); +// end + + +} // namespace batch_cg +} // namespace hip +} // namespace kernels +} // namespace gko