From 67aa5924f1073038ef2817402e7a98b6e5b3951c Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Jun 2024 16:43:11 +0200 Subject: [PATCH 01/15] [batch] provide default index type for matrix device types --- core/matrix/batch_struct.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp index a3604fd9b99..593f9c99781 100644 --- a/core/matrix/batch_struct.hpp +++ b/core/matrix/batch_struct.hpp @@ -22,7 +22,7 @@ namespace csr { /** * Encapsulates one matrix from a batch of csr matrices. */ -template +template struct batch_item { using value_type = ValueType; using index_type = IndexType; @@ -44,7 +44,7 @@ struct batch_item { /** * A 'simple' structure to store a global uniform batch of csr matrices. */ -template +template struct uniform_batch { using value_type = ValueType; using index_type = IndexType; @@ -119,7 +119,7 @@ namespace ell { /** * Encapsulates one matrix from a batch of ell matrices. */ -template +template struct batch_item { using value_type = ValueType; using index_type = IndexType; @@ -141,7 +141,7 @@ struct batch_item { /** * A 'simple' structure to store a global uniform batch of ell matrices. */ -template +template struct uniform_batch { using value_type = ValueType; using index_type = IndexType; From 381bcf9a3d4ae22d4a6aa681be6b26372475b2e9 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Jun 2024 16:43:44 +0200 Subject: [PATCH 02/15] [batch] handle constness of index type same as value type --- core/matrix/batch_struct.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp index 593f9c99781..13543ccb624 100644 --- a/core/matrix/batch_struct.hpp +++ b/core/matrix/batch_struct.hpp @@ -28,8 +28,8 @@ struct batch_item { using index_type = IndexType; ValueType* values; - const index_type* col_idxs; - const index_type* row_ptrs; + index_type* col_idxs; + index_type* row_ptrs; index_type num_rows; index_type num_cols; index_type num_nnz_per_item; @@ -51,8 +51,8 @@ struct uniform_batch { using entry_type = batch_item; ValueType* values; - const index_type* col_idxs; - const index_type* row_ptrs; + index_type* col_idxs; + index_type* row_ptrs; size_type num_batch_items; index_type num_rows; index_type num_cols; @@ -125,7 +125,7 @@ struct batch_item { using index_type = IndexType; ValueType* values; - const index_type* col_idxs; + index_type* col_idxs; index_type stride; index_type num_rows; index_type num_cols; @@ -148,7 +148,7 @@ struct uniform_batch { using entry_type = batch_item; ValueType* values; - const index_type* col_idxs; + index_type* col_idxs; size_type num_batch_items; index_type stride; index_type num_rows; From 2d61ab02637b9a3e3ee32baaae64f46bbdd8a8cf Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Jun 2024 16:44:20 +0200 Subject: [PATCH 03/15] [batch] add macro to instantiate batched solver --- core/solver/batch_dispatch.hpp | 35 ++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp index 018a6674df5..ff5bb3f5390 100644 --- a/core/solver/batch_dispatch.hpp +++ b/core/solver/batch_dispatch.hpp @@ -164,6 +164,41 @@ enum class log_type { simple_convergence_completion }; } // namespace log +#define GKO_BATCH_INSTANTIATE_STOP(macro, ...) \ + macro(__VA_ARGS__, \ + ::gko::batch::solver::device::batch_stop::SimpleAbsResidual); \ + template macro( \ + __VA_ARGS__, \ + ::gko::batch::solver::device::batch_stop::SimpleRelResidual) + +#define GKO_BATCH_INSTANTIATE_PRECONDITIONER(macro, ...) \ + GKO_BATCH_INSTANTIATE_STOP( \ + macro, __VA_ARGS__, \ + ::gko::batch::solver::device::batch_preconditioner::Identity); \ + template GKO_BATCH_INSTANTIATE_STOP( \ + macro, __VA_ARGS__, \ + ::gko::batch::solver::device::batch_preconditioner::ScalarJacobi); \ + template GKO_BATCH_INSTANTIATE_STOP( \ + macro, __VA_ARGS__, \ + ::gko::batch::solver::device::batch_preconditioner::BlockJacobi) + +#define GKO_BATCH_INSTANTIATE_LOGGER(macro, ...) \ + GKO_BATCH_INSTANTIATE_PRECONDITIONER( \ + macro, __VA_ARGS__, \ + ::gko::batch::solver::device::batch_log::SimpleFinalLogger) + +#define GKO_BATCH_INSTANTIATE_MATRIX(macro, ...) \ + GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__, \ + batch::matrix::ell::uniform_batch); \ + template GKO_BATCH_INSTANTIATE_LOGGER( \ + macro, __VA_ARGS__, batch::matrix::dense::uniform_batch); \ + template GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__, \ + batch::matrix::csr::uniform_batch) + +#define GKO_BATCH_INSTANTIATE(macro, ...) \ + GKO_BATCH_INSTANTIATE_MATRIX(macro, __VA_ARGS__) + + /** * Handles dispatching to the correct instantiation of a batched solver * depending on runtime parameters. From 9ef2d03f99ecbfe7af45d8f353f415883fbecae5 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 17 Sep 2024 15:40:56 +0200 Subject: [PATCH 04/15] [batch] split bicgstab compilation (hip) --- .../solver/batch_bicgstab_kernels.hpp | 2 + hip/CMakeLists.txt | 2 + hip/solver/batch_bicgstab_kernels.hip.cpp | 193 +++++++----------- hip/solver/batch_bicgstab_launch.hip.hpp | 78 +++++++ .../batch_bicgstab_launch.instantiate.hip.cpp | 67 ++++++ 5 files changed, 227 insertions(+), 115 deletions(-) create mode 100644 hip/solver/batch_bicgstab_launch.hip.hpp create mode 100644 hip/solver/batch_bicgstab_launch.instantiate.hip.cpp diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp index 8ea31358ed5..6bce1b53bb8 100644 --- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp +++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp @@ -5,6 +5,8 @@ #ifndef GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ #define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ +#include "core/solver/batch_bicgstab_kernels.hpp" + #include #include #include diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 7d914d57a81..c91a8609313 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.21) include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) +add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.hip.cpp BATCH_BICGSTAB_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES @@ -18,6 +19,7 @@ set(GINKGO_HIP_SOURCES ${FBCSR_INSTANTIATE} preconditioner/batch_jacobi_kernels.hip.cpp solver/batch_bicgstab_kernels.hip.cpp + ${BATCH_BICGSTAB_INSTANTIATE} solver/batch_cg_kernels.hip.cpp solver/lower_trs_kernels.hip.cpp solver/upper_trs_kernels.hip.cpp diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index 17199d2cd19..697bcb94551 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -5,19 +5,13 @@ #include "core/solver/batch_bicgstab_kernels.hpp" #include -#include #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" -#include "common/cuda_hip/base/batch_struct.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" #include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" +#include "hip/solver/batch_bicgstab_launch.hip.hpp" namespace gko { @@ -51,47 +45,24 @@ int get_num_threads_per_block(std::shared_ptr exec, } -template -using settings = gko::kernels::batch_bicgstab::settings; - - -template +template class kernel_caller { public: - using value_type = HipValueType; + using hip_value_type = hip_type; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{exec}, settings_{settings} {} - template - void launch_apply_kernel( - const gko::kernels::batch_bicgstab::storage_config& sconf, - LogType& logger, PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) const - { - batch_single_kernels::apply_kernel - <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); - } - - template void call_kernel( LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = @@ -102,99 +73,92 @@ class kernel_caller { exec_->get_device_id())); const int block_size = get_num_threads_per_block(exec_, mat.num_rows); - bool is_block_size_aligned = block_size % config::warp_size == 0; GKO_ASSERT(block_size >= 2 * config::warp_size); - GKO_ASSERT(is_block_size_aligned); + GKO_ASSERT(block_size % config::warp_size == 0); // Returns amount required in bytes const size_t prec_size = PrecType::dynamic_work_size( padded_num_rows, mat.get_single_item_num_nnz()); - const auto sconf = - gko::kernels::batch_bicgstab::compute_shared_storage( - shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), - b.num_rhs); + const auto sconf = gko::kernels::batch_bicgstab::compute_shared_storage< + PrecType, hip_value_type>(shmem_per_blk, padded_num_rows, + mat.get_single_item_num_nnz(), b.num_rhs); const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(hip_value_type) + (sconf.prec_shared ? prec_size : 0); - auto workspace = gko::array( + auto workspace = gko::array( exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - bool is_stride_aligned = - sconf.gmem_stride_bytes % sizeof(value_type) == 0; - GKO_ASSERT(is_stride_aligned); + sconf.gmem_stride_bytes * num_batch_items / sizeof(hip_value_type)); + GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(hip_value_type) == 0); - value_type* const workspace_data = workspace.get_data(); + hip_value_type* const workspace_data = workspace.get_data(); - // Only instantiate when full optimizations has been enabled. Otherwise, - // just use the default one with no shared memory. // Template parameters launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, workspace_data, - // block_size, shared_size); - // } else { - // switch (sconf.n_shared) { - // case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - // break; - // case 1: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 2: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 3: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 4: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 5: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 6: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 7: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 8: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 9: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // default: - // GKO_NOT_IMPLEMENTED; - // } - // } + if (sconf.prec_shared) { + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + } else { + switch (sconf.n_shared) { + case 0: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 1: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 2: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 3: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 4: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 5: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 6: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 7: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 8: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 9: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + default: + GKO_NOT_IMPLEMENTED; + } + } } private: std::shared_ptr exec_; - const settings> settings_; + const settings> settings_; }; @@ -207,9 +171,8 @@ void apply(std::shared_ptr exec, batch::MultiVector* const x, batch::log::detail::log_data>& logdata) { - using hip_value_type = hip_type; auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); + kernel_caller(exec, settings), settings, mat, precon); dispatcher.apply(b, x, logdata); } diff --git a/hip/solver/batch_bicgstab_launch.hip.hpp b/hip/solver/batch_bicgstab_launch.hip.hpp new file mode 100644 index 00000000000..08d39b8fd5e --- /dev/null +++ b/hip/solver/batch_bicgstab_launch.hip.hpp @@ -0,0 +1,78 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_bicgstab_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace batch_bicgstab { + + +template +using settings = gko::kernels::batch_bicgstab::settings; + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_bicgstab::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const hip_type* const __restrict__ b_values, + hip_type* const __restrict__ x_values, + hip_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size); + +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _n_shared, _prec_shared, \ + mat_t, log_t, pre_t, stop_t) \ + void launch_apply_kernel<_vtype, _n_shared, _prec_shared, \ + stop_t>>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_bicgstab::storage_config& sconf, \ + const settings>& settings, \ + log_t>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const hip_type<_vtype>* const __restrict__ b_values, \ + hip_type<_vtype>* const __restrict__ x_values, \ + hip_type<_vtype>* const __restrict__ workspace_data, \ + const int& block_size, const size_t& shared_size) + +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 0, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 1, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 2, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 3, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 4, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 5, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 6, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 7, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 8, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, true) + + +} // namespace batch_bicgstab +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/solver/batch_bicgstab_launch.instantiate.hip.cpp b/hip/solver/batch_bicgstab_launch.instantiate.hip.cpp new file mode 100644 index 00000000000..fb26c562a94 --- /dev/null +++ b/hip/solver/batch_bicgstab_launch.instantiate.hip.cpp @@ -0,0 +1,67 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_bicgstab_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" +#include "hip/solver/batch_bicgstab_launch.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace batch_bicgstab { + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_bicgstab::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const hip_type* const __restrict__ b_values, + hip_type* const __restrict__ x_values, + hip_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size) +{ + batch_single_kernels::apply_kernel + <<get_stream()>>>( + sconf, settings.max_iterations, as_hip_type(settings.residual_tol), + logger, prec, mat, b_values, x_values, workspace_data); +} + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE); +// end + + +} // namespace batch_bicgstab +} // namespace hip +} // namespace kernels +} // namespace gko From a4c50f1072aa7bec64d0ac6f09fc8e019fceaa0d Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 17 Sep 2024 15:49:51 +0200 Subject: [PATCH 05/15] [batch] split bicgstab compilation (cuda) --- cuda/CMakeLists.txt | 2 + cuda/solver/batch_bicgstab_kernels.cu | 249 ++++++------------ cuda/solver/batch_bicgstab_launch.cuh | 112 ++++++++ .../batch_bicgstab_launch.instantiate.cu | 120 +++++++++ 4 files changed, 320 insertions(+), 163 deletions(-) create mode 100644 cuda/solver/batch_bicgstab_launch.cuh create mode 100644 cuda/solver/batch_bicgstab_launch.instantiate.cu diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 000cb7b215f..9529222c540 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -3,6 +3,7 @@ add_library(ginkgo_cuda $ "") include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) +add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.cu BATCH_BICGSTAB_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_cuda @@ -21,6 +22,7 @@ target_sources(ginkgo_cuda matrix/fft_kernels.cu preconditioner/batch_jacobi_kernels.cu solver/batch_bicgstab_kernels.cu + ${BATCH_BICGSTAB_INSTANTIATE} solver/batch_cg_kernels.cu solver/lower_trs_kernels.cu solver/upper_trs_kernels.cu diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 8a5eee6b196..bd07259f771 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -5,16 +5,13 @@ #include "core/solver/batch_bicgstab_kernels.hpp" #include -#include -#include "common/cuda_hip/base/batch_struct.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" #include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" +#include "cuda/solver/batch_bicgstab_launch.cuh" namespace gko { @@ -23,194 +20,121 @@ namespace cuda { namespace batch_bicgstab { -template -int get_num_threads_per_block(std::shared_ptr exec, - const int num_rows) -{ - int num_warps = std::max(num_rows / 4, 2); - constexpr int warp_sz = static_cast(config::warp_size); - const int min_block_size = 2 * warp_sz; - const int device_max_threads = - ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz; - cudaFuncAttributes funcattr; - cudaFuncGetAttributes( - &funcattr, - batch_single_kernels::apply_kernel); - const int num_regs_used = funcattr.numRegs; - int max_regs_blk = 0; - cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, - exec->get_device_id()); - const int max_threads_regs = - ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; - int max_threads = std::min(max_threads_regs, device_max_threads); - max_threads = max_threads <= 1024 ? max_threads : 1024; - return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); -} - - -template -int get_max_dynamic_shared_memory(std::shared_ptr exec) -{ - int shmem_per_sm = 0; - cudaDeviceGetAttribute(&shmem_per_sm, - cudaDevAttrMaxSharedMemoryPerMultiprocessor, - exec->get_device_id()); - GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( - batch_single_kernels::apply_kernel, - cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); - cudaFuncAttributes funcattr; - cudaFuncGetAttributes( - &funcattr, - batch_single_kernels::apply_kernel); - return funcattr.maxDynamicSharedSizeBytes; -} - - -template -using settings = gko::kernels::batch_bicgstab::settings; - - -template +template class kernel_caller { public: - using value_type = CuValueType; + using cuda_value_type = cuda_type; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{std::move(exec)}, settings_{settings} {} - template - void launch_apply_kernel( - const gko::kernels::batch_bicgstab::storage_config& sconf, - LogType& logger, PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) const - { - batch_single_kernels::apply_kernel - <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); - } - - template void call_kernel( LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = ceildiv(mat.num_rows, align_multiple) * align_multiple; const int shmem_per_blk = get_max_dynamic_shared_memory(exec_); - // TODO - const int block_size = 256; - // get_num_threads_per_block( - // exec_, mat.num_rows); + BatchMatrixType, cuda_value_type>( + exec_); + const int block_size = + get_num_threads_per_block( + exec_, mat.num_rows); GKO_ASSERT(block_size >= 2 * config::warp_size); const size_t prec_size = PrecType::dynamic_work_size( padded_num_rows, mat.get_single_item_num_nnz()); - const auto sconf = - gko::kernels::batch_bicgstab::compute_shared_storage( - shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), - b.num_rhs); + const auto sconf = gko::kernels::batch_bicgstab::compute_shared_storage< + PrecType, cuda_value_type>(shmem_per_blk, padded_num_rows, + mat.get_single_item_num_nnz(), + b.num_rhs); const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(cuda_value_type) + (sconf.prec_shared ? prec_size : 0); - auto workspace = gko::array( - exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0); + auto workspace = gko::array( + exec_, sconf.gmem_stride_bytes * num_batch_items / + sizeof(cuda_value_type)); + GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(cuda_value_type) == 0); - value_type* const workspace_data = workspace.get_data(); + cuda_value_type* const workspace_data = workspace.get_data(); - // TODO: split compilation // Template parameters launch_apply_kernel - // if (sconf.prec_shared) { - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, workspace_data, - // block_size, shared_size); - // } else { - // switch (sconf.n_shared) { - // case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - // break; - // case 1: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 2: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 3: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 4: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 5: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 6: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 7: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 8: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 9: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // default: - // GKO_NOT_IMPLEMENTED; - // } - // } + if (sconf.prec_shared) { + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + } else { + switch (sconf.n_shared) { + case 0: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 1: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 2: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 3: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 4: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 5: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 6: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 7: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 8: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 9: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + default: + GKO_NOT_IMPLEMENTED; + } + } } private: std::shared_ptr exec_; - const settings> settings_; + const settings> settings_; }; @@ -223,9 +147,8 @@ void apply(std::shared_ptr exec, batch::MultiVector* const x, batch::log::detail::log_data>& logdata) { - using cu_value_type = cuda_type; auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); + kernel_caller(exec, settings), settings, mat, precon); dispatcher.apply(b, x, logdata); } diff --git a/cuda/solver/batch_bicgstab_launch.cuh b/cuda/solver/batch_bicgstab_launch.cuh new file mode 100644 index 00000000000..6c56b6456a8 --- /dev/null +++ b/cuda/solver/batch_bicgstab_launch.cuh @@ -0,0 +1,112 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_bicgstab_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_bicgstab { + + +template +using settings = gko::kernels::batch_bicgstab::settings; + + +template +int get_num_threads_per_block(std::shared_ptr exec, + const int num_rows); + +#define GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_( \ + _vtype, mat_t, log_t, pre_t, stop_t) \ + int get_num_threads_per_block< \ + stop_t>, pre_t>, \ + log_t>, mat_t>, \ + cuda_type<_vtype>>(std::shared_ptr exec, \ + const int num_rows) + +#define GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK(_vtype) \ + GKO_BATCH_INSTANTIATE( \ + GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_, _vtype) + + +template +int get_max_dynamic_shared_memory(std::shared_ptr exec); + +#define GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_( \ + _vtype, mat_t, log_t, pre_t, stop_t) \ + int get_max_dynamic_shared_memory< \ + stop_t>, pre_t>, \ + log_t>, mat_t>, \ + cuda_type<_vtype>>(std::shared_ptr exec) + +#define GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY(_vtype) \ + GKO_BATCH_INSTANTIATE( \ + GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_, _vtype) + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_bicgstab::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const ValueType* const __restrict__ b_values, + ValueType* const __restrict__ x_values, + ValueType* const __restrict__ workspace_data, const int& block_size, + const size_t& shared_size); + +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _n_shared, _prec_shared, \ + mat_t, log_t, pre_t, stop_t) \ + void launch_apply_kernel, _n_shared, _prec_shared, \ + stop_t>>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_bicgstab::storage_config& sconf, \ + const settings>>& settings, \ + log_t>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const cuda_type<_vtype>* const __restrict__ b_values, \ + cuda_type<_vtype>* const __restrict__ x_values, \ + cuda_type<_vtype>* const __restrict__ workspace_data, \ + const int& block_size, const size_t& shared_size) + +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 0, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 1, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 2, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 3, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 4, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 5, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 6, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 7, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 8, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, true) + + +} // namespace batch_bicgstab +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/solver/batch_bicgstab_launch.instantiate.cu b/cuda/solver/batch_bicgstab_launch.instantiate.cu new file mode 100644 index 00000000000..b88b19abb0f --- /dev/null +++ b/cuda/solver/batch_bicgstab_launch.instantiate.cu @@ -0,0 +1,120 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "cuda/solver/batch_bicgstab_launch.cuh" + +#include + +#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_bicgstab_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_bicgstab { + + +template +int get_num_threads_per_block(std::shared_ptr exec, + const int num_rows) +{ + int num_warps = std::max(num_rows / 4, 2); + constexpr int warp_sz = static_cast(config::warp_size); + const int min_block_size = 2 * warp_sz; + const int device_max_threads = + ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz; + cudaFuncAttributes funcattr; + cudaFuncGetAttributes(&funcattr, + apply_kernel); + const int num_regs_used = funcattr.numRegs; + int max_regs_blk = 0; + cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, + exec->get_device_id()); + const int max_threads_regs = + ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; + int max_threads = std::min(max_threads_regs, device_max_threads); + max_threads = max_threads <= 1024 ? max_threads : 1024; + return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); +} + + +template +int get_max_dynamic_shared_memory(std::shared_ptr exec) +{ + int shmem_per_sm = 0; + cudaDeviceGetAttribute(&shmem_per_sm, + cudaDevAttrMaxSharedMemoryPerMultiprocessor, + exec->get_device_id()); + GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( + apply_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); + cudaFuncAttributes funcattr; + cudaFuncGetAttributes(&funcattr, + apply_kernel); + return funcattr.maxDynamicSharedSizeBytes; +} + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_bicgstab::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const ValueType* const __restrict__ b_values, + ValueType* const __restrict__ x_values, + ValueType* const __restrict__ workspace_data, const int& block_size, + const size_t& shared_size) +{ + apply_kernel + <<get_stream()>>>( + sconf, settings.max_iterations, as_cuda_type(settings.residual_tol), + logger, prec, mat, b_values, x_values, workspace_data); +} + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE); +// end + + +} // namespace batch_bicgstab +} // namespace cuda +} // namespace kernels +} // namespace gko From a870c482e1cfbf41ad232f5d6452112a83aea70d Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 17 Sep 2024 16:11:49 +0200 Subject: [PATCH 06/15] [batch] split cg compilation (hip) --- common/cuda_hip/solver/batch_cg_kernels.hpp | 2 + .../batch_bicgstab_launch.instantiate.cu | 20 +-- hip/CMakeLists.txt | 2 + hip/solver/batch_cg_kernels.hip.cpp | 153 +++++++----------- hip/solver/batch_cg_launch.hip.hpp | 70 ++++++++ .../batch_cg_launch.instantiate.hip.cpp | 59 +++++++ 6 files changed, 204 insertions(+), 102 deletions(-) create mode 100644 hip/solver/batch_cg_launch.hip.hpp create mode 100644 hip/solver/batch_cg_launch.instantiate.hip.cpp diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp b/common/cuda_hip/solver/batch_cg_kernels.hpp index 7ccdc5f9926..c8502e28b1f 100644 --- a/common/cuda_hip/solver/batch_cg_kernels.hpp +++ b/common/cuda_hip/solver/batch_cg_kernels.hpp @@ -6,6 +6,8 @@ #define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_CG_KERNELS_HPP_ +#include "core/solver/batch_cg_kernels.hpp" + #include #include #include diff --git a/cuda/solver/batch_bicgstab_launch.instantiate.cu b/cuda/solver/batch_bicgstab_launch.instantiate.cu index b88b19abb0f..ec88cc17c85 100644 --- a/cuda/solver/batch_bicgstab_launch.instantiate.cu +++ b/cuda/solver/batch_bicgstab_launch.instantiate.cu @@ -29,9 +29,10 @@ int get_num_threads_per_block(std::shared_ptr exec, const int device_max_threads = ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz; cudaFuncAttributes funcattr; - cudaFuncGetAttributes(&funcattr, - apply_kernel); + cudaFuncGetAttributes( + &funcattr, + batch_single_kernels::apply_kernel); const int num_regs_used = funcattr.numRegs; int max_regs_blk = 0; cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, @@ -53,13 +54,14 @@ int get_max_dynamic_shared_memory(std::shared_ptr exec) cudaDevAttrMaxSharedMemoryPerMultiprocessor, exec->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( - apply_kernel, + batch_single_kernels::apply_kernel, cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); cudaFuncAttributes funcattr; - cudaFuncGetAttributes(&funcattr, - apply_kernel); + cudaFuncGetAttributes( + &funcattr, + batch_single_kernels::apply_kernel); return funcattr.maxDynamicSharedSizeBytes; } @@ -76,7 +78,7 @@ void launch_apply_kernel( ValueType* const __restrict__ workspace_data, const int& block_size, const size_t& shared_size) { - apply_kernel + batch_single_kernels::apply_kernel <<get_stream()>>>( sconf, settings.max_iterations, as_cuda_type(settings.residual_tol), logger, prec, mat, b_values, x_values, workspace_data); diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index c91a8609313..4a540046322 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -3,6 +3,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.hip.cpp BATCH_BICGSTAB_INSTANTIATE) +add_instantiation_files(. solver/batch_cg_launch.instantiate.hip.cpp BATCH_CG_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES @@ -21,6 +22,7 @@ set(GINKGO_HIP_SOURCES solver/batch_bicgstab_kernels.hip.cpp ${BATCH_BICGSTAB_INSTANTIATE} solver/batch_cg_kernels.hip.cpp + ${BATCH_CG_INSTANTIATE} solver/lower_trs_kernels.hip.cpp solver/upper_trs_kernels.hip.cpp ${GKO_UNIFIED_COMMON_SOURCES} diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index 6d5e3bff3b3..25ebd667a7e 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -5,18 +5,13 @@ #include "core/solver/batch_cg_kernels.hpp" #include -#include -#include "common/cuda_hip/base/batch_struct.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_cg_kernels.hpp" #include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" +#include "hip/solver/batch_cg_launch.hip.hpp" namespace gko { @@ -50,47 +45,24 @@ int get_num_threads_per_block(std::shared_ptr exec, } -template -using settings = gko::kernels::batch_cg::settings; - - -template +template class kernel_caller { public: - using value_type = HipValueType; + using hip_value_type = hip_type; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{exec}, settings_{settings} {} - template - void launch_apply_kernel( - const gko::kernels::batch_cg::storage_config& sconf, LogType& logger, - PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) const - { - batch_single_kernels::apply_kernel - <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); - } - - template void call_kernel( LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = @@ -101,79 +73,74 @@ class kernel_caller { exec_->get_device_id())); const int block_size = get_num_threads_per_block(exec_, mat.num_rows); - bool is_block_size_aligned = block_size % config::warp_size == 0; GKO_ASSERT(block_size >= 2 * config::warp_size); - GKO_ASSERT(is_block_size_aligned); + GKO_ASSERT(block_size % config::warp_size == 0); // Returns amount required in bytes const size_t prec_size = PrecType::dynamic_work_size( padded_num_rows, mat.get_single_item_num_nnz()); const auto sconf = gko::kernels::batch_cg::compute_shared_storage( + hip_value_type>( shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), b.num_rhs); const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(hip_value_type) + (sconf.prec_shared ? prec_size : 0); - auto workspace = gko::array( + auto workspace = gko::array( exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - bool is_stride_aligned = - sconf.gmem_stride_bytes % sizeof(value_type) == 0; - GKO_ASSERT(is_stride_aligned); - - value_type* const workspace_data = workspace.get_data(); - - // Only instantiate when full optimizations has been enabled. Otherwise, - // just use the default one with no shared memory. - // Template parameters launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, workspace_data, - // block_size, shared_size); - // } else { - // switch (sconf.n_shared) { - // case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - // break; - // case 1: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 2: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 3: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 4: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 5: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // default: - // GKO_NOT_IMPLEMENTED; - // } - // } + sconf.gmem_stride_bytes * num_batch_items / sizeof(hip_value_type)); + GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(hip_value_type) == 0); + + hip_value_type* const workspace_data = workspace.get_data(); + + // Template parameters launch_apply_kernel + if (sconf.prec_shared) { + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + } else { + switch (sconf.n_shared) { + case 0: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 1: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 2: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 3: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 4: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 5: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + default: + GKO_NOT_IMPLEMENTED; + } + } } private: std::shared_ptr exec_; - const settings> settings_; + const settings> settings_; }; diff --git a/hip/solver/batch_cg_launch.hip.hpp b/hip/solver/batch_cg_launch.hip.hpp new file mode 100644 index 00000000000..a1e41310b8b --- /dev/null +++ b/hip/solver/batch_cg_launch.hip.hpp @@ -0,0 +1,70 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_cg_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace batch_cg { + + +template +using settings = gko::kernels::batch_cg::settings; + + +template +void launch_apply_kernel(std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, + LogType& logger, PrecType& prec, + const BatchMatrixType& mat, + const hip_type* const __restrict__ b_values, + hip_type* const __restrict__ x_values, + hip_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size); + +#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \ + log_t, pre_t, stop_t) \ + void launch_apply_kernel, _n_shared, _prec_shared, \ + stop_t>>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_cg::storage_config& sconf, \ + const settings>& settings, \ + log_t>>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const hip_type<_vtype>* const __restrict__ b_values, \ + hip_type<_vtype>* const __restrict__ x_values, \ + hip_type<_vtype>* const __restrict__ workspace_data, \ + const int& block_size, const size_t& shared_size) + +#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true) + + +} // namespace batch_cg +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/solver/batch_cg_launch.instantiate.hip.cpp b/hip/solver/batch_cg_launch.instantiate.hip.cpp new file mode 100644 index 00000000000..3605a88651d --- /dev/null +++ b/hip/solver/batch_cg_launch.instantiate.hip.cpp @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include "common/cuda_hip/solver/batch_cg_kernels.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_cg_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" +#include "hip/solver/batch_cg_launch.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace batch_cg { + + +template +void launch_apply_kernel(std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, + LogType& logger, PrecType& prec, + const BatchMatrixType& mat, + const hip_type* const __restrict__ b_values, + hip_type* const __restrict__ x_values, + hip_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size) +{ + batch_single_kernels::apply_kernel + <<get_stream()>>>( + sconf, settings.max_iterations, as_hip_type(settings.residual_tol), + logger, prec, mat, b_values, x_values, workspace_data); +} + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE); +// end + + +} // namespace batch_cg +} // namespace hip +} // namespace kernels +} // namespace gko From ca5e7b5d000b5187d1a2250393a5166a98468581 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 17 Sep 2024 16:14:38 +0200 Subject: [PATCH 07/15] [batch] split cg compilation (cuda) --- cuda/CMakeLists.txt | 2 + cuda/solver/batch_cg_kernels.cu | 204 +++++++-------------- cuda/solver/batch_cg_launch.cuh | 104 +++++++++++ cuda/solver/batch_cg_launch.instantiate.cu | 114 ++++++++++++ 4 files changed, 284 insertions(+), 140 deletions(-) create mode 100644 cuda/solver/batch_cg_launch.cuh create mode 100644 cuda/solver/batch_cg_launch.instantiate.cu diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 9529222c540..bfa65eee79b 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -4,6 +4,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.cu BATCH_BICGSTAB_INSTANTIATE) +add_instantiation_files(. solver/batch_cg_launch.instantiate.cu BATCH_CG_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_cuda @@ -24,6 +25,7 @@ target_sources(ginkgo_cuda solver/batch_bicgstab_kernels.cu ${BATCH_BICGSTAB_INSTANTIATE} solver/batch_cg_kernels.cu + ${BATCH_CG_INSTANTIATE} solver/lower_trs_kernels.cu solver/upper_trs_kernels.cu ${GKO_UNIFIED_COMMON_SOURCES} diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index 32e66d7ee54..126a62006cf 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -5,16 +5,13 @@ #include "core/solver/batch_cg_kernels.hpp" #include -#include -#include "common/cuda_hip/base/batch_struct.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_cg_kernels.hpp" #include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" +#include "cuda/solver/batch_cg_launch.cuh" namespace gko { @@ -23,104 +20,35 @@ namespace cuda { namespace batch_cg { -template -int get_num_threads_per_block(std::shared_ptr exec, - const int num_rows) -{ - int num_warps = std::max(num_rows / 4, 2); - constexpr int warp_sz = static_cast(config::warp_size); - const int min_block_size = 2 * warp_sz; - const int device_max_threads = - (std::max(num_rows, min_block_size) / warp_sz) * warp_sz; - cudaFuncAttributes funcattr; - cudaFuncGetAttributes( - &funcattr, - batch_single_kernels::apply_kernel); - const int num_regs_used = funcattr.numRegs; - int max_regs_blk = 0; - cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, - exec->get_device_id()); - const int max_threads_regs = - ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; - int max_threads = std::min(max_threads_regs, device_max_threads); - max_threads = max_threads <= 1024 ? max_threads : 1024; - return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); -} - - -template -int get_max_dynamic_shared_memory(std::shared_ptr exec) -{ - int shmem_per_sm = 0; - cudaDeviceGetAttribute(&shmem_per_sm, - cudaDevAttrMaxSharedMemoryPerMultiprocessor, - exec->get_device_id()); - GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( - batch_single_kernels::apply_kernel, - cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); - cudaFuncAttributes funcattr; - cudaFuncGetAttributes( - &funcattr, - batch_single_kernels::apply_kernel); - return funcattr.maxDynamicSharedSizeBytes; -} - - -template -using settings = gko::kernels::batch_cg::settings; - - -template +template class kernel_caller { public: - using value_type = CuValueType; + using cuda_value_type = cuda_type; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{std::move(exec)}, settings_{settings} {} - template - void launch_apply_kernel( - const gko::kernels::batch_cg::storage_config& sconf, LogType& logger, - PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) const - { - batch_single_kernels::apply_kernel - <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); - } - template void call_kernel( LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = ceildiv(mat.num_rows, align_multiple) * align_multiple; const int shmem_per_blk = get_max_dynamic_shared_memory(exec_); + BatchMatrixType, cuda_value_type>( + exec_); const int block_size = get_num_threads_per_block( + BatchMatrixType, cuda_value_type>( exec_, mat.num_rows); GKO_ASSERT(block_size >= 2 * config::warp_size); @@ -128,69 +56,66 @@ public: padded_num_rows, mat.get_single_item_num_nnz()); const auto sconf = gko::kernels::batch_cg::compute_shared_storage( + cuda_value_type>( shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), b.num_rhs); const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(cuda_value_type) + (sconf.prec_shared ? prec_size : 0); - auto workspace = gko::array( - exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0); - - value_type* const workspace_data = workspace.get_data(); - - // TODO: split compilation - // Only instantiate when full optimizations has been enabled. Otherwise, - // just use the default one with no shared memory. - // Template parameters launch_apply_kernel - // if (sconf.prec_shared) { - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, workspace_data, - // block_size, shared_size); - // } else { - // switch (sconf.n_shared) { - // case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - // break; - // case 1: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 2: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 3: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 4: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 5: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // default: - // GKO_NOT_IMPLEMENTED; - // } - // } + auto workspace = gko::array( + exec_, sconf.gmem_stride_bytes * num_batch_items / + sizeof(cuda_value_type)); + GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(cuda_value_type) == 0); + + cuda_value_type* const workspace_data = workspace.get_data(); + + // Template parameters launch_apply_kernel + if (sconf.prec_shared) { + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + } else { + switch (sconf.n_shared) { + case 0: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 1: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 2: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 3: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 4: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 5: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + default: + GKO_NOT_IMPLEMENTED; + } + } } private: std::shared_ptr exec_; - const settings> settings_; + const settings> settings_; }; @@ -203,9 +128,8 @@ void apply(std::shared_ptr exec, batch::MultiVector* const x, batch::log::detail::log_data>& logdata) { - using cu_value_type = cuda_type; auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); + kernel_caller(exec, settings), settings, mat, precon); dispatcher.apply(b, x, logdata); } diff --git a/cuda/solver/batch_cg_launch.cuh b/cuda/solver/batch_cg_launch.cuh new file mode 100644 index 00000000000..7196d6f8366 --- /dev/null +++ b/cuda/solver/batch_cg_launch.cuh @@ -0,0 +1,104 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_cg_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_cg { + + +template +using settings = gko::kernels::batch_cg::settings; + + +template +int get_num_threads_per_block(std::shared_ptr exec, + const int num_rows); + +#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_(_vtype, mat_t, log_t, \ + pre_t, stop_t) \ + int get_num_threads_per_block< \ + stop_t>, pre_t>, \ + log_t>>, \ + mat_t>, cuda_type<_vtype>>( \ + std::shared_ptr exec, const int num_rows) + +#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_, \ + _vtype) + + +template +int get_max_dynamic_shared_memory(std::shared_ptr exec); + +#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_( \ + _vtype, mat_t, log_t, pre_t, stop_t) \ + int get_max_dynamic_shared_memory< \ + stop_t>, pre_t>, \ + log_t>, mat_t>, \ + cuda_type<_vtype>>(std::shared_ptr exec) + +#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_, \ + _vtype) + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const cuda_type* const __restrict__ b_values, + cuda_type* const __restrict__ x_values, + cuda_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size); + +#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \ + log_t, pre_t, stop_t) \ + void launch_apply_kernel, _n_shared, _prec_shared, \ + stop_t>>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_cg::storage_config& sconf, \ + const settings>& settings, \ + log_t>>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const cuda_type<_vtype>* const __restrict__ b_values, \ + cuda_type<_vtype>* const __restrict__ x_values, \ + cuda_type<_vtype>* const __restrict__ workspace_data, \ + const int& block_size, const size_t& shared_size) + +#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true) + + +} // namespace batch_cg +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/solver/batch_cg_launch.instantiate.cu b/cuda/solver/batch_cg_launch.instantiate.cu new file mode 100644 index 00000000000..9fca587f33e --- /dev/null +++ b/cuda/solver/batch_cg_launch.instantiate.cu @@ -0,0 +1,114 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "cuda/solver/batch_cg_launch.cuh" + +#include + +#include "common/cuda_hip/solver/batch_cg_kernels.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_cg_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_cg { + + +template +int get_num_threads_per_block(std::shared_ptr exec, + const int num_rows) +{ + int num_warps = std::max(num_rows / 4, 2); + constexpr int warp_sz = static_cast(config::warp_size); + const int min_block_size = 2 * warp_sz; + const int device_max_threads = + (std::max(num_rows, min_block_size) / warp_sz) * warp_sz; + cudaFuncAttributes funcattr; + cudaFuncGetAttributes( + &funcattr, + batch_single_kernels::apply_kernel); + const int num_regs_used = funcattr.numRegs; + int max_regs_blk = 0; + cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, + exec->get_device_id()); + const int max_threads_regs = + ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; + int max_threads = std::min(max_threads_regs, device_max_threads); + max_threads = max_threads <= 1024 ? max_threads : 1024; + return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); +} + + +template +int get_max_dynamic_shared_memory(std::shared_ptr exec) +{ + int shmem_per_sm = 0; + cudaDeviceGetAttribute(&shmem_per_sm, + cudaDevAttrMaxSharedMemoryPerMultiprocessor, + exec->get_device_id()); + GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( + batch_single_kernels::apply_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); + cudaFuncAttributes funcattr; + cudaFuncGetAttributes( + &funcattr, + batch_single_kernels::apply_kernel); + return funcattr.maxDynamicSharedSizeBytes; +} + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const cuda_type* const __restrict__ b_values, + cuda_type* const __restrict__ x_values, + cuda_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size) +{ + batch_single_kernels::apply_kernel + <<get_stream()>>>( + sconf, settings.max_iterations, as_cuda_type(settings.residual_tol), + logger, prec, mat, b_values, x_values, workspace_data); +} + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE); +// end + + +} // namespace batch_cg +} // namespace cuda +} // namespace kernels +} // namespace gko From b93f60050d0de02f1c7e16c53ebd4a4da2aba972 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 1 Oct 2024 11:53:30 +0200 Subject: [PATCH 08/15] [batch] review updates: - adds header guard Co-authored-by: Pratik Nayak --- cuda/solver/batch_bicgstab_launch.cuh | 7 +++++++ cuda/solver/batch_cg_launch.cuh | 7 +++++++ hip/solver/batch_bicgstab_launch.hip.hpp | 7 +++++++ hip/solver/batch_cg_launch.hip.hpp | 7 +++++++ 4 files changed, 28 insertions(+) diff --git a/cuda/solver/batch_bicgstab_launch.cuh b/cuda/solver/batch_bicgstab_launch.cuh index 6c56b6456a8..5106b21251e 100644 --- a/cuda/solver/batch_bicgstab_launch.cuh +++ b/cuda/solver/batch_bicgstab_launch.cuh @@ -2,6 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_CUDA_SOLVER_BATCH_BICGSTAB_LAUNCH_CUH_ +#define GKO_CUDA_SOLVER_BATCH_BICGSTAB_LAUNCH_CUH_ + + #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" @@ -110,3 +114,6 @@ void launch_apply_kernel( } // namespace cuda } // namespace kernels } // namespace gko + + +#endif diff --git a/cuda/solver/batch_cg_launch.cuh b/cuda/solver/batch_cg_launch.cuh index 7196d6f8366..9cb470eb51b 100644 --- a/cuda/solver/batch_cg_launch.cuh +++ b/cuda/solver/batch_cg_launch.cuh @@ -2,6 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_CUDA_SOLVER_BATCH_CG_LAUNCH_CUH_ +#define GKO_CUDA_SOLVER_BATCH_CG_LAUNCH_CUH_ + + #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" @@ -102,3 +106,6 @@ void launch_apply_kernel( } // namespace cuda } // namespace kernels } // namespace gko + + +#endif diff --git a/hip/solver/batch_bicgstab_launch.hip.hpp b/hip/solver/batch_bicgstab_launch.hip.hpp index 08d39b8fd5e..0f62a9487a3 100644 --- a/hip/solver/batch_bicgstab_launch.hip.hpp +++ b/hip/solver/batch_bicgstab_launch.hip.hpp @@ -2,6 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_HIP_SOLVER_BATCH_BICGSTAB_LAUNCH_HIP_HPP_ +#define GKO_HIP_SOLVER_BATCH_BICGSTAB_LAUNCH_HIP_HPP_ + + #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" @@ -76,3 +80,6 @@ void launch_apply_kernel( } // namespace hip } // namespace kernels } // namespace gko + + +#endif diff --git a/hip/solver/batch_cg_launch.hip.hpp b/hip/solver/batch_cg_launch.hip.hpp index a1e41310b8b..7071c5c4065 100644 --- a/hip/solver/batch_cg_launch.hip.hpp +++ b/hip/solver/batch_cg_launch.hip.hpp @@ -2,6 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_HIP_SOLVER_BATCH_CG_LAUNCH_HPP_ +#define GKO_HIP_SOLVER_BATCH_CG_LAUNCH_HPP_ + + #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" @@ -68,3 +72,6 @@ void launch_apply_kernel(std::shared_ptr exec, } // namespace hip } // namespace kernels } // namespace gko + + +#endif From 81973ee81ccfb5bdedcfd6e293733f470128a254 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 2 Oct 2024 13:16:28 +0200 Subject: [PATCH 09/15] [batch] add launch bounds and fix register check --- .../solver/batch_bicgstab_kernels.hpp | 19 +++++++++----- common/cuda_hip/solver/batch_cg_kernels.hpp | 21 ++++++++++------ .../batch_bicgstab_launch.instantiate.cu | 25 +++++++++++++------ cuda/solver/batch_cg_launch.instantiate.cu | 22 ++++++++++------ 4 files changed, 58 insertions(+), 29 deletions(-) diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp index 6bce1b53bb8..9aa14243de3 100644 --- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp +++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp @@ -27,6 +27,11 @@ namespace gko { namespace kernels { namespace GKO_DEVICE_NAMESPACE { + + +constexpr int max_bicgstab_threads = 1024; + + namespace batch_single_kernels { @@ -170,12 +175,14 @@ __device__ __forceinline__ void update_x_middle( template -__global__ void apply_kernel( - const gko::kernels::batch_bicgstab::storage_config sconf, - const int max_iter, const gko::remove_complex tol, - LogType logger, PrecType prec_shared, const BatchMatrixType mat, - const ValueType* const __restrict__ b, ValueType* const __restrict__ x, - ValueType* const __restrict__ workspace = nullptr) +__global__ void __launch_bounds__(max_bicgstab_threads) + apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, + const int max_iter, const gko::remove_complex tol, + LogType logger, PrecType prec_shared, + const BatchMatrixType mat, + const ValueType* const __restrict__ b, + ValueType* const __restrict__ x, + ValueType* const __restrict__ workspace = nullptr) { using real_type = typename gko::remove_complex; const auto num_batch_items = mat.num_batch_items; diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp b/common/cuda_hip/solver/batch_cg_kernels.hpp index c8502e28b1f..2c42d359fff 100644 --- a/common/cuda_hip/solver/batch_cg_kernels.hpp +++ b/common/cuda_hip/solver/batch_cg_kernels.hpp @@ -29,6 +29,11 @@ namespace gko { namespace kernels { namespace GKO_DEVICE_NAMESPACE { + + +constexpr int max_cg_threads = 1024; + + namespace batch_single_kernels { @@ -115,14 +120,14 @@ __device__ __forceinline__ void update_x_and_r( template -__global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf, - const int max_iter, - const gko::remove_complex tol, - LogType logger, PrecType prec_shared, - const BatchMatrixType mat, - const ValueType* const __restrict__ b, - ValueType* const __restrict__ x, - ValueType* const __restrict__ workspace = nullptr) +__global__ void __launch_bounds__(max_cg_threads) + apply_kernel(const gko::kernels::batch_cg::storage_config sconf, + const int max_iter, const gko::remove_complex tol, + LogType logger, PrecType prec_shared, + const BatchMatrixType mat, + const ValueType* const __restrict__ b, + ValueType* const __restrict__ x, + ValueType* const __restrict__ workspace = nullptr) { using real_type = typename gko::remove_complex; const auto num_batch_items = mat.num_batch_items; diff --git a/cuda/solver/batch_bicgstab_launch.instantiate.cu b/cuda/solver/batch_bicgstab_launch.instantiate.cu index ec88cc17c85..ad17394c4a9 100644 --- a/cuda/solver/batch_bicgstab_launch.instantiate.cu +++ b/cuda/solver/batch_bicgstab_launch.instantiate.cu @@ -27,20 +27,29 @@ int get_num_threads_per_block(std::shared_ptr exec, constexpr int warp_sz = static_cast(config::warp_size); const int min_block_size = 2 * warp_sz; const int device_max_threads = - ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz; - cudaFuncAttributes funcattr; - cudaFuncGetAttributes( - &funcattr, - batch_single_kernels::apply_kernel); - const int num_regs_used = funcattr.numRegs; + (std::max(num_rows, min_block_size) / warp_sz) * warp_sz; + auto get_num_regs = [](const auto func) { + cudaFuncAttributes funcattr; + cudaFuncGetAttributes(&funcattr, func); + return funcattr.numRegs; + }; + const int num_regs_used = std::max( + get_num_regs( + batch_single_kernels::apply_kernel), + get_num_regs( + batch_single_kernels::apply_kernel)); int max_regs_blk = 0; cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, exec->get_device_id()); const int max_threads_regs = ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; int max_threads = std::min(max_threads_regs, device_max_threads); - max_threads = max_threads <= 1024 ? max_threads : 1024; + max_threads = max_threads <= max_bicgstab_threads ? max_threads + : max_bicgstab_threads; return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); } diff --git a/cuda/solver/batch_cg_launch.instantiate.cu b/cuda/solver/batch_cg_launch.instantiate.cu index 9fca587f33e..89e96e85ace 100644 --- a/cuda/solver/batch_cg_launch.instantiate.cu +++ b/cuda/solver/batch_cg_launch.instantiate.cu @@ -28,19 +28,27 @@ int get_num_threads_per_block(std::shared_ptr exec, const int min_block_size = 2 * warp_sz; const int device_max_threads = (std::max(num_rows, min_block_size) / warp_sz) * warp_sz; - cudaFuncAttributes funcattr; - cudaFuncGetAttributes( - &funcattr, - batch_single_kernels::apply_kernel); - const int num_regs_used = funcattr.numRegs; + auto get_num_regs = [](const auto func) { + cudaFuncAttributes funcattr; + cudaFuncGetAttributes(&funcattr, func); + return funcattr.numRegs; + }; + const int num_regs_used = std::max( + get_num_regs( + batch_single_kernels::apply_kernel), + get_num_regs( + batch_single_kernels::apply_kernel)); int max_regs_blk = 0; cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, exec->get_device_id()); const int max_threads_regs = ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; int max_threads = std::min(max_threads_regs, device_max_threads); - max_threads = max_threads <= 1024 ? max_threads : 1024; + max_threads = max_threads <= max_cg_threads ? max_threads : max_cg_threads; return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); } From 57d0a790f60bb1e06f2ad25e17af7c2a2d588152 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 21 Oct 2024 11:19:25 +0200 Subject: [PATCH 10/15] [batch] add macro indirection Co-authored-by: Tobias Ribizel --- core/solver/batch_dispatch.hpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp index ff5bb3f5390..3e3fd01a03c 100644 --- a/core/solver/batch_dispatch.hpp +++ b/core/solver/batch_dispatch.hpp @@ -164,12 +164,16 @@ enum class log_type { simple_convergence_completion }; } // namespace log -#define GKO_BATCH_INSTANTIATE_STOP(macro, ...) \ - macro(__VA_ARGS__, \ - ::gko::batch::solver::device::batch_stop::SimpleAbsResidual); \ - template macro( \ - __VA_ARGS__, \ - ::gko::batch::solver::device::batch_stop::SimpleRelResidual) +#define GKO_INDIRECT(...) __VA_ARGS__ + + +#define GKO_BATCH_INSTANTIATE_STOP(macro, ...) \ + GKO_INDIRECT( \ + macro(__VA_ARGS__, \ + ::gko::batch::solver::device::batch_stop::SimpleAbsResidual)); \ + template GKO_INDIRECT( \ + macro(__VA_ARGS__, \ + ::gko::batch::solver::device::batch_stop::SimpleRelResidual)) #define GKO_BATCH_INSTANTIATE_PRECONDITIONER(macro, ...) \ GKO_BATCH_INSTANTIATE_STOP( \ From 68aa8e38d413f2213117b8fd3fe58737ad048e82 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 7 Nov 2024 09:33:02 +0000 Subject: [PATCH 11/15] [batch] unify batch solver --- .../cuda_hip/solver/batch_bicgstab_launch.hpp | 35 +++++------- .../batch_bicgstab_launch.instantiate.cpp | 18 +++--- .../cuda_hip/solver/batch_cg_launch.hpp | 55 +++++++++---------- .../solver/batch_cg_launch.instantiate.cpp | 30 +++++----- cuda/CMakeLists.txt | 15 +++-- cuda/solver/batch_bicgstab_launch.cuh | 52 +----------------- .../batch_bicgstab_launch.instantiate.cu | 41 -------------- cuda/solver/batch_cg_launch.cuh | 44 +-------------- cuda/solver/batch_cg_launch.instantiate.cu | 33 ----------- hip/CMakeLists.txt | 4 +- hip/solver/batch_bicgstab_kernels.hip.cpp | 24 ++++---- hip/solver/batch_cg_kernels.hip.cpp | 19 +++---- 12 files changed, 101 insertions(+), 269 deletions(-) rename hip/solver/batch_bicgstab_launch.hip.hpp => common/cuda_hip/solver/batch_bicgstab_launch.hpp (76%) rename hip/solver/batch_bicgstab_launch.instantiate.hip.cpp => common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp (81%) rename hip/solver/batch_cg_launch.hip.hpp => common/cuda_hip/solver/batch_cg_launch.hpp (58%) rename hip/solver/batch_cg_launch.instantiate.hip.cpp => common/cuda_hip/solver/batch_cg_launch.instantiate.cpp (60%) diff --git a/hip/solver/batch_bicgstab_launch.hip.hpp b/common/cuda_hip/solver/batch_bicgstab_launch.hpp similarity index 76% rename from hip/solver/batch_bicgstab_launch.hip.hpp rename to common/cuda_hip/solver/batch_bicgstab_launch.hpp index 0f62a9487a3..3db03db0409 100644 --- a/hip/solver/batch_bicgstab_launch.hip.hpp +++ b/common/cuda_hip/solver/batch_bicgstab_launch.hpp @@ -2,9 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_HIP_SOLVER_BATCH_BICGSTAB_LAUNCH_HIP_HPP_ -#define GKO_HIP_SOLVER_BATCH_BICGSTAB_LAUNCH_HIP_HPP_ - +#pragma once #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" @@ -17,7 +15,7 @@ namespace gko { namespace kernels { -namespace hip { +namespace GKO_DEVICE_NAMESPACE { namespace batch_bicgstab { @@ -32,24 +30,24 @@ void launch_apply_kernel( const gko::kernels::batch_bicgstab::storage_config& sconf, const settings>& settings, LogType& logger, PrecType& prec, const BatchMatrixType& mat, - const hip_type* const __restrict__ b_values, - hip_type* const __restrict__ x_values, - hip_type* const __restrict__ workspace_data, + const device_type* const __restrict__ b_values, + device_type* const __restrict__ x_values, + device_type* const __restrict__ workspace_data, const int& block_size, const size_t& shared_size); #define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _n_shared, _prec_shared, \ mat_t, log_t, pre_t, stop_t) \ - void launch_apply_kernel<_vtype, _n_shared, _prec_shared, \ - stop_t>>( \ + void launch_apply_kernel, _n_shared, _prec_shared, \ + stop_t>>( \ std::shared_ptr exec, \ const gko::kernels::batch_bicgstab::storage_config& sconf, \ - const settings>& settings, \ - log_t>>& logger, \ - pre_t>& prec, \ - const mat_t>& mat, \ - const hip_type<_vtype>* const __restrict__ b_values, \ - hip_type<_vtype>* const __restrict__ x_values, \ - hip_type<_vtype>* const __restrict__ workspace_data, \ + const settings>>& settings, \ + log_t>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const device_type<_vtype>* const __restrict__ b_values, \ + device_type<_vtype>* const __restrict__ x_values, \ + device_type<_vtype>* const __restrict__ workspace_data, \ const int& block_size, const size_t& shared_size) #define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE(_vtype) \ @@ -77,9 +75,6 @@ void launch_apply_kernel( } // namespace batch_bicgstab -} // namespace hip +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko - - -#endif diff --git a/hip/solver/batch_bicgstab_launch.instantiate.hip.cpp b/common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp similarity index 81% rename from hip/solver/batch_bicgstab_launch.instantiate.hip.cpp rename to common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp index fb26c562a94..bff6babb446 100644 --- a/hip/solver/batch_bicgstab_launch.instantiate.hip.cpp +++ b/common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp @@ -2,18 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "common/cuda_hip/solver/batch_bicgstab_launch.hpp" + #include #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_bicgstab_kernels.hpp" #include "core/solver/batch_dispatch.hpp" -#include "hip/solver/batch_bicgstab_launch.hip.hpp" namespace gko { namespace kernels { -namespace hip { +namespace GKO_DEVICE_NAMESPACE { namespace batch_bicgstab { @@ -24,15 +25,16 @@ void launch_apply_kernel( const gko::kernels::batch_bicgstab::storage_config& sconf, const settings>& settings, LogType& logger, PrecType& prec, const BatchMatrixType& mat, - const hip_type* const __restrict__ b_values, - hip_type* const __restrict__ x_values, - hip_type* const __restrict__ workspace_data, + const device_type* const __restrict__ b_values, + device_type* const __restrict__ x_values, + device_type* const __restrict__ workspace_data, const int& block_size, const size_t& shared_size) { batch_single_kernels::apply_kernel <<get_stream()>>>( - sconf, settings.max_iterations, as_hip_type(settings.residual_tol), - logger, prec, mat, b_values, x_values, workspace_data); + sconf, settings.max_iterations, + as_device_type(settings.residual_tol), logger, prec, mat, b_values, + x_values, workspace_data); } @@ -62,6 +64,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE); } // namespace batch_bicgstab -} // namespace hip +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/hip/solver/batch_cg_launch.hip.hpp b/common/cuda_hip/solver/batch_cg_launch.hpp similarity index 58% rename from hip/solver/batch_cg_launch.hip.hpp rename to common/cuda_hip/solver/batch_cg_launch.hpp index 7071c5c4065..6fa144ba35e 100644 --- a/hip/solver/batch_cg_launch.hip.hpp +++ b/common/cuda_hip/solver/batch_cg_launch.hpp @@ -2,9 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_HIP_SOLVER_BATCH_CG_LAUNCH_HPP_ -#define GKO_HIP_SOLVER_BATCH_CG_LAUNCH_HPP_ - +#pragma once #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" @@ -17,7 +15,7 @@ namespace gko { namespace kernels { -namespace hip { +namespace GKO_DEVICE_NAMESPACE { namespace batch_cg { @@ -27,29 +25,29 @@ using settings = gko::kernels::batch_cg::settings; template -void launch_apply_kernel(std::shared_ptr exec, - const gko::kernels::batch_cg::storage_config& sconf, - const settings>& settings, - LogType& logger, PrecType& prec, - const BatchMatrixType& mat, - const hip_type* const __restrict__ b_values, - hip_type* const __restrict__ x_values, - hip_type* const __restrict__ workspace_data, - const int& block_size, const size_t& shared_size); +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const device_type* const __restrict__ b_values, + device_type* const __restrict__ x_values, + device_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size); -#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \ - log_t, pre_t, stop_t) \ - void launch_apply_kernel, _n_shared, _prec_shared, \ - stop_t>>( \ - std::shared_ptr exec, \ - const gko::kernels::batch_cg::storage_config& sconf, \ - const settings>& settings, \ - log_t>>>& logger, \ - pre_t>& prec, \ - const mat_t>& mat, \ - const hip_type<_vtype>* const __restrict__ b_values, \ - hip_type<_vtype>* const __restrict__ x_values, \ - hip_type<_vtype>* const __restrict__ workspace_data, \ +#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \ + log_t, pre_t, stop_t) \ + void launch_apply_kernel, _n_shared, _prec_shared, \ + stop_t>>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_cg::storage_config& sconf, \ + const settings>& settings, \ + log_t>>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const device_type<_vtype>* const __restrict__ b_values, \ + device_type<_vtype>* const __restrict__ x_values, \ + device_type<_vtype>* const __restrict__ workspace_data, \ const int& block_size, const size_t& shared_size) #define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \ @@ -69,9 +67,6 @@ void launch_apply_kernel(std::shared_ptr exec, } // namespace batch_cg -} // namespace hip +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko - - -#endif diff --git a/hip/solver/batch_cg_launch.instantiate.hip.cpp b/common/cuda_hip/solver/batch_cg_launch.instantiate.cpp similarity index 60% rename from hip/solver/batch_cg_launch.instantiate.hip.cpp rename to common/cuda_hip/solver/batch_cg_launch.instantiate.cpp index 3605a88651d..eef120df196 100644 --- a/hip/solver/batch_cg_launch.instantiate.hip.cpp +++ b/common/cuda_hip/solver/batch_cg_launch.instantiate.cpp @@ -2,37 +2,39 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "common/cuda_hip/solver/batch_cg_launch.hpp" + #include #include "common/cuda_hip/solver/batch_cg_kernels.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_cg_kernels.hpp" #include "core/solver/batch_dispatch.hpp" -#include "hip/solver/batch_cg_launch.hip.hpp" namespace gko { namespace kernels { -namespace hip { +namespace GKO_DEVICE_NAMESPACE { namespace batch_cg { template -void launch_apply_kernel(std::shared_ptr exec, - const gko::kernels::batch_cg::storage_config& sconf, - const settings>& settings, - LogType& logger, PrecType& prec, - const BatchMatrixType& mat, - const hip_type* const __restrict__ b_values, - hip_type* const __restrict__ x_values, - hip_type* const __restrict__ workspace_data, - const int& block_size, const size_t& shared_size) +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const device_type* const __restrict__ b_values, + device_type* const __restrict__ x_values, + device_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size) { batch_single_kernels::apply_kernel <<get_stream()>>>( - sconf, settings.max_iterations, as_hip_type(settings.residual_tol), - logger, prec, mat, b_values, x_values, workspace_data); + sconf, settings.max_iterations, + as_device_type(settings.residual_tol), logger, prec, mat, b_values, + x_values, workspace_data); } @@ -54,6 +56,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE); } // namespace batch_cg -} // namespace hip +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index bfa65eee79b..7567a1adf3c 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -3,8 +3,10 @@ add_library(ginkgo_cuda $ "") include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) -add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.cu BATCH_BICGSTAB_INSTANTIATE) -add_instantiation_files(. solver/batch_cg_launch.instantiate.cu BATCH_CG_INSTANTIATE) +add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip solver/batch_bicgstab_launch.instantiate.cpp BATCH_BICGSTAB_INSTANTIATE1) +add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.cu BATCH_BICGSTAB_INSTANTIATE2) +add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip solver/batch_cg_launch.instantiate.cpp BATCH_CG_INSTANTIATE1) +add_instantiation_files(. solver/batch_cg_launch.instantiate.cu BATCH_CG_INSTANTIATE2) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_cuda @@ -23,9 +25,11 @@ target_sources(ginkgo_cuda matrix/fft_kernels.cu preconditioner/batch_jacobi_kernels.cu solver/batch_bicgstab_kernels.cu - ${BATCH_BICGSTAB_INSTANTIATE} + ${BATCH_BICGSTAB_INSTANTIATE1} + ${BATCH_BICGSTAB_INSTANTIATE2} solver/batch_cg_kernels.cu - ${BATCH_CG_INSTANTIATE} + ${BATCH_CG_INSTANTIATE1} + ${BATCH_CG_INSTANTIATE2} solver/lower_trs_kernels.cu solver/upper_trs_kernels.cu ${GKO_UNIFIED_COMMON_SOURCES} @@ -41,7 +45,8 @@ else() endif() jacobi_generated_files(GKO_CUDA_JACOBI_SOURCES "${GKO_CUDA_JACOBI_BLOCK_SIZES}") # override the default language mapping for the common files, set them to CUDA -foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES GKO_CUDA_JACOBI_SOURCES CSR_INSTANTIATE FBCSR_INSTANTIATE) +foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES GKO_CUDA_JACOBI_SOURCES + CSR_INSTANTIATE FBCSR_INSTANTIATE BATCH_BICGSTAB_INSTANTIATE1 BATCH_CG_INSTANTIATE1) set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA) endforeach(source_file) target_sources(ginkgo_cuda PRIVATE ${GKO_CUDA_JACOBI_SOURCES}) diff --git a/cuda/solver/batch_bicgstab_launch.cuh b/cuda/solver/batch_bicgstab_launch.cuh index 5106b21251e..76528c84670 100644 --- a/cuda/solver/batch_bicgstab_launch.cuh +++ b/cuda/solver/batch_bicgstab_launch.cuh @@ -10,6 +10,7 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" +#include "common/cuda_hip/solver/batch_bicgstab_launch.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_bicgstab_kernels.hpp" @@ -59,57 +60,6 @@ int get_max_dynamic_shared_memory(std::shared_ptr exec); GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_, _vtype) -template -void launch_apply_kernel( - std::shared_ptr exec, - const gko::kernels::batch_bicgstab::storage_config& sconf, - const settings>& settings, LogType& logger, - PrecType& prec, const BatchMatrixType& mat, - const ValueType* const __restrict__ b_values, - ValueType* const __restrict__ x_values, - ValueType* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size); - -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _n_shared, _prec_shared, \ - mat_t, log_t, pre_t, stop_t) \ - void launch_apply_kernel, _n_shared, _prec_shared, \ - stop_t>>( \ - std::shared_ptr exec, \ - const gko::kernels::batch_bicgstab::storage_config& sconf, \ - const settings>>& settings, \ - log_t>>& logger, \ - pre_t>& prec, \ - const mat_t>& mat, \ - const cuda_type<_vtype>* const __restrict__ b_values, \ - cuda_type<_vtype>* const __restrict__ x_values, \ - cuda_type<_vtype>* const __restrict__ workspace_data, \ - const int& block_size, const size_t& shared_size) - -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 0, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 1, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 2, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 3, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 4, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 5, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 6, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 7, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 8, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, true) - - } // namespace batch_bicgstab } // namespace cuda } // namespace kernels diff --git a/cuda/solver/batch_bicgstab_launch.instantiate.cu b/cuda/solver/batch_bicgstab_launch.instantiate.cu index ad17394c4a9..629b4f9c6ad 100644 --- a/cuda/solver/batch_bicgstab_launch.instantiate.cu +++ b/cuda/solver/batch_bicgstab_launch.instantiate.cu @@ -75,53 +75,12 @@ int get_max_dynamic_shared_memory(std::shared_ptr exec) } -template -void launch_apply_kernel( - std::shared_ptr exec, - const gko::kernels::batch_bicgstab::storage_config& sconf, - const settings>& settings, LogType& logger, - PrecType& prec, const BatchMatrixType& mat, - const ValueType* const __restrict__ b_values, - ValueType* const __restrict__ x_values, - ValueType* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) -{ - batch_single_kernels::apply_kernel - <<get_stream()>>>( - sconf, settings.max_iterations, as_cuda_type(settings.residual_tol), - logger, prec, mat, b_values, x_values, workspace_data); -} - - // begin GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK); // split GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE); // end diff --git a/cuda/solver/batch_cg_launch.cuh b/cuda/solver/batch_cg_launch.cuh index 9cb470eb51b..dafaaf19a9f 100644 --- a/cuda/solver/batch_cg_launch.cuh +++ b/cuda/solver/batch_cg_launch.cuh @@ -10,6 +10,7 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" +#include "common/cuda_hip/solver/batch_cg_launch.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_cg_kernels.hpp" @@ -59,49 +60,6 @@ int get_max_dynamic_shared_memory(std::shared_ptr exec); _vtype) -template -void launch_apply_kernel( - std::shared_ptr exec, - const gko::kernels::batch_cg::storage_config& sconf, - const settings>& settings, LogType& logger, - PrecType& prec, const BatchMatrixType& mat, - const cuda_type* const __restrict__ b_values, - cuda_type* const __restrict__ x_values, - cuda_type* const __restrict__ workspace_data, - const int& block_size, const size_t& shared_size); - -#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \ - log_t, pre_t, stop_t) \ - void launch_apply_kernel, _n_shared, _prec_shared, \ - stop_t>>( \ - std::shared_ptr exec, \ - const gko::kernels::batch_cg::storage_config& sconf, \ - const settings>& settings, \ - log_t>>>& logger, \ - pre_t>& prec, \ - const mat_t>& mat, \ - const cuda_type<_vtype>* const __restrict__ b_values, \ - cuda_type<_vtype>* const __restrict__ x_values, \ - cuda_type<_vtype>* const __restrict__ workspace_data, \ - const int& block_size, const size_t& shared_size) - -#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false) -#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false) -#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false) -#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false) -#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false) -#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false) -#define GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true) - - } // namespace batch_cg } // namespace cuda } // namespace kernels diff --git a/cuda/solver/batch_cg_launch.instantiate.cu b/cuda/solver/batch_cg_launch.instantiate.cu index 89e96e85ace..70c5cecb6f5 100644 --- a/cuda/solver/batch_cg_launch.instantiate.cu +++ b/cuda/solver/batch_cg_launch.instantiate.cu @@ -74,45 +74,12 @@ int get_max_dynamic_shared_memory(std::shared_ptr exec) } -template -void launch_apply_kernel( - std::shared_ptr exec, - const gko::kernels::batch_cg::storage_config& sconf, - const settings>& settings, LogType& logger, - PrecType& prec, const BatchMatrixType& mat, - const cuda_type* const __restrict__ b_values, - cuda_type* const __restrict__ x_values, - cuda_type* const __restrict__ workspace_data, - const int& block_size, const size_t& shared_size) -{ - batch_single_kernels::apply_kernel - <<get_stream()>>>( - sconf, settings.max_iterations, as_cuda_type(settings.residual_tol), - logger, prec, mat, b_values, x_values, workspace_data); -} - - // begin GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK); // split GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE); // end diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 4a540046322..68be287a722 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -2,8 +2,8 @@ cmake_minimum_required(VERSION 3.21) include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) -add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.hip.cpp BATCH_BICGSTAB_INSTANTIATE) -add_instantiation_files(. solver/batch_cg_launch.instantiate.hip.cpp BATCH_CG_INSTANTIATE) +add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip solver/batch_bicgstab_launch.instantiate.cpp BATCH_BICGSTAB_INSTANTIATE) +add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip solver/batch_cg_launch.instantiate.cpp BATCH_CG_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index 697bcb94551..f3e770c609d 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -9,9 +9,9 @@ #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" +#include "common/cuda_hip/solver/batch_bicgstab_launch.hpp" #include "core/base/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" -#include "hip/solver/batch_bicgstab_launch.hip.hpp" namespace gko { @@ -95,58 +95,58 @@ class kernel_caller { // Template parameters launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); } else { switch (sconf.n_shared) { case 0: - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); break; case 1: - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); break; case 2: - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); break; case 3: - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); break; case 4: - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); break; case 5: - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); break; case 6: - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); break; case 7: - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); break; case 8: - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); break; case 9: - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); break; diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index 25ebd667a7e..457dfcdefcf 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -9,9 +9,9 @@ #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_cg_kernels.hpp" +#include "common/cuda_hip/solver/batch_cg_launch.hpp" #include "core/base/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" -#include "hip/solver/batch_cg_launch.hip.hpp" namespace gko { @@ -97,38 +97,38 @@ class kernel_caller { // Template parameters launch_apply_kernel if (sconf.prec_shared) { - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); } else { switch (sconf.n_shared) { case 0: - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); break; case 1: - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); break; case 2: - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); break; case 3: - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); break; case 4: - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); break; case 5: - launch_apply_kernel( + launch_apply_kernel( exec_, sconf, settings_, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); break; @@ -153,9 +153,8 @@ void apply(std::shared_ptr exec, batch::MultiVector* const x, batch::log::detail::log_data>& logdata) { - using hip_value_type = hip_type; auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); + kernel_caller(exec, settings), settings, mat, precon); dispatcher.apply(b, x, logdata); } From 8d25e63123dcbcf80270ef187e96ae57a21411a9 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 7 Nov 2024 16:42:18 +0000 Subject: [PATCH 12/15] [batch] split batch solver (sycl) --- dpcpp/CMakeLists.txt | 6 +- dpcpp/solver/batch_bicgstab_kernels.dp.cpp | 208 ++++++------------ dpcpp/solver/batch_bicgstab_launch.hpp | 85 +++++++ .../batch_bicgstab_launch.instantiate.dp.cpp | 111 ++++++++++ dpcpp/solver/batch_cg_kernels.dp.cpp | 157 ++++--------- dpcpp/solver/batch_cg_launch.hpp | 74 +++++++ .../solver/batch_cg_launch.instantiate.dp.cpp | 110 +++++++++ 7 files changed, 503 insertions(+), 248 deletions(-) create mode 100644 dpcpp/solver/batch_bicgstab_launch.hpp create mode 100644 dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp create mode 100644 dpcpp/solver/batch_cg_launch.hpp create mode 100644 dpcpp/solver/batch_cg_launch.instantiate.dp.cpp diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 516e9307e30..fcf123a513b 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -1,11 +1,13 @@ find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}" "$ENV{MKL_ROOT}") find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}" "$ENV{DPLROOT}") -# use the parameter from cmake +# use the parameter from cmake set(GINKGO_MKL_ROOT "${MKL_DIR}" PARENT_SCOPE) set(GINKGO_DPL_ROOT "${oneDPL_DIR}" PARENT_SCOPE) include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/unified matrix/dense_kernels.instantiate.cpp DENSE_INSTANTIATE) +add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.dp.cpp BATCH_BICGSTAB_INSTANTIATE) +add_instantiation_files(. solver/batch_cg_launch.instantiate.dp.cpp BATCH_CG_INSTANTIATE) add_library(ginkgo_dpcpp $ "") target_sources(ginkgo_dpcpp PRIVATE @@ -59,7 +61,9 @@ target_sources(ginkgo_dpcpp preconditioner/sor_kernels.dp.cpp reorder/rcm_kernels.dp.cpp solver/batch_bicgstab_kernels.dp.cpp + ${BATCH_BICGSTAB_INSTANTIATE} solver/batch_cg_kernels.dp.cpp + ${BATCH_CG_INSTANTIATE} solver/cb_gmres_kernels.dp.cpp solver/idr_kernels.dp.cpp solver/lower_trs_kernels.dp.cpp diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp index 74648aecf44..2aa98c26ed1 100644 --- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp +++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp @@ -6,28 +6,14 @@ #include -#include -#include #include #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" -#include "dpcpp/base/batch_multi_vector_kernels.hpp" #include "dpcpp/base/batch_struct.hpp" -#include "dpcpp/base/config.hpp" -#include "dpcpp/base/dim3.dp.hpp" -#include "dpcpp/base/dpct.hpp" -#include "dpcpp/base/helper.hpp" -#include "dpcpp/components/cooperative_groups.dp.hpp" -#include "dpcpp/components/intrinsics.dp.hpp" -#include "dpcpp/components/reduction.dp.hpp" -#include "dpcpp/components/thread_ids.dp.hpp" -#include "dpcpp/matrix/batch_csr_kernels.hpp" -#include "dpcpp/matrix/batch_dense_kernels.hpp" -#include "dpcpp/matrix/batch_ell_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" -#include "dpcpp/solver/batch_bicgstab_kernels.hpp" +#include "dpcpp/solver/batch_bicgstab_launch.hpp" namespace gko { @@ -40,8 +26,7 @@ template using settings = gko::kernels::batch_bicgstab::settings; -__dpct_inline__ int get_group_size(int value, - int subgroup_size = config::warp_size) +int get_group_size(int value, int subgroup_size = config::warp_size) { int num_sg = ceildiv(value, subgroup_size); return num_sg * subgroup_size; @@ -56,53 +41,6 @@ class kernel_caller { : exec_{std::move(exec)}, settings_{settings} {} - template - __dpct_inline__ void launch_apply_kernel( - const gko::kernels::batch_bicgstab::storage_config& sconf, - LogType& logger, PrecType& prec, const BatchMatrixType mat, - const ValueType* const __restrict__ b_values, - ValueType* const __restrict__ x_values, - ValueType* const __restrict__ workspace, const int& group_size, - const int& shared_size) const - { - auto num_rows = mat.num_rows; - - const dim3 block(group_size); - const dim3 grid(mat.num_batch_items); - - auto max_iters = settings_.max_iterations; - auto res_tol = settings_.residual_tol; - - exec_->get_queue()->submit([&](sycl::handler& cgh) { - sycl::local_accessor slm_values( - sycl::range<1>(shared_size), cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size( - subgroup_size)]] [[intel::kernel_args_restrict]] { - auto batch_id = item_ct1.get_group_linear_id(); - const auto mat_global_entry = - gko::batch::matrix::extract_batch_item(mat, batch_id); - const ValueType* const b_global_entry = - gko::batch::multi_vector::batch_item_ptr( - b_values, 1, num_rows, batch_id); - ValueType* const x_global_entry = - gko::batch::multi_vector::batch_item_ptr( - x_values, 1, num_rows, batch_id); - batch_single_kernels::apply_kernel( - sconf, max_iters, res_tol, logger, prec, - mat_global_entry, b_global_entry, x_global_entry, - num_rows, mat.get_single_item_num_nnz(), - static_cast(slm_values.get_pointer()), - item_ct1, workspace); - }); - }); - } - template void call_kernel( @@ -152,80 +90,76 @@ class kernel_caller { ValueType* const workspace_data = workspace.get_data(); int n_shared_total = sconf.n_shared + int(sconf.prec_shared); - // TODO: split compilation - // Only instantiate when full optimizations has been enabled. Otherwise, - // just use the default one with no shared memory. - // template // launch_apply_kernel - // if (num_rows <= 32 && n_shared_total == 10) { - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, workspace_data, - // group_size, shared_size); - // } else if (num_rows <= 256 && n_shared_total == 10) { - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, workspace_data, - // group_size, shared_size); - // } else { - // switch (n_shared_total) { - // case 0: - launch_apply_kernel(sconf, logger, prec, mat, b.values, - x.values, workspace_data, - group_size, shared_size); - // break; - // case 1: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, group_size, shared_size); - // break; - // case 2: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, group_size, shared_size); - // break; - // case 3: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, group_size, shared_size); - // break; - // case 4: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, group_size, shared_size); - // break; - // case 5: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, group_size, shared_size); - // break; - // case 6: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, group_size, shared_size); - // break; - // case 7: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, group_size, shared_size); - // break; - // case 8: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, group_size, shared_size); - // break; - // case 9: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, group_size, shared_size); - // break; - // case 10: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, group_size, shared_size); - // break; - // default: - // GKO_NOT_IMPLEMENTED; - // } - // } + if (num_rows <= 32 && n_shared_total == 10) { + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, x.values, + workspace_data, group_size, shared_size); + } else if (num_rows <= 256 && n_shared_total == 10) { + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, x.values, + workspace_data, group_size, shared_size); + } else { + switch (n_shared_total) { + case 0: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + case 1: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + case 2: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + case 3: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + case 4: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + case 5: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + case 6: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + case 7: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + case 8: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + case 9: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + case 10: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + default: + GKO_NOT_IMPLEMENTED; + } + } } private: diff --git a/dpcpp/solver/batch_bicgstab_launch.hpp b/dpcpp/solver/batch_bicgstab_launch.hpp new file mode 100644 index 00000000000..e4b1917ee9b --- /dev/null +++ b/dpcpp/solver/batch_bicgstab_launch.hpp @@ -0,0 +1,85 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include + +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_bicgstab_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" +#include "dpcpp/base/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace batch_bicgstab { + + +template +using settings = gko::kernels::batch_bicgstab::settings; + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_bicgstab::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const ValueType* const __restrict__ b_values, + ValueType* const __restrict__ x_values, + ValueType* const __restrict__ workspace, const int& group_size, + const int& shared_size); + + +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _subgroup_size, _n_shared, \ + mat_t, log_t, pre_t, stop_t) \ + void \ + launch_apply_kernel<_vtype, stop_t<_vtype>, _subgroup_size, _n_shared>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_bicgstab::storage_config& sconf, \ + const settings>& settings, \ + log_t>& logger, pre_t<_vtype>& prec, \ + const mat_t& mat, \ + const _vtype* const __restrict__ b_values, \ + _vtype* const __restrict__ x_values, \ + _vtype* const __restrict__ workspace_data, const int& block_size, \ + const int& shared_size) + +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 0) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 1) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 2) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 3) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 4) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 5) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 6) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 7) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 8) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 9) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_10(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 10) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_10_16(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 16, 10) + + +} // namespace batch_bicgstab +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp b/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp new file mode 100644 index 00000000000..94c5e7462ce --- /dev/null +++ b/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp @@ -0,0 +1,111 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "dpcpp/solver/batch_bicgstab_launch.hpp" + +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_bicgstab_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" +#include "dpcpp/base/batch_multi_vector_kernels.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/solver/batch_bicgstab_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace batch_bicgstab { + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_bicgstab::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const ValueType* const __restrict__ b_values, + ValueType* const __restrict__ x_values, + ValueType* const __restrict__ workspace, const int& group_size, + const int& shared_size) +{ + auto num_rows = mat.num_rows; + + const dim3 block(group_size); + const dim3 grid(mat.num_batch_items); + + auto max_iters = settings.max_iterations; + auto res_tol = settings.residual_tol; + + exec->get_queue()->submit([&](sycl::handler& cgh) { + sycl::local_accessor slm_values( + sycl::range<1>(shared_size), cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size( + subgroup_size)]] [[intel::kernel_args_restrict]] { + auto batch_id = item_ct1.get_group_linear_id(); + const auto mat_global_entry = + gko::batch::matrix::extract_batch_item(mat, batch_id); + const ValueType* const b_global_entry = + gko::batch::multi_vector::batch_item_ptr( + b_values, 1, num_rows, batch_id); + ValueType* const x_global_entry = + gko::batch::multi_vector::batch_item_ptr( + x_values, 1, num_rows, batch_id); + batch_single_kernels::apply_kernel( + sconf, max_iters, res_tol, logger, prec, mat_global_entry, + b_global_entry, x_global_entry, num_rows, + mat.get_single_item_num_nnz(), + static_cast(slm_values.get_pointer()), item_ct1, + workspace); + }); + }); +} + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_10); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_10_16); +// end + + +} // namespace batch_bicgstab +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp index 26f5c864187..43807583754 100644 --- a/dpcpp/solver/batch_cg_kernels.dp.cpp +++ b/dpcpp/solver/batch_cg_kernels.dp.cpp @@ -6,28 +6,15 @@ #include -#include -#include #include #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" -#include "dpcpp/base/batch_multi_vector_kernels.hpp" #include "dpcpp/base/batch_struct.hpp" -#include "dpcpp/base/config.hpp" -#include "dpcpp/base/dim3.dp.hpp" -#include "dpcpp/base/dpct.hpp" -#include "dpcpp/base/helper.hpp" -#include "dpcpp/components/cooperative_groups.dp.hpp" -#include "dpcpp/components/intrinsics.dp.hpp" -#include "dpcpp/components/reduction.dp.hpp" -#include "dpcpp/components/thread_ids.dp.hpp" -#include "dpcpp/matrix/batch_csr_kernels.hpp" -#include "dpcpp/matrix/batch_dense_kernels.hpp" -#include "dpcpp/matrix/batch_ell_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" #include "dpcpp/solver/batch_cg_kernels.hpp" +#include "dpcpp/solver/batch_cg_launch.hpp" namespace gko { @@ -40,8 +27,7 @@ template using settings = gko::kernels::batch_cg::settings; -__dpct_inline__ int get_group_size(int value, - int subgroup_size = config::warp_size) +int get_group_size(int value, int subgroup_size = config::warp_size) { int num_sg = ceildiv(value, subgroup_size); return num_sg * subgroup_size; @@ -56,53 +42,6 @@ class kernel_caller { : exec_{std::move(exec)}, settings_{settings} {} - template - void launch_apply_kernel( - const gko::kernels::batch_cg::storage_config& sconf, LogType& logger, - PrecType& prec, const BatchMatrixType mat, - const ValueType* const __restrict__ b_values, - ValueType* const __restrict__ x_values, - ValueType* const __restrict__ workspace, const int& group_size, - const int& shared_size) const - { - auto num_rows = mat.num_rows; - - const dim3 block(group_size); - const dim3 grid(mat.num_batch_items); - - auto max_iters = settings_.max_iterations; - auto res_tol = settings_.residual_tol; - - exec_->get_queue()->submit([&](sycl::handler& cgh) { - sycl::local_accessor slm_values( - sycl::range<1>(shared_size), cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size( - subgroup_size)]] [[intel::kernel_args_restrict]] { - auto batch_id = item_ct1.get_group_linear_id(); - const auto mat_global_entry = - gko::batch::matrix::extract_batch_item(mat, batch_id); - const ValueType* const b_global_entry = - gko::batch::multi_vector::batch_item_ptr( - b_values, 1, num_rows, batch_id); - ValueType* const x_global_entry = - gko::batch::multi_vector::batch_item_ptr( - x_values, 1, num_rows, batch_id); - batch_single_kernels::apply_kernel( - sconf, max_iters, res_tol, logger, prec, - mat_global_entry, b_global_entry, x_global_entry, - num_rows, mat.get_single_item_num_nnz(), - static_cast(slm_values.get_pointer()), - item_ct1, workspace); - }); - }); - } - template void call_kernel( @@ -151,55 +90,53 @@ class kernel_caller { ValueType* const workspace_data = workspace.get_data(); int n_shared_total = sconf.n_shared + int(sconf.prec_shared); - // Only instantiate when full optimizations has been enabled. Otherwise, - // just use the default one with no shared memory. // template // launch_apply_kernel - // if (num_rows <= 32 && n_shared_total == 6) { - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, workspace_data, - // group_size, shared_size); - // } else { - // switch (n_shared_total) { - // case 0: - launch_apply_kernel(sconf, logger, prec, mat, b.values, - x.values, workspace_data, - group_size, shared_size); - // break; - // case 1: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, group_size, shared_size); - // break; - // case 2: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, group_size, shared_size); - // break; - // case 3: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, group_size, shared_size); - // break; - // case 4: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, group_size, shared_size); - // break; - // case 5: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, group_size, shared_size); - // break; - // case 6: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, group_size, shared_size); - // break; - // default: - // GKO_NOT_IMPLEMENTED; - // } - // } + if (num_rows <= 32 && n_shared_total == 6) { + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, x.values, + workspace_data, group_size, shared_size); + } else { + switch (n_shared_total) { + case 0: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + case 1: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + case 2: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + case 3: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + case 4: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + case 5: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + case 6: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, group_size, shared_size); + break; + default: + GKO_NOT_IMPLEMENTED; + } + } } private: diff --git a/dpcpp/solver/batch_cg_launch.hpp b/dpcpp/solver/batch_cg_launch.hpp new file mode 100644 index 00000000000..e756bf60c2e --- /dev/null +++ b/dpcpp/solver/batch_cg_launch.hpp @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_cg_kernels.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace batch_cg { + + +template +using settings = gko::kernels::batch_cg::settings; + + +template +void launch_apply_kernel(std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, + LogType& logger, PrecType& prec, + const BatchMatrixType& mat, + const ValueType* const __restrict__ b_values, + ValueType* const __restrict__ x_values, + ValueType* const __restrict__ workspace, + const int& group_size, const int& shared_size); + +#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _subgroup_size, _n_shared, mat_t, \ + log_t, pre_t, stop_t) \ + void \ + launch_apply_kernel<_vtype, stop_t<_vtype>, _subgroup_size, _n_shared>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_cg::storage_config& sconf, \ + const settings>& settings, \ + log_t>& logger, pre_t<_vtype>& prec, \ + const mat_t& mat, \ + const _vtype* const __restrict__ b_values, \ + _vtype* const __restrict__ x_values, \ + _vtype* const __restrict__ workspace_data, const int& block_size, \ + const int& shared_size) + +#define GKO_DECLARE_BATCH_CG_LAUNCH_0(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 0) +#define GKO_DECLARE_BATCH_CG_LAUNCH_1(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 1) +#define GKO_DECLARE_BATCH_CG_LAUNCH_2(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 2) +#define GKO_DECLARE_BATCH_CG_LAUNCH_3(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 3) +#define GKO_DECLARE_BATCH_CG_LAUNCH_4(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 4) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 5) +#define GKO_DECLARE_BATCH_CG_LAUNCH_6(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 6) +#define GKO_DECLARE_BATCH_CG_LAUNCH_6_16(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 16, 6) + + +} // namespace batch_cg +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp b/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp new file mode 100644 index 00000000000..a45150b0d6c --- /dev/null +++ b/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp @@ -0,0 +1,110 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "dpcpp/solver/batch_cg_launch.hpp" + +#include + +#include +#include +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_dispatch.hpp" +#include "dpcpp/base/batch_multi_vector_kernels.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_csr_kernels.hpp" +#include "dpcpp/matrix/batch_dense_kernels.hpp" +#include "dpcpp/matrix/batch_ell_kernels.hpp" +#include "dpcpp/matrix/batch_struct.hpp" +#include "dpcpp/solver/batch_cg_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace batch_cg { + + +template +void launch_apply_kernel(std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, + LogType& logger, PrecType& prec, + const BatchMatrixType& mat, + const ValueType* const __restrict__ b_values, + ValueType* const __restrict__ x_values, + ValueType* const __restrict__ workspace, + const int& group_size, const int& shared_size) +{ + auto num_rows = mat.num_rows; + + const dim3 block(group_size); + const dim3 grid(mat.num_batch_items); + + auto max_iters = settings.max_iterations; + auto res_tol = settings.residual_tol; + + exec->get_queue()->submit([&](sycl::handler& cgh) { + sycl::local_accessor slm_values( + sycl::range<1>(shared_size), cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size( + subgroup_size)]] [[intel::kernel_args_restrict]] { + auto batch_id = item_ct1.get_group_linear_id(); + const auto mat_global_entry = + gko::batch::matrix::extract_batch_item(mat, batch_id); + const ValueType* const b_global_entry = + gko::batch::multi_vector::batch_item_ptr( + b_values, 1, num_rows, batch_id); + ValueType* const x_global_entry = + gko::batch::multi_vector::batch_item_ptr( + x_values, 1, num_rows, batch_id); + batch_single_kernels::apply_kernel( + sconf, max_iters, res_tol, logger, prec, mat_global_entry, + b_global_entry, x_global_entry, num_rows, + mat.get_single_item_num_nnz(), + static_cast(slm_values.get_pointer()), item_ct1, + workspace); + }); + }); +} + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_6); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_6_16); +// end + + +} // namespace batch_cg +} // namespace dpcpp +} // namespace kernels +} // namespace gko From 1f4a5bd17b7e78207d5aa12f082d8e7efac4d987 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 12 Nov 2024 11:35:46 +0000 Subject: [PATCH 13/15] [core] add instantiation macro with variable args --- include/ginkgo/core/base/types.hpp | 48 ++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index e375da15f9c..ceffec9b275 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -442,6 +442,54 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #endif +/** + * Instantiates a template for each non-complex value type compiled by Ginkgo. + * + * @param _macro A macro which expands the template instantiation + * (not including the leading `template` specifier). + * Should take at least two arguments, of which the first one + * is the value type. + * + * @note This won't be necessary after upgrading to C++20 + */ +#if GINKGO_DPCPP_SINGLE_MODE +#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, ...) \ + template _macro(float, __VA_ARGS__); \ + template <> \ + _macro(double, __VA_ARGS__) GKO_NOT_IMPLEMENTED +#else +#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, ...) \ + template _macro(float, __VA_ARGS__); \ + template _macro(double, __VA_ARGS__) +#endif + + +/** + * Instantiates a template for each non-complex value type compiled by Ginkgo. + * + * @param _macro A macro which expands the template instantiation + * (not including the leading `template` specifier). + * Should take at least two arguments, of which the first one + * is the value type. + * + * @note This won't be necessary after upgrading to C++20 + */ +#if GINKGO_DPCPP_SINGLE_MODE +#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, ...) \ + GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, \ + __VA_ARGS__); \ + template _macro(std::complex, __VA_ARGS__); \ + template <> \ + _macro(std::complex, __VA_ARGS__) GKO_NOT_IMPLEMENTED +#else +#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, ...) \ + GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, \ + __VA_ARGS__); \ + template _macro(std::complex, __VA_ARGS__); \ + template _macro(std::complex, __VA_ARGS__) +#endif + + /** * Instantiates a template for each value and scalar type compiled by Ginkgo. * This means all value and scalar type combinations for which From fd87f051e28bd5c6f4ef6386c3598b9509992277 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 12 Nov 2024 12:02:50 +0000 Subject: [PATCH 14/15] [batch] switch order of batch dispatch and value instantiation macros --- .../cuda_hip/solver/batch_bicgstab_launch.hpp | 48 +++++++++-------- .../batch_bicgstab_launch.instantiate.cpp | 22 ++++---- common/cuda_hip/solver/batch_cg_launch.hpp | 32 +++++++----- .../solver/batch_cg_launch.instantiate.cpp | 14 ++--- core/solver/batch_dispatch.hpp | 44 +++++++++++----- cuda/CMakeLists.txt | 2 +- cuda/solver/batch_bicgstab_launch.cuh | 21 +++++--- .../batch_bicgstab_launch.instantiate.cu | 6 +-- cuda/solver/batch_cg_launch.cuh | 20 ++++--- cuda/solver/batch_cg_launch.instantiate.cu | 6 +-- dpcpp/solver/batch_bicgstab_launch.hpp | 52 ++++++++++--------- .../batch_bicgstab_launch.instantiate.dp.cpp | 24 ++++----- dpcpp/solver/batch_cg_launch.hpp | 36 +++++++------ .../solver/batch_cg_launch.instantiate.dp.cpp | 16 +++--- 14 files changed, 196 insertions(+), 147 deletions(-) diff --git a/common/cuda_hip/solver/batch_bicgstab_launch.hpp b/common/cuda_hip/solver/batch_bicgstab_launch.hpp index 3db03db0409..696e11b5899 100644 --- a/common/cuda_hip/solver/batch_bicgstab_launch.hpp +++ b/common/cuda_hip/solver/batch_bicgstab_launch.hpp @@ -50,28 +50,32 @@ void launch_apply_kernel( device_type<_vtype>* const __restrict__ workspace_data, \ const int& block_size, const size_t& shared_size) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 0, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 1, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 2, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 3, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 4, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 5, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 6, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 7, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 8, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, false) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, true) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH(...) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS( \ + GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, __VA_ARGS__) + +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_0_FALSE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 0, false) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_1_FALSE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 1, false) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_2_FALSE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 2, false) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_3_FALSE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 3, false) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_4_FALSE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 4, false) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_5_FALSE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 5, false) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_6_FALSE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 6, false) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_7_FALSE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 7, false) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_8_FALSE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 8, false) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9_FALSE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 9, false) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9_TRUE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 9, true) } // namespace batch_bicgstab diff --git a/common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp b/common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp index bff6babb446..181fd925c4c 100644 --- a/common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp +++ b/common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp @@ -39,27 +39,27 @@ void launch_apply_kernel( // begin -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_0_FALSE; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_1_FALSE; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_2_FALSE; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_3_FALSE; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_4_FALSE; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_5_FALSE; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_6_FALSE; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_7_FALSE; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_8_FALSE; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9_FALSE; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9_TRUE; // end diff --git a/common/cuda_hip/solver/batch_cg_launch.hpp b/common/cuda_hip/solver/batch_cg_launch.hpp index 6fa144ba35e..fe5d96c8a21 100644 --- a/common/cuda_hip/solver/batch_cg_launch.hpp +++ b/common/cuda_hip/solver/batch_cg_launch.hpp @@ -50,20 +50,24 @@ void launch_apply_kernel( device_type<_vtype>* const __restrict__ workspace_data, \ const int& block_size, const size_t& shared_size) -#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false) -#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false) -#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false) -#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false) -#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false) -#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false) -#define GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true) +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH(...) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(GKO_DECLARE_BATCH_CG_LAUNCH, \ + __VA_ARGS__) + +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_0_FALSE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 0, false) +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_1_FALSE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 1, false) +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_2_FALSE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 2, false) +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_3_FALSE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 3, false) +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_4_FALSE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 4, false) +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_5_FALSE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 5, false) +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_5_TRUE \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 5, true) } // namespace batch_cg diff --git a/common/cuda_hip/solver/batch_cg_launch.instantiate.cpp b/common/cuda_hip/solver/batch_cg_launch.instantiate.cpp index eef120df196..bedc0bab9a5 100644 --- a/common/cuda_hip/solver/batch_cg_launch.instantiate.cpp +++ b/common/cuda_hip/solver/batch_cg_launch.instantiate.cpp @@ -39,19 +39,19 @@ void launch_apply_kernel( // begin -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE); +GKO_INSTANTIATE_BATCH_CG_LAUNCH_0_FALSE; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE); +GKO_INSTANTIATE_BATCH_CG_LAUNCH_1_FALSE; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE); +GKO_INSTANTIATE_BATCH_CG_LAUNCH_2_FALSE; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE); +GKO_INSTANTIATE_BATCH_CG_LAUNCH_3_FALSE; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE); +GKO_INSTANTIATE_BATCH_CG_LAUNCH_4_FALSE; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE); +GKO_INSTANTIATE_BATCH_CG_LAUNCH_5_FALSE; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE); +GKO_INSTANTIATE_BATCH_CG_LAUNCH_5_TRUE; // end diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp index 3e3fd01a03c..0580325d67b 100644 --- a/core/solver/batch_dispatch.hpp +++ b/core/solver/batch_dispatch.hpp @@ -171,7 +171,7 @@ enum class log_type { simple_convergence_completion }; GKO_INDIRECT( \ macro(__VA_ARGS__, \ ::gko::batch::solver::device::batch_stop::SimpleAbsResidual)); \ - template GKO_INDIRECT( \ + GKO_INDIRECT( \ macro(__VA_ARGS__, \ ::gko::batch::solver::device::batch_stop::SimpleRelResidual)) @@ -179,10 +179,10 @@ enum class log_type { simple_convergence_completion }; GKO_BATCH_INSTANTIATE_STOP( \ macro, __VA_ARGS__, \ ::gko::batch::solver::device::batch_preconditioner::Identity); \ - template GKO_BATCH_INSTANTIATE_STOP( \ + GKO_BATCH_INSTANTIATE_STOP( \ macro, __VA_ARGS__, \ ::gko::batch::solver::device::batch_preconditioner::ScalarJacobi); \ - template GKO_BATCH_INSTANTIATE_STOP( \ + GKO_BATCH_INSTANTIATE_STOP( \ macro, __VA_ARGS__, \ ::gko::batch::solver::device::batch_preconditioner::BlockJacobi) @@ -191,16 +191,36 @@ enum class log_type { simple_convergence_completion }; macro, __VA_ARGS__, \ ::gko::batch::solver::device::batch_log::SimpleFinalLogger) -#define GKO_BATCH_INSTANTIATE_MATRIX(macro, ...) \ - GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__, \ - batch::matrix::ell::uniform_batch); \ - template GKO_BATCH_INSTANTIATE_LOGGER( \ - macro, __VA_ARGS__, batch::matrix::dense::uniform_batch); \ - template GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__, \ - batch::matrix::csr::uniform_batch) +#define GKO_BATCH_INSTANTIATE_MATRIX_VARGS(macro, ...) \ + GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__, \ + batch::matrix::ell::uniform_batch); \ + GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__, \ + batch::matrix::dense::uniform_batch); \ + GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__, \ + batch::matrix::csr::uniform_batch) + +/** + * Passes each valid configuration of batch solver template parameter to a + * macro. The order of template parameters is: macro(..., , , + * , ) Any additional macro parameter passed to + * GKO_BATCH_INSTANTIATE will be prepended to the batch solver template + * parameters. + */ +#define GKO_BATCH_INSTANTIATE_VARGS(macro, ...) \ + GKO_BATCH_INSTANTIATE_MATRIX_VARGS(macro, __VA_ARGS__) + + +/** + * Passes each valid configuration of batch solver template parameter to a + * macro. The order of template parameters is: macro(, , + * , ) + */ +#define GKO_BATCH_INSTANTIATE_MATRIX(macro, ...) \ + GKO_BATCH_INSTANTIATE_LOGGER(macro, batch::matrix::ell::uniform_batch); \ + GKO_BATCH_INSTANTIATE_LOGGER(macro, batch::matrix::dense::uniform_batch); \ + GKO_BATCH_INSTANTIATE_LOGGER(macro, batch::matrix::csr::uniform_batch) -#define GKO_BATCH_INSTANTIATE(macro, ...) \ - GKO_BATCH_INSTANTIATE_MATRIX(macro, __VA_ARGS__) +#define GKO_BATCH_INSTANTIATE(macro) GKO_BATCH_INSTANTIATE_MATRIX(macro) /** diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 7567a1adf3c..5316c4c623c 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -46,7 +46,7 @@ endif() jacobi_generated_files(GKO_CUDA_JACOBI_SOURCES "${GKO_CUDA_JACOBI_BLOCK_SIZES}") # override the default language mapping for the common files, set them to CUDA foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES GKO_CUDA_JACOBI_SOURCES - CSR_INSTANTIATE FBCSR_INSTANTIATE BATCH_BICGSTAB_INSTANTIATE1 BATCH_CG_INSTANTIATE1) + CSR_INSTANTIATE FBCSR_INSTANTIATE BATCH_BICGSTAB_INSTANTIATE1 BATCH_BICGSTAB_INSTANTIATE2 BATCH_CG_INSTANTIATE1 BATCH_CG_INSTANTIATE2) set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA) endforeach(source_file) target_sources(ginkgo_cuda PRIVATE ${GKO_CUDA_JACOBI_SOURCES}) diff --git a/cuda/solver/batch_bicgstab_launch.cuh b/cuda/solver/batch_bicgstab_launch.cuh index 76528c84670..737f2a923b0 100644 --- a/cuda/solver/batch_bicgstab_launch.cuh +++ b/cuda/solver/batch_bicgstab_launch.cuh @@ -39,9 +39,13 @@ int get_num_threads_per_block(std::shared_ptr exec, cuda_type<_vtype>>(std::shared_ptr exec, \ const int num_rows) -#define GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK(_vtype) \ - GKO_BATCH_INSTANTIATE( \ - GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_, _vtype) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_(...) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS( \ + GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_, __VA_ARGS__) + +#define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK \ + GKO_BATCH_INSTANTIATE( \ + GKO_INSTANTIATE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_) template exec); log_t>, mat_t>, \ cuda_type<_vtype>>(std::shared_ptr exec) -#define GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY(_vtype) \ - GKO_BATCH_INSTANTIATE( \ - GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_, _vtype) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_(...) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS( \ + GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_, \ + __VA_ARGS__) + +#define GKO_INSTANTIATE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY \ + GKO_BATCH_INSTANTIATE( \ + GKO_INSTANTIATE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_) } // namespace batch_bicgstab diff --git a/cuda/solver/batch_bicgstab_launch.instantiate.cu b/cuda/solver/batch_bicgstab_launch.instantiate.cu index 629b4f9c6ad..0b2e6e15cdf 100644 --- a/cuda/solver/batch_bicgstab_launch.instantiate.cu +++ b/cuda/solver/batch_bicgstab_launch.instantiate.cu @@ -76,11 +76,9 @@ int get_max_dynamic_shared_memory(std::shared_ptr exec) // begin -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK); +GKO_INSTANTIATE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY); +GKO_INSTANTIATE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY; // end diff --git a/cuda/solver/batch_cg_launch.cuh b/cuda/solver/batch_cg_launch.cuh index dafaaf19a9f..e803e15fe80 100644 --- a/cuda/solver/batch_cg_launch.cuh +++ b/cuda/solver/batch_cg_launch.cuh @@ -39,9 +39,12 @@ int get_num_threads_per_block(std::shared_ptr exec, mat_t>, cuda_type<_vtype>>( \ std::shared_ptr exec, const int num_rows) -#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_, \ - _vtype) +#define GKO_INSTANTIATE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_(...) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS( \ + GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_, __VA_ARGS__) + +#define GKO_INSTANTIATE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK \ + GKO_BATCH_INSTANTIATE(GKO_INSTANTIATE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_) template exec); log_t>, mat_t>, \ cuda_type<_vtype>>(std::shared_ptr exec) -#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_, \ - _vtype) + +#define GKO_INSTANTIATE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_(...) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS( \ + GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_, __VA_ARGS__) + +#define GKO_INSTANTIATE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY \ + GKO_BATCH_INSTANTIATE( \ + GKO_INSTANTIATE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_) } // namespace batch_cg diff --git a/cuda/solver/batch_cg_launch.instantiate.cu b/cuda/solver/batch_cg_launch.instantiate.cu index 70c5cecb6f5..087746e6146 100644 --- a/cuda/solver/batch_cg_launch.instantiate.cu +++ b/cuda/solver/batch_cg_launch.instantiate.cu @@ -75,11 +75,9 @@ int get_max_dynamic_shared_memory(std::shared_ptr exec) // begin -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK); +GKO_INSTANTIATE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY); +GKO_INSTANTIATE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY; // end diff --git a/dpcpp/solver/batch_bicgstab_launch.hpp b/dpcpp/solver/batch_bicgstab_launch.hpp index e4b1917ee9b..06ba8531b42 100644 --- a/dpcpp/solver/batch_bicgstab_launch.hpp +++ b/dpcpp/solver/batch_bicgstab_launch.hpp @@ -53,30 +53,34 @@ void launch_apply_kernel( _vtype* const __restrict__ workspace_data, const int& block_size, \ const int& shared_size) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 0) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 1) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 2) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 3) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 4) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 5) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 6) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 7) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 8) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 9) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_10(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 32, 10) -#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_10_16(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 16, 10) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH(...) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS( \ + GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, __VA_ARGS__) + +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_0 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 0) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_1 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 1) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_2 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 2) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_3 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 3) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_4 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 4) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_5 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 5) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_6 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 6) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_7 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 7) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_8 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 8) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 9) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_10 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 32, 10) +#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_10_16 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 16, 10) } // namespace batch_bicgstab diff --git a/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp b/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp index 94c5e7462ce..b45d6409575 100644 --- a/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp +++ b/dpcpp/solver/batch_bicgstab_launch.instantiate.dp.cpp @@ -79,29 +79,29 @@ void launch_apply_kernel( // begin -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_0; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_1; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_2; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_3; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_4; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_5; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_6; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_7; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_8; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_10); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_10; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_10_16); +GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_10_16; // end diff --git a/dpcpp/solver/batch_cg_launch.hpp b/dpcpp/solver/batch_cg_launch.hpp index e756bf60c2e..3fe1e704963 100644 --- a/dpcpp/solver/batch_cg_launch.hpp +++ b/dpcpp/solver/batch_cg_launch.hpp @@ -50,22 +50,26 @@ void launch_apply_kernel(std::shared_ptr exec, _vtype* const __restrict__ workspace_data, const int& block_size, \ const int& shared_size) -#define GKO_DECLARE_BATCH_CG_LAUNCH_0(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 0) -#define GKO_DECLARE_BATCH_CG_LAUNCH_1(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 1) -#define GKO_DECLARE_BATCH_CG_LAUNCH_2(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 2) -#define GKO_DECLARE_BATCH_CG_LAUNCH_3(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 3) -#define GKO_DECLARE_BATCH_CG_LAUNCH_4(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 4) -#define GKO_DECLARE_BATCH_CG_LAUNCH_5(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 5) -#define GKO_DECLARE_BATCH_CG_LAUNCH_6(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 32, 6) -#define GKO_DECLARE_BATCH_CG_LAUNCH_6_16(_vtype) \ - GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 16, 6) +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH(...) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(GKO_DECLARE_BATCH_CG_LAUNCH, \ + __VA_ARGS__) + +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_0 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 0) +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_1 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 1) +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_2 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 2) +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_3 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 3) +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_4 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 4) +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_5 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 5) +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_6 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 32, 6) +#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_6_16 \ + GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 16, 6) } // namespace batch_cg diff --git a/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp b/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp index a45150b0d6c..ba887c8aeb5 100644 --- a/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp +++ b/dpcpp/solver/batch_cg_launch.instantiate.dp.cpp @@ -86,21 +86,21 @@ void launch_apply_kernel(std::shared_ptr exec, // begin -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0); +GKO_INSTANTIATE_BATCH_CG_LAUNCH_0; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1); +GKO_INSTANTIATE_BATCH_CG_LAUNCH_1; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2); +GKO_INSTANTIATE_BATCH_CG_LAUNCH_2; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3); +GKO_INSTANTIATE_BATCH_CG_LAUNCH_3; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4); +GKO_INSTANTIATE_BATCH_CG_LAUNCH_4; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5); +GKO_INSTANTIATE_BATCH_CG_LAUNCH_5; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_6); +GKO_INSTANTIATE_BATCH_CG_LAUNCH_6; // split -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_6_16); +GKO_INSTANTIATE_BATCH_CG_LAUNCH_6_16; // end From bdf51dc9caf1e9125c829d6a2f64cde7aff05e0b Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 12 Nov 2024 15:17:36 +0000 Subject: [PATCH 15/15] [batch] fix windows build --- core/solver/batch_dispatch.hpp | 15 +++++---------- include/ginkgo/core/base/types.hpp | 22 ++++++++++++++-------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp index 0580325d67b..5a37b12cf11 100644 --- a/core/solver/batch_dispatch.hpp +++ b/core/solver/batch_dispatch.hpp @@ -164,16 +164,11 @@ enum class log_type { simple_convergence_completion }; } // namespace log -#define GKO_INDIRECT(...) __VA_ARGS__ - - -#define GKO_BATCH_INSTANTIATE_STOP(macro, ...) \ - GKO_INDIRECT( \ - macro(__VA_ARGS__, \ - ::gko::batch::solver::device::batch_stop::SimpleAbsResidual)); \ - GKO_INDIRECT( \ - macro(__VA_ARGS__, \ - ::gko::batch::solver::device::batch_stop::SimpleRelResidual)) +#define GKO_BATCH_INSTANTIATE_STOP(macro, ...) \ + macro(__VA_ARGS__, \ + ::gko::batch::solver::device::batch_stop::SimpleAbsResidual); \ + macro(__VA_ARGS__, \ + ::gko::batch::solver::device::batch_stop::SimpleRelResidual) #define GKO_BATCH_INSTANTIATE_PRECONDITIONER(macro, ...) \ GKO_BATCH_INSTANTIATE_STOP( \ diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index ceffec9b275..4b06b494707 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -442,6 +442,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #endif +// Helper macro to make Windows builds work +#define GKO_INDIRECT(...) __VA_ARGS__ + + /** * Instantiates a template for each non-complex value type compiled by Ginkgo. * @@ -454,13 +458,14 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, ...) \ - template _macro(float, __VA_ARGS__); \ + template GKO_INDIRECT(_macro(float, __VA_ARGS__)); \ template <> \ - _macro(double, __VA_ARGS__) GKO_NOT_IMPLEMENTED + GKO_INDIRECT(_macro(double, __VA_ARGS__)) \ + GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, ...) \ - template _macro(float, __VA_ARGS__); \ - template _macro(double, __VA_ARGS__) + template GKO_INDIRECT(_macro(float, __VA_ARGS__)); \ + template GKO_INDIRECT(_macro(double, __VA_ARGS__)) #endif @@ -478,15 +483,16 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, ...) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, \ __VA_ARGS__); \ - template _macro(std::complex, __VA_ARGS__); \ + template GKO_INDIRECT(_macro(std::complex, __VA_ARGS__)); \ template <> \ - _macro(std::complex, __VA_ARGS__) GKO_NOT_IMPLEMENTED + GKO_INDIRECT(_macro(std::complex, __VA_ARGS__)) \ + GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(_macro, ...) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_VARGS(_macro, \ __VA_ARGS__); \ - template _macro(std::complex, __VA_ARGS__); \ - template _macro(std::complex, __VA_ARGS__) + template GKO_INDIRECT(_macro(std::complex, __VA_ARGS__)); \ + template GKO_INDIRECT(_macro(std::complex, __VA_ARGS__)) #endif