Skip to content

Commit

Permalink
Merge Split batched solver compilation
Browse files Browse the repository at this point in the history
This PR splits up the compilation of the batched solvers in order to reduce the compilation times. It splits up the instantiations of the kernel launches depending on the number of vectors in shared memory. This is based on the same CMake mechanism as for the csr and fbcsr kernels.

Related PR: #1629
  • Loading branch information
MarcelKoch authored Nov 19, 2024
2 parents 8304552 + bdf51dc commit 53bbc1d
Show file tree
Hide file tree
Showing 26 changed files with 1,582 additions and 786 deletions.
21 changes: 15 additions & 6 deletions common/cuda_hip/solver/batch_bicgstab_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#ifndef GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
#define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_

#include "core/solver/batch_bicgstab_kernels.hpp"

#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/types.hpp>
Expand All @@ -25,6 +27,11 @@
namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {


constexpr int max_bicgstab_threads = 1024;


namespace batch_single_kernels {


Expand Down Expand Up @@ -168,12 +175,14 @@ __device__ __forceinline__ void update_x_middle(
template <typename StopType, int n_shared, bool prec_shared_bool,
typename PrecType, typename LogType, typename BatchMatrixType,
typename ValueType>
__global__ void apply_kernel(
const gko::kernels::batch_bicgstab::storage_config sconf,
const int max_iter, const gko::remove_complex<ValueType> tol,
LogType logger, PrecType prec_shared, const BatchMatrixType mat,
const ValueType* const __restrict__ b, ValueType* const __restrict__ x,
ValueType* const __restrict__ workspace = nullptr)
__global__ void __launch_bounds__(max_bicgstab_threads)
apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
const int max_iter, const gko::remove_complex<ValueType> tol,
LogType logger, PrecType prec_shared,
const BatchMatrixType mat,
const ValueType* const __restrict__ b,
ValueType* const __restrict__ x,
ValueType* const __restrict__ workspace = nullptr)
{
using real_type = typename gko::remove_complex<ValueType>;
const auto num_batch_items = mat.num_batch_items;
Expand Down
84 changes: 84 additions & 0 deletions common/cuda_hip/solver/batch_bicgstab_launch.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include "common/cuda_hip/base/batch_struct.hpp"
#include "common/cuda_hip/base/config.hpp"
#include "common/cuda_hip/base/types.hpp"
#include "common/cuda_hip/matrix/batch_struct.hpp"
#include "core/base/batch_struct.hpp"
#include "core/matrix/batch_struct.hpp"
#include "core/solver/batch_bicgstab_kernels.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace batch_bicgstab {


template <typename T>
using settings = gko::kernels::batch_bicgstab::settings<T>;


template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
typename PrecType, typename LogType, typename BatchMatrixType>
void launch_apply_kernel(
std::shared_ptr<const DefaultExecutor> exec,
const gko::kernels::batch_bicgstab::storage_config& sconf,
const settings<remove_complex<ValueType>>& settings, LogType& logger,
PrecType& prec, const BatchMatrixType& mat,
const device_type<ValueType>* const __restrict__ b_values,
device_type<ValueType>* const __restrict__ x_values,
device_type<ValueType>* const __restrict__ workspace_data,
const int& block_size, const size_t& shared_size);

#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _n_shared, _prec_shared, \
mat_t, log_t, pre_t, stop_t) \
void launch_apply_kernel<device_type<_vtype>, _n_shared, _prec_shared, \
stop_t<device_type<_vtype>>>( \
std::shared_ptr<const DefaultExecutor> exec, \
const gko::kernels::batch_bicgstab::storage_config& sconf, \
const settings<remove_complex<device_type<_vtype>>>& settings, \
log_t<gko::remove_complex<device_type<_vtype>>>& logger, \
pre_t<device_type<_vtype>>& prec, \
const mat_t<const device_type<_vtype>>& mat, \
const device_type<_vtype>* const __restrict__ b_values, \
device_type<_vtype>* const __restrict__ x_values, \
device_type<_vtype>* const __restrict__ workspace_data, \
const int& block_size, const size_t& shared_size)

#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH(...) \
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS( \
GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, __VA_ARGS__)

#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_0_FALSE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 0, false)
#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_1_FALSE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 1, false)
#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_2_FALSE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 2, false)
#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_3_FALSE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 3, false)
#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_4_FALSE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 4, false)
#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_5_FALSE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 5, false)
#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_6_FALSE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 6, false)
#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_7_FALSE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 7, false)
#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_8_FALSE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 8, false)
#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9_FALSE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 9, false)
#define GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9_TRUE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH, 9, true)


} // namespace batch_bicgstab
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
69 changes: 69 additions & 0 deletions common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#include "common/cuda_hip/solver/batch_bicgstab_launch.hpp"

#include <ginkgo/core/base/exception_helpers.hpp>

#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
#include "core/matrix/batch_struct.hpp"
#include "core/solver/batch_bicgstab_kernels.hpp"
#include "core/solver/batch_dispatch.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace batch_bicgstab {


template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
typename PrecType, typename LogType, typename BatchMatrixType>
void launch_apply_kernel(
std::shared_ptr<const DefaultExecutor> exec,
const gko::kernels::batch_bicgstab::storage_config& sconf,
const settings<remove_complex<ValueType>>& settings, LogType& logger,
PrecType& prec, const BatchMatrixType& mat,
const device_type<ValueType>* const __restrict__ b_values,
device_type<ValueType>* const __restrict__ x_values,
device_type<ValueType>* const __restrict__ workspace_data,
const int& block_size, const size_t& shared_size)
{
batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared>
<<<mat.num_batch_items, block_size, shared_size, exec->get_stream()>>>(
sconf, settings.max_iterations,
as_device_type(settings.residual_tol), logger, prec, mat, b_values,
x_values, workspace_data);
}


// begin
GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_0_FALSE;
// split
GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_1_FALSE;
// split
GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_2_FALSE;
// split
GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_3_FALSE;
// split
GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_4_FALSE;
// split
GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_5_FALSE;
// split
GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_6_FALSE;
// split
GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_7_FALSE;
// split
GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_8_FALSE;
// split
GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9_FALSE;
// split
GKO_INSTANTIATE_BATCH_BICGSTAB_LAUNCH_9_TRUE;
// end


} // namespace batch_bicgstab
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
23 changes: 15 additions & 8 deletions common/cuda_hip/solver/batch_cg_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_CG_KERNELS_HPP_


#include "core/solver/batch_cg_kernels.hpp"

#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/types.hpp>
Expand All @@ -27,6 +29,11 @@
namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {


constexpr int max_cg_threads = 1024;


namespace batch_single_kernels {


Expand Down Expand Up @@ -113,14 +120,14 @@ __device__ __forceinline__ void update_x_and_r(
template <typename StopType, const int n_shared, const bool prec_shared_bool,
typename PrecType, typename LogType, typename BatchMatrixType,
typename ValueType>
__global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
const int max_iter,
const gko::remove_complex<ValueType> tol,
LogType logger, PrecType prec_shared,
const BatchMatrixType mat,
const ValueType* const __restrict__ b,
ValueType* const __restrict__ x,
ValueType* const __restrict__ workspace = nullptr)
__global__ void __launch_bounds__(max_cg_threads)
apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
const int max_iter, const gko::remove_complex<ValueType> tol,
LogType logger, PrecType prec_shared,
const BatchMatrixType mat,
const ValueType* const __restrict__ b,
ValueType* const __restrict__ x,
ValueType* const __restrict__ workspace = nullptr)
{
using real_type = typename gko::remove_complex<ValueType>;
const auto num_batch_items = mat.num_batch_items;
Expand Down
76 changes: 76 additions & 0 deletions common/cuda_hip/solver/batch_cg_launch.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include "common/cuda_hip/base/batch_struct.hpp"
#include "common/cuda_hip/base/config.hpp"
#include "common/cuda_hip/base/types.hpp"
#include "common/cuda_hip/matrix/batch_struct.hpp"
#include "core/base/batch_struct.hpp"
#include "core/matrix/batch_struct.hpp"
#include "core/solver/batch_cg_kernels.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace batch_cg {


template <typename T>
using settings = gko::kernels::batch_cg::settings<T>;


template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
typename PrecType, typename LogType, typename BatchMatrixType>
void launch_apply_kernel(
std::shared_ptr<const DefaultExecutor> exec,
const gko::kernels::batch_cg::storage_config& sconf,
const settings<remove_complex<ValueType>>& settings, LogType& logger,
PrecType& prec, const BatchMatrixType& mat,
const device_type<ValueType>* const __restrict__ b_values,
device_type<ValueType>* const __restrict__ x_values,
device_type<ValueType>* const __restrict__ workspace_data,
const int& block_size, const size_t& shared_size);

#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \
log_t, pre_t, stop_t) \
void launch_apply_kernel<device_type<_vtype>, _n_shared, _prec_shared, \
stop_t<device_type<_vtype>>>( \
std::shared_ptr<const DefaultExecutor> exec, \
const gko::kernels::batch_cg::storage_config& sconf, \
const settings<remove_complex<_vtype>>& settings, \
log_t<device_type<gko::remove_complex<device_type<_vtype>>>>& logger, \
pre_t<device_type<_vtype>>& prec, \
const mat_t<const device_type<_vtype>>& mat, \
const device_type<_vtype>* const __restrict__ b_values, \
device_type<_vtype>* const __restrict__ x_values, \
device_type<_vtype>* const __restrict__ workspace_data, \
const int& block_size, const size_t& shared_size)

#define GKO_INSTANTIATE_BATCH_CG_LAUNCH(...) \
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_VARGS(GKO_DECLARE_BATCH_CG_LAUNCH, \
__VA_ARGS__)

#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_0_FALSE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 0, false)
#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_1_FALSE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 1, false)
#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_2_FALSE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 2, false)
#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_3_FALSE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 3, false)
#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_4_FALSE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 4, false)
#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_5_FALSE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 5, false)
#define GKO_INSTANTIATE_BATCH_CG_LAUNCH_5_TRUE \
GKO_BATCH_INSTANTIATE_VARGS(GKO_INSTANTIATE_BATCH_CG_LAUNCH, 5, true)


} // namespace batch_cg
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
61 changes: 61 additions & 0 deletions common/cuda_hip/solver/batch_cg_launch.instantiate.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#include "common/cuda_hip/solver/batch_cg_launch.hpp"

#include <ginkgo/core/base/exception_helpers.hpp>

#include "common/cuda_hip/solver/batch_cg_kernels.hpp"
#include "core/matrix/batch_struct.hpp"
#include "core/solver/batch_cg_kernels.hpp"
#include "core/solver/batch_dispatch.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace batch_cg {


template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
typename PrecType, typename LogType, typename BatchMatrixType>
void launch_apply_kernel(
std::shared_ptr<const DefaultExecutor> exec,
const gko::kernels::batch_cg::storage_config& sconf,
const settings<remove_complex<ValueType>>& settings, LogType& logger,
PrecType& prec, const BatchMatrixType& mat,
const device_type<ValueType>* const __restrict__ b_values,
device_type<ValueType>* const __restrict__ x_values,
device_type<ValueType>* const __restrict__ workspace_data,
const int& block_size, const size_t& shared_size)
{
batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared>
<<<mat.num_batch_items, block_size, shared_size, exec->get_stream()>>>(
sconf, settings.max_iterations,
as_device_type(settings.residual_tol), logger, prec, mat, b_values,
x_values, workspace_data);
}


// begin
GKO_INSTANTIATE_BATCH_CG_LAUNCH_0_FALSE;
// split
GKO_INSTANTIATE_BATCH_CG_LAUNCH_1_FALSE;
// split
GKO_INSTANTIATE_BATCH_CG_LAUNCH_2_FALSE;
// split
GKO_INSTANTIATE_BATCH_CG_LAUNCH_3_FALSE;
// split
GKO_INSTANTIATE_BATCH_CG_LAUNCH_4_FALSE;
// split
GKO_INSTANTIATE_BATCH_CG_LAUNCH_5_FALSE;
// split
GKO_INSTANTIATE_BATCH_CG_LAUNCH_5_TRUE;
// end


} // namespace batch_cg
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
Loading

0 comments on commit 53bbc1d

Please sign in to comment.