diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index f296fb9da86..01880127641 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -258,6 +258,9 @@ void CudaExecutor::set_gpu_property() kernels::cuda::config::warp_size; this->get_exec_info().max_subgroup_size = kernels::cuda::config::warp_size; + GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceGetAttribute( + &this->get_exec_info().max_shared_memory_per_workgroup, + cudaDevAttrMaxSharedMemoryPerBlock, this->get_device_id())); } } diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index db92543fd74..07e16535631 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -85,7 +85,7 @@ namespace batch_bicgstab { template <typename StopType, typename PrecType, typename LogType, typename BatchMatrixType, typename ValueType> -int get_num_threads_per_block(std::shared_ptr<const CudaExecutor> exec, +int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec, const int num_rows) { int nwarps = num_rows / 4; @@ -117,7 +117,7 @@ int get_num_threads_per_block(std::shared_ptr<const CudaExecutor> exec, template <typename StopType, typename PrecType, typename LogType, typename BatchMatrixType, typename ValueType> -int get_max_dynamic_shared_memory(std::shared_ptr<const CudaExecutor> exec, +int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec, const size_type required_cache_storage) { int shmem_per_sm = 0; @@ -178,7 +178,7 @@ public: { using real_type = gko::remove_complex<value_type>; const size_type num_batch_items = mat.num_batch_items; - constexpr int align_multiple = 2; + constexpr int align_multiple = 8; const int shared_gap = ((mat.num_rows + align_multiple - 1) / align_multiple) * align_multiple; diff --git a/hip/base/exception.hip.hpp b/hip/base/exception.hip.hpp new file mode 100644 index 00000000000..7c3b3b2e12e --- /dev/null +++ b/hip/base/exception.hip.hpp @@ -0,0 +1,56 @@ +/*******************************<GINKGO LICENSE>****************************** +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +******************************<GINKGO LICENSE>*******************************/ + +#ifndef GKO_HIP_BASE_EXCEPTION_HIP_HPP_ +#define GKO_HIP_BASE_EXCEPTION_HIP_HPP_ + + +#include <ginkgo/core/base/exception.hpp> + + +namespace gko { + + +#define GKO_HIP_LAST_IF_ERROR_THROW \ + hipError_t err = hipGetLastError(); \ + if (err != hipSuccess) { \ + printf(" Hip kernel error: %s\n", hipGetErrorString(err)); \ + throw gko::HipError(__FILE__, __LINE__, __func__, err); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + + +} // namespace gko + +#endif // GKO_HIP_BASE_EXCEPTION_HIP_HPP_ diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index 8d175c0e424..489e9b28ff9 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -262,6 +262,9 @@ void HipExecutor::set_gpu_property() #endif // GINKGO_HIP_PLATFORM_NVCC this->get_exec_info().max_subgroup_size = kernels::hip::config::warp_size; + GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute( + &this->get_exec_info().max_shared_memory_per_workgroup, + hipDeviceAttributeMaxSharedMemoryPerBlock, this->get_device_id())); } } diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index 4ef8cd36c1b..b9fe8b0c9c3 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -34,21 +34,38 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include <hip/hip_runtime.h> +#include <thrust/functional.h> +#include <thrust/transform.h> #include <ginkgo/core/base/exception_helpers.hpp> #include <ginkgo/core/base/math.hpp> +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "hip/base/batch_struct.hip.hpp" #include "hip/base/config.hip.hpp" +#include "hip/base/exception.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/thrust.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" namespace gko { namespace kernels { namespace hip { + + +constexpr int default_block_size = 256; +constexpr int sm_oversubscription = 4; + /** * @brief The batch Bicgstab solver namespace. * @@ -57,19 +74,190 @@ namespace hip { namespace batch_bicgstab { +#include "common/cuda_hip/components/uninitialized_array.hpp.inc" + +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" +#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc" + + +template <typename BatchMatrixType> +int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec, + const int num_rows) +{ + int nwarps = num_rows / 4; + if (nwarps < 2) { + nwarps = 2; + } + const int min_block_size = 2 * config::warp_size; + const int device_max_threads = + ((std::max(num_rows, min_block_size)) / config::warp_size) * + config::warp_size; + const int num_regs_used_per_thread = 64; + int max_regs_blk = 0; + hipDeviceGetAttribute(&max_regs_blk, hipDeviceAttributeMaxRegistersPerBlock, + exec->get_device_id()); + const int max_threads_regs = (max_regs_blk / num_regs_used_per_thread); + const int max_threads = std::min(max_threads_regs, device_max_threads); + return std::min(nwarps * static_cast<int>(config::warp_size), max_threads); +} + + template <typename T> using settings = gko::kernels::batch_bicgstab::settings<T>; +template <typename HipValueType> +class KernelCaller { +public: + using value_type = HipValueType; + + KernelCaller(std::shared_ptr<const DefaultExecutor> exec, + const settings<remove_complex<value_type>> settings) + : exec_{exec}, settings_{settings} + {} + + template <typename StopType, const int n_shared, + const bool prec_shared_bool, typename PrecType, typename LogType, + typename BatchMatrixType> + void launch_apply_kernel( + const gko::kernels::batch_bicgstab::storage_config& sconf, + LogType& logger, PrecType& prec, const BatchMatrixType& mat, + const value_type* const __restrict__ b_values, + value_type* const __restrict__ x_values, + value_type* const __restrict__ workspace_data, const int& block_size, + const size_t& shared_size) const + { + apply_kernel<StopType, n_shared, prec_shared_bool> + <<<mat.num_batch_items, block_size, shared_size, + exec_->get_stream()>>>(sconf, settings_.max_iterations, + settings_.residual_tol, logger, prec, mat, + b_values, x_values, workspace_data); + } + + + template <typename BatchMatrixType, typename PrecType, typename StopType, + typename LogType> + void call_kernel( + LogType logger, const BatchMatrixType& mat, PrecType prec, + const gko::batch::multi_vector::uniform_batch<const value_type>& b, + const gko::batch::multi_vector::uniform_batch<value_type>& x) const + { + using real_type = gko::remove_complex<value_type>; + const size_type num_batch_items = mat.num_batch_items; + constexpr int align_multiple = 8; + const int shared_gap = + ((mat.num_rows + align_multiple - 1) / align_multiple) * + align_multiple; + const int shmem_per_blk = exec_->get_max_shared_memory_per_block(); + const int block_size = + get_num_threads_per_block<BatchMatrixType>(exec_, mat.num_rows); + assert(block_size >= 2 * config::warp_size); + + const size_t prec_size = + PrecType::dynamic_work_size(shared_gap, + mat.get_single_item_num_nnz()) * + sizeof(value_type); + const auto sconf = + gko::kernels::batch_bicgstab::compute_shared_storage<PrecType, + value_type>( + shmem_per_blk, shared_gap, mat.get_single_item_num_nnz(), + b.num_rhs); + const size_t shared_size = + sconf.n_shared * shared_gap * sizeof(value_type) + + (sconf.prec_shared ? prec_size : 0); + auto workspace = gko::array<value_type>( + exec_, + sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); + assert(sconf.gmem_stride_bytes % sizeof(value_type) == 0); + + value_type* const workspace_data = workspace.get_data(); + + // Template parameters launch_apply_kernel<StopType, n_shared, + // prec_shared) + if (sconf.prec_shared) + launch_apply_kernel<StopType, 9, 1>( + sconf, logger, prec, mat, b.values, x.values, workspace_data, + block_size, shared_size); + else { + switch (sconf.n_shared) { + case 0: + launch_apply_kernel<StopType, 0, 0>( + sconf, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + break; + case 1: + launch_apply_kernel<StopType, 1, 0>( + sconf, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + break; + case 2: + launch_apply_kernel<StopType, 2, 0>( + sconf, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + break; + case 3: + launch_apply_kernel<StopType, 3, 0>( + sconf, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + break; + case 4: + launch_apply_kernel<StopType, 4, 0>( + sconf, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + break; + case 5: + launch_apply_kernel<StopType, 5, 0>( + sconf, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + break; + case 6: + launch_apply_kernel<StopType, 6, 0>( + sconf, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + break; + case 7: + launch_apply_kernel<StopType, 7, 0>( + sconf, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + break; + case 8: + launch_apply_kernel<StopType, 8, 0>( + sconf, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + break; + case 9: + launch_apply_kernel<StopType, 9, 0>( + sconf, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + break; + } + } + + GKO_HIP_LAST_IF_ERROR_THROW; + } + +private: + std::shared_ptr<const DefaultExecutor> exec_; + const settings<remove_complex<value_type>> settings_; +}; + + template <typename ValueType> void apply(std::shared_ptr<const DefaultExecutor> exec, const settings<remove_complex<ValueType>>& settings, - const batch::BatchLinOp* const a, + const batch::BatchLinOp* const mat, const batch::BatchLinOp* const precon, const batch::MultiVector<ValueType>* const b, batch::MultiVector<ValueType>* const x, batch::log::detail::log_data<remove_complex<ValueType>>& logdata) - GKO_NOT_IMPLEMENTED; +{ + using hip_value_type = hip_type<ValueType>; + auto dispatcher = batch::solver::create_dispatcher<ValueType>( + KernelCaller<hip_value_type>(exec, settings), settings, mat, precon); + dispatcher.apply(b, x, logdata); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL); diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index c7195501178..5ff6791934e 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -981,6 +981,11 @@ class Executor : public log::EnableLogging<Executor> { */ int max_workgroup_size; + /** + * Maximum available local shared memory per workgroup. + */ + int max_shared_memory_per_workgroup; + /** * The major version for CUDA/HIP device. */ @@ -1307,9 +1312,9 @@ class EnableDeviceReset { * * @param device_reset whether to allow a device reset or not */ - [[deprecated( - "device_reset is no longer supported, call " - "cudaDeviceReset/hipDeviceReset manually")]] void + [ + [deprecated("device_reset is no longer supported, call " + "cudaDeviceReset/hipDeviceReset manually")]] void set_device_reset(bool device_reset) {} @@ -1318,9 +1323,9 @@ class EnableDeviceReset { * * @return the current status of the device reset boolean for this executor. */ - [[deprecated( - "device_reset is no longer supported, call " - "cudaDeviceReset/hipDeviceReset manually")]] bool + [ + [deprecated("device_reset is no longer supported, call " + "cudaDeviceReset/hipDeviceReset manually")]] bool get_device_reset() { return false; @@ -1334,10 +1339,10 @@ class EnableDeviceReset { */ EnableDeviceReset() {} - [[deprecated( - "device_reset is no longer supported, call " - "cudaDeviceReset/hipDeviceReset manually")]] EnableDeviceReset(bool - device_reset) + [ + [deprecated("device_reset is no longer supported, call " + "cudaDeviceReset/hipDeviceReset " + "manually")]] EnableDeviceReset(bool device_reset) {} }; @@ -1607,6 +1612,14 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>, return this->get_exec_info().max_subgroup_size; } + /** + * Get maximum shared memory per block. + */ + int get_max_shared_memory_per_block() const noexcept + { + return this->get_exec_info().max_shared_memory_per_workgroup; + } + /** * Get the major version of compute capability. */ @@ -1827,6 +1840,14 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>, return this->get_exec_info().max_subgroup_size; } + /** + * Get maximum shared memory per block. + */ + int get_max_shared_memory_per_block() const noexcept + { + return this->get_exec_info().max_shared_memory_per_workgroup; + } + /** * Get the hipblas handle for this executor * diff --git a/test/solver/CMakeLists.txt b/test/solver/CMakeLists.txt index 28a217a79fc..de3430393ae 100644 --- a/test/solver/CMakeLists.txt +++ b/test/solver/CMakeLists.txt @@ -1,4 +1,4 @@ -ginkgo_create_common_test(batch_bicgstab_kernels DISABLE_EXECUTORS dpcpp hip) +ginkgo_create_common_test(batch_bicgstab_kernels DISABLE_EXECUTORS dpcpp) ginkgo_create_common_test(bicg_kernels) ginkgo_create_common_test(bicgstab_kernels) ginkgo_create_common_test(cb_gmres_kernels)