From fb0669f855c97faebd8ab2fc0ba38419d935815a Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Fri, 27 Oct 2023 00:06:11 +0200
Subject: [PATCH] Add hip bicgstab solver kernels

Co-authored-by: Aditya Kashi <kashia@ornl.gov>
Co-authored-by: Isha Aggarwal <isha.aggarwal2@kit.edu>
---
 cuda/base/executor.cpp                    |   3 +
 cuda/solver/batch_bicgstab_kernels.cu     |   6 +-
 hip/base/exception.hip.hpp                |  56 +++++++
 hip/base/executor.hip.cpp                 |   3 +
 hip/solver/batch_bicgstab_kernels.hip.cpp | 192 +++++++++++++++++++++-
 include/ginkgo/core/base/executor.hpp     |  41 +++--
 test/solver/CMakeLists.txt                |   2 +-
 7 files changed, 287 insertions(+), 16 deletions(-)
 create mode 100644 hip/base/exception.hip.hpp
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index f296fb9da86..01880127641 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -258,6 +258,9 @@ void CudaExecutor::set_gpu_property()
             kernels::cuda::config::warp_size;
         this->get_exec_info().max_subgroup_size =
             kernels::cuda::config::warp_size;
+        GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceGetAttribute(
+            &this->get_exec_info().max_shared_memory_per_workgroup,
+            cudaDevAttrMaxSharedMemoryPerBlock, this->get_device_id()));
     }
 }
 
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index db92543fd74..07e16535631 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -85,7 +85,7 @@ namespace batch_bicgstab {
 
 template <typename StopType, typename PrecType, typename LogType,
           typename BatchMatrixType, typename ValueType>
-int get_num_threads_per_block(std::shared_ptr<const CudaExecutor> exec,
+int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
                               const int num_rows)
 {
     int nwarps = num_rows / 4;
@@ -117,7 +117,7 @@ int get_num_threads_per_block(std::shared_ptr<const CudaExecutor> exec,
 
 template <typename StopType, typename PrecType, typename LogType,
           typename BatchMatrixType, typename ValueType>
-int get_max_dynamic_shared_memory(std::shared_ptr<const CudaExecutor> exec,
+int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec,
                                   const size_type required_cache_storage)
 {
     int shmem_per_sm = 0;
@@ -178,7 +178,7 @@ public:
     {
         using real_type = gko::remove_complex<value_type>;
         const size_type num_batch_items = mat.num_batch_items;
-        constexpr int align_multiple = 2;
+        constexpr int align_multiple = 8;
         const int shared_gap =
             ((mat.num_rows + align_multiple - 1) / align_multiple) *
             align_multiple;
diff --git a/hip/base/exception.hip.hpp b/hip/base/exception.hip.hpp
new file mode 100644
index 00000000000..7c3b3b2e12e
--- /dev/null
+++ b/hip/base/exception.hip.hpp
@@ -0,0 +1,56 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_BASE_EXCEPTION_HIP_HPP_
+#define GKO_HIP_BASE_EXCEPTION_HIP_HPP_
+
+
+#include <ginkgo/core/base/exception.hpp>
+
+
+namespace gko {
+
+
+#define GKO_HIP_LAST_IF_ERROR_THROW                                          \
+    hipError_t err = hipGetLastError();                                      \
+    if (err != hipSuccess) {                                                 \
+        printf(" Hip kernel error: %s\n", hipGetErrorString(err));           \
+        throw gko::HipError(__FILE__, __LINE__, __func__, err);              \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+
+}  // namespace gko
+
+#endif  // GKO_HIP_BASE_EXCEPTION_HIP_HPP_
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 8d175c0e424..489e9b28ff9 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -262,6 +262,9 @@ void HipExecutor::set_gpu_property()
 #endif  // GINKGO_HIP_PLATFORM_NVCC
         this->get_exec_info().max_subgroup_size =
             kernels::hip::config::warp_size;
+        GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute(
+            &this->get_exec_info().max_shared_memory_per_workgroup,
+            hipDeviceAttributeMaxSharedMemoryPerBlock, this->get_device_id()));
     }
 }
 
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index 4ef8cd36c1b..b9fe8b0c9c3 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -34,21 +34,38 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <hip/hip_runtime.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "hip/base/batch_struct.hip.hpp"
 #include "hip/base/config.hip.hpp"
+#include "hip/base/exception.hip.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/base/thrust.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace hip {
+
+
+constexpr int default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
 /**
  * @brief The batch Bicgstab solver namespace.
  *
@@ -57,19 +74,190 @@ namespace hip {
 namespace batch_bicgstab {
 
 
+#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
+
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc"
+
+
+template <typename BatchMatrixType>
+int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
+                              const int num_rows)
+{
+    int nwarps = num_rows / 4;
+    if (nwarps < 2) {
+        nwarps = 2;
+    }
+    const int min_block_size = 2 * config::warp_size;
+    const int device_max_threads =
+        ((std::max(num_rows, min_block_size)) / config::warp_size) *
+        config::warp_size;
+    const int num_regs_used_per_thread = 64;
+    int max_regs_blk = 0;
+    hipDeviceGetAttribute(&max_regs_blk, hipDeviceAttributeMaxRegistersPerBlock,
+                          exec->get_device_id());
+    const int max_threads_regs = (max_regs_blk / num_regs_used_per_thread);
+    const int max_threads = std::min(max_threads_regs, device_max_threads);
+    return std::min(nwarps * static_cast<int>(config::warp_size), max_threads);
+}
+
+
 template <typename T>
 using settings = gko::kernels::batch_bicgstab::settings<T>;
 
 
+template <typename HipValueType>
+class KernelCaller {
+public:
+    using value_type = HipValueType;
+
+    KernelCaller(std::shared_ptr<const DefaultExecutor> exec,
+                 const settings<remove_complex<value_type>> settings)
+        : exec_{exec}, settings_{settings}
+    {}
+
+    template <typename StopType, const int n_shared,
+              const bool prec_shared_bool, typename PrecType, typename LogType,
+              typename BatchMatrixType>
+    void launch_apply_kernel(
+        const gko::kernels::batch_bicgstab::storage_config& sconf,
+        LogType& logger, PrecType& prec, const BatchMatrixType& mat,
+        const value_type* const __restrict__ b_values,
+        value_type* const __restrict__ x_values,
+        value_type* const __restrict__ workspace_data, const int& block_size,
+        const size_t& shared_size) const
+    {
+        apply_kernel<StopType, n_shared, prec_shared_bool>
+            <<<mat.num_batch_items, block_size, shared_size,
+               exec_->get_stream()>>>(sconf, settings_.max_iterations,
+                                      settings_.residual_tol, logger, prec, mat,
+                                      b_values, x_values, workspace_data);
+    }
+
+
+    template <typename BatchMatrixType, typename PrecType, typename StopType,
+              typename LogType>
+    void call_kernel(
+        LogType logger, const BatchMatrixType& mat, PrecType prec,
+        const gko::batch::multi_vector::uniform_batch<const value_type>& b,
+        const gko::batch::multi_vector::uniform_batch<value_type>& x) const
+    {
+        using real_type = gko::remove_complex<value_type>;
+        const size_type num_batch_items = mat.num_batch_items;
+        constexpr int align_multiple = 8;
+        const int shared_gap =
+            ((mat.num_rows + align_multiple - 1) / align_multiple) *
+            align_multiple;
+        const int shmem_per_blk = exec_->get_max_shared_memory_per_block();
+        const int block_size =
+            get_num_threads_per_block<BatchMatrixType>(exec_, mat.num_rows);
+        assert(block_size >= 2 * config::warp_size);
+
+        const size_t prec_size =
+            PrecType::dynamic_work_size(shared_gap,
+                                        mat.get_single_item_num_nnz()) *
+            sizeof(value_type);
+        const auto sconf =
+            gko::kernels::batch_bicgstab::compute_shared_storage<PrecType,
+                                                                 value_type>(
+                shmem_per_blk, shared_gap, mat.get_single_item_num_nnz(),
+                b.num_rhs);
+        const size_t shared_size =
+            sconf.n_shared * shared_gap * sizeof(value_type) +
+            (sconf.prec_shared ? prec_size : 0);
+        auto workspace = gko::array<value_type>(
+            exec_,
+            sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
+        assert(sconf.gmem_stride_bytes % sizeof(value_type) == 0);
+
+        value_type* const workspace_data = workspace.get_data();
+
+        // Template parameters launch_apply_kernel<StopType, n_shared,
+        // prec_shared)
+        if (sconf.prec_shared)
+            launch_apply_kernel<StopType, 9, 1>(
+                sconf, logger, prec, mat, b.values, x.values, workspace_data,
+                block_size, shared_size);
+        else {
+            switch (sconf.n_shared) {
+            case 0:
+                launch_apply_kernel<StopType, 0, 0>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel<StopType, 1, 0>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel<StopType, 2, 0>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel<StopType, 3, 0>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel<StopType, 4, 0>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 5:
+                launch_apply_kernel<StopType, 5, 0>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 6:
+                launch_apply_kernel<StopType, 6, 0>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 7:
+                launch_apply_kernel<StopType, 7, 0>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 8:
+                launch_apply_kernel<StopType, 8, 0>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 9:
+                launch_apply_kernel<StopType, 9, 0>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            }
+        }
+
+        GKO_HIP_LAST_IF_ERROR_THROW;
+    }
+
+private:
+    std::shared_ptr<const DefaultExecutor> exec_;
+    const settings<remove_complex<value_type>> settings_;
+};
+
+
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
-           const batch::BatchLinOp* const a,
+           const batch::BatchLinOp* const mat,
            const batch::BatchLinOp* const precon,
            const batch::MultiVector<ValueType>* const b,
            batch::MultiVector<ValueType>* const x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
-    GKO_NOT_IMPLEMENTED;
+{
+    using hip_value_type = hip_type<ValueType>;
+    auto dispatcher = batch::solver::create_dispatcher<ValueType>(
+        KernelCaller<hip_value_type>(exec, settings), settings, mat, precon);
+    dispatcher.apply(b, x, logdata);
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL);
 
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index c7195501178..5ff6791934e 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -981,6 +981,11 @@ class Executor : public log::EnableLogging<Executor> {
          */
         int max_workgroup_size;
 
+        /**
+         * Maximum available local shared memory per workgroup.
+         */
+        int max_shared_memory_per_workgroup;
+
         /**
          * The major version for CUDA/HIP device.
          */
@@ -1307,9 +1312,9 @@ class EnableDeviceReset {
      *
      * @param device_reset  whether to allow a device reset or not
      */
-    [[deprecated(
-        "device_reset is no longer supported, call "
-        "cudaDeviceReset/hipDeviceReset manually")]] void
+    [
+        [deprecated("device_reset is no longer supported, call "
+                    "cudaDeviceReset/hipDeviceReset manually")]] void
     set_device_reset(bool device_reset)
     {}
 
@@ -1318,9 +1323,9 @@ class EnableDeviceReset {
      *
      * @return the current status of the device reset boolean for this executor.
      */
-    [[deprecated(
-        "device_reset is no longer supported, call "
-        "cudaDeviceReset/hipDeviceReset manually")]] bool
+    [
+        [deprecated("device_reset is no longer supported, call "
+                    "cudaDeviceReset/hipDeviceReset manually")]] bool
     get_device_reset()
     {
         return false;
@@ -1334,10 +1339,10 @@ class EnableDeviceReset {
      */
     EnableDeviceReset() {}
 
-    [[deprecated(
-        "device_reset is no longer supported, call "
-        "cudaDeviceReset/hipDeviceReset manually")]] EnableDeviceReset(bool
-                                                                           device_reset)
+    [
+        [deprecated("device_reset is no longer supported, call "
+                    "cudaDeviceReset/hipDeviceReset "
+                    "manually")]] EnableDeviceReset(bool device_reset)
     {}
 };
 
@@ -1607,6 +1612,14 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
         return this->get_exec_info().max_subgroup_size;
     }
 
+    /**
+     * Get maximum shared memory per block.
+     */
+    int get_max_shared_memory_per_block() const noexcept
+    {
+        return this->get_exec_info().max_shared_memory_per_workgroup;
+    }
+
     /**
      * Get the major version of compute capability.
      */
@@ -1827,6 +1840,14 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
         return this->get_exec_info().max_subgroup_size;
     }
 
+    /**
+     * Get maximum shared memory per block.
+     */
+    int get_max_shared_memory_per_block() const noexcept
+    {
+        return this->get_exec_info().max_shared_memory_per_workgroup;
+    }
+
     /**
      * Get the hipblas handle for this executor
      *
diff --git a/test/solver/CMakeLists.txt b/test/solver/CMakeLists.txt
index 28a217a79fc..de3430393ae 100644
--- a/test/solver/CMakeLists.txt
+++ b/test/solver/CMakeLists.txt
@@ -1,4 +1,4 @@
-ginkgo_create_common_test(batch_bicgstab_kernels DISABLE_EXECUTORS dpcpp hip)
+ginkgo_create_common_test(batch_bicgstab_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(bicg_kernels)
 ginkgo_create_common_test(bicgstab_kernels)
 ginkgo_create_common_test(cb_gmres_kernels)