From e59dd3503ed9ee29f6ba620a9ce5ae4ed4cad125 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 12 Dec 2024 09:08:55 +0100 Subject: [PATCH 1/5] WIP: half distributed --- .../cuda_hip/distributed/matrix_kernels.cpp | 14 +- core/distributed/helpers.hpp | 14 +- core/distributed/matrix.cpp | 40 + core/distributed/vector.cpp | 20 + core/test/mpi/base/bindings.cpp | 971 +++++++++--------- include/ginkgo/core/base/mpi.hpp | 42 + include/ginkgo/core/distributed/matrix.hpp | 25 +- include/ginkgo/core/distributed/vector.hpp | 42 +- test/mpi/matrix.cpp | 12 +- test/mpi/solver/solver.cpp | 10 +- test/mpi/vector.cpp | 9 +- 11 files changed, 689 insertions(+), 510 deletions(-) diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp index bdf189d9785..cf3ac70822b 100644 --- a/common/cuda_hip/distributed/matrix_kernels.cpp +++ b/common/cuda_hip/distributed/matrix_kernels.cpp @@ -137,11 +137,11 @@ void separate_local_nonlocal( col_range_starting_indices[range_id]; }; - using input_type = input_type; + using input_type = input_type, GlobalIndexType>; auto input_it = thrust::make_zip_iterator(thrust::make_tuple( input.get_const_row_idxs(), input.get_const_col_idxs(), - input.get_const_values(), row_range_ids.get_const_data(), - col_range_ids.get_const_data())); + as_device_type(input.get_const_values()), + row_range_ids.get_const_data(), col_range_ids.get_const_data())); // copy and transform local entries into arrays local_row_idxs.resize_and_reset(num_local_elements); @@ -157,9 +157,9 @@ void separate_local_nonlocal( thrust::copy_if( policy, local_it, local_it + input.get_num_stored_elements(), range_ids_it, - thrust::make_zip_iterator(thrust::make_tuple(local_row_idxs.get_data(), - local_col_idxs.get_data(), - local_values.get_data())), + thrust::make_zip_iterator(thrust::make_tuple( + local_row_idxs.get_data(), local_col_idxs.get_data(), + as_device_type(local_values.get_data()))), [local_part, row_part_ids, col_part_ids] __host__ __device__( const thrust::tuple& tuple) { auto row_part = row_part_ids[thrust::get<0>(tuple)]; @@ -185,7 +185,7 @@ void separate_local_nonlocal( range_ids_it, thrust::make_zip_iterator(thrust::make_tuple( non_local_row_idxs.get_data(), non_local_col_idxs.get_data(), - non_local_values.get_data())), + as_device_type(non_local_values.get_data()))), [local_part, row_part_ids, col_part_ids] __host__ __device__( const thrust::tuple& tuple) { auto row_part = row_part_ids[thrust::get<0>(tuple)]; diff --git a/core/distributed/helpers.hpp b/core/distributed/helpers.hpp index 9ce7d3b6ab4..5536dbe32f0 100644 --- a/core/distributed/helpers.hpp +++ b/core/distributed/helpers.hpp @@ -122,15 +122,11 @@ void vector_dispatch(T* linop, F&& f, Args&&... 
args) { #if GINKGO_BUILD_MPI if (is_distributed(linop)) { - if constexpr (std::is_same_v, half>) { - GKO_NOT_SUPPORTED(linop); - } else { - using type = std::conditional_t< - std::is_const::value, - const experimental::distributed::Vector, - experimental::distributed::Vector>; - f(dynamic_cast(linop), std::forward(args)...); - } + using type = std::conditional_t< + std::is_const::value, + const experimental::distributed::Vector, + experimental::distributed::Vector>; + f(dynamic_cast(linop), std::forward(args)...); } else #endif { diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index fd0ff9565b9..b25b4d2bba8 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -237,6 +237,46 @@ void Matrix::move_to( } +#if GINKGO_ENABLE_HALF +template +void Matrix::convert_to( + Matrix>, local_index_type, + global_index_type>* result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->copy_from(this->local_mtx_.get()); + result->non_local_mtx_->copy_from(this->non_local_mtx_.get()); + result->gather_idxs_ = this->gather_idxs_; + result->send_offsets_ = this->send_offsets_; + result->recv_offsets_ = this->recv_offsets_; + result->recv_sizes_ = this->recv_sizes_; + result->send_sizes_ = this->send_sizes_; + result->non_local_to_global_ = this->non_local_to_global_; + result->set_size(this->get_size()); +} + + +template +void Matrix::move_to( + Matrix>, local_index_type, + global_index_type>* result) +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->move_from(this->local_mtx_.get()); + result->non_local_mtx_->move_from(this->non_local_mtx_.get()); + result->gather_idxs_ = std::move(this->gather_idxs_); + result->send_offsets_ = std::move(this->send_offsets_); + result->recv_offsets_ = std::move(this->recv_offsets_); + result->recv_sizes_ = std::move(this->recv_sizes_); + result->send_sizes_ = std::move(this->send_sizes_); + result->non_local_to_global_ = std::move(this->non_local_to_global_); + result->set_size(this->get_size()); + this->set_size({}); +} +#endif + template void Matrix::read_distributed( const device_matrix_data& data, diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 4a79eb64a9b..3c9c77f1b4a 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -295,6 +295,26 @@ void Vector::move_to(Vector>* result) } +#if GINKGO_ENABLE_HALF +template +void Vector::convert_to( + Vector>>* result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->set_size(this->get_size()); + this->get_local_vector()->convert_to(&result->local_); +} + + +template +void Vector::move_to( + Vector>>* result) +{ + this->convert_to(result); +} +#endif + template std::unique_ptr::absolute_type> Vector::compute_absolute() const diff --git a/core/test/mpi/base/bindings.cpp b/core/test/mpi/base/bindings.cpp index 9feebb76ca9..cbd13820050 100644 --- a/core/test/mpi/base/bindings.cpp +++ b/core/test/mpi/base/bindings.cpp @@ -24,8 +24,9 @@ class MpiBindings : public ::testing::Test { std::shared_ptr ref; }; -using TestTypes = gko::test::merge_type_list_t; +using TestTypes = + gko::test::merge_type_list_t; TYPED_TEST_SUITE(MpiBindings, TestTypes, TypenameNameGenerator); @@ -274,469 +275,492 @@ TYPED_TEST(MpiBindings, CanPutValuesWithFence) ASSERT_EQ(data, ref); } - -TYPED_TEST(MpiBindings, CanAccumulateValues) -{ - using window = gko::experimental::mpi::window; - 
auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - } else if (my_rank == 1) { - data = std::vector{5, 6, 7, 8}; - } else if (my_rank == 2) { - data = std::vector{9, 10, 11, 12}; - } else { - data = std::vector{0, 0, 0, 0}; - } - - { - auto win = window(this->ref, data.data(), 4, comm); - if (my_rank == 0) { - win.lock_all(); - for (auto rank = 0; rank < num_ranks; ++rank) { - if (rank != my_rank) { - win.accumulate(this->ref, data.data(), 4, rank, 0, 4, - MPI_SUM); - } - } - win.unlock_all(); - } - } - - std::vector ref; - if (my_rank == 0) { - ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); - } else if (my_rank == 1) { - ref = std::vector{6, 8, 10, 12}; - ASSERT_EQ(data, ref); - } else if (my_rank == 2) { - ref = std::vector{10, 12, 14, 16}; - ASSERT_EQ(data, ref); - } else { - ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); - } -} - - -TYPED_TEST(MpiBindings, CanNonBlockingAccumulateValues) -{ - using window = gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - } else if (my_rank == 1) { - data = std::vector{5, 6, 7, 8}; - } else if (my_rank == 2) { - data = std::vector{9, 10, 11, 12}; - } else { - data = std::vector{0, 0, 0, 0}; - } - - gko::experimental::mpi::request req; - { - auto win = window(this->ref, data.data(), 4, comm); - if (my_rank == 0) { - win.lock_all(); - for (auto rank = 0; rank < num_ranks; ++rank) { - if (rank != my_rank) { - req = win.r_accumulate(this->ref, data.data(), 4, rank, 0, - 4, MPI_SUM); - } - } - win.unlock_all(); - } - } - - req.wait(); - std::vector ref; - if (my_rank == 0) { - ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); - } else if (my_rank == 1) { - ref = std::vector{6, 8, 10, 12}; - ASSERT_EQ(data, ref); - } else if (my_rank == 2) { - ref = std::vector{10, 12, 14, 16}; - ASSERT_EQ(data, ref); - } else { - ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); - } -} - - -TYPED_TEST(MpiBindings, CanGetValuesWithLockAll) -{ - using window = gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - } else { - data = std::vector{0, 0, 0, 0}; - } - auto win = window(this->ref, data.data(), 4, comm); - - if (my_rank != 0) { - win.lock_all(); - win.get(this->ref, data.data(), 4, 0, 0, 4); - win.unlock_all(); - } - - auto ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); -} - - -TYPED_TEST(MpiBindings, CanNonBlockingGetValuesWithLockAll) -{ - using window = gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - } else { - data = std::vector{0, 0, 0, 0}; - } - gko::experimental::mpi::request req; - auto win = window(this->ref, data.data(), 4, comm); - - if (my_rank != 0) { - win.lock_all(); - req = win.r_get(this->ref, data.data(), 4, 0, 0, 4); - win.unlock_all(); - } - - req.wait(); - auto ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); -} - - -TYPED_TEST(MpiBindings, CanGetValuesWithExclusiveLock) -{ - using window = 
gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - } else { - data = std::vector{0, 0, 0, 0}; - } - auto win = window(this->ref, data.data(), 4, comm); - - if (my_rank != 0) { - win.lock(0, window::lock_type::exclusive); - win.get(this->ref, data.data(), 4, 0, 0, 4); - win.unlock(0); - } - - auto ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); -} - - -TYPED_TEST(MpiBindings, CanGetValuesWithSharedLock) -{ - using window = gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - } else { - data = std::vector{0, 0, 0, 0}; - } - auto win = window(this->ref, data.data(), 4, comm); - - if (my_rank != 0) { - win.lock(0); - win.get(this->ref, data.data(), 4, 0, 0, 4); - win.unlock(0); - } - - auto ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); -} - - -TYPED_TEST(MpiBindings, CanGetValuesWithFence) -{ - using window = gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - } else { - data = std::vector{0, 0, 0, 0}; - } - auto win = window(this->ref, data.data(), 4, comm); - - win.fence(); - if (my_rank != 0) { - win.get(this->ref, data.data(), 4, 0, 0, 4); - } - win.fence(); - - auto ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); -} - - -TYPED_TEST(MpiBindings, CanGetAccumulateValuesWithLockAll) -{ - using window = gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - std::vector target; - std::vector result(4, 0); - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - target = std::vector{1, 2, 3, 4}; - } else if (my_rank == 1) { - data = std::vector{5, 6, 7, 8}; - target = std::vector{5, 6, 7, 8}; - } else if (my_rank == 2) { - data = std::vector{9, 10, 11, 12}; - target = std::vector{9, 10, 11, 12}; - } else { - data = std::vector{0, 0, 0, 0}; - target = std::vector{0, 0, 0, 0}; - } - - { - auto win = window(this->ref, target.data(), 4, comm); - - if (my_rank == 2) { - win.lock_all(); - win.get_accumulate(this->ref, data.data(), 4, result.data(), 4, 0, - 0, 4, MPI_SUM); - win.unlock_all(); - } - } - - std::vector ref; - std::vector ref2; - if (my_rank == 0) { - ref = std::vector{10, 12, 14, 16}; - EXPECT_EQ(target, ref); - } else if (my_rank == 2) { - ref = std::vector{1, 2, 3, 4}; - EXPECT_EQ(result, ref); - } -} - - -TYPED_TEST(MpiBindings, CanNonBlockingGetAccumulateValuesWithLockAll) -{ - using window = gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - std::vector target; - std::vector result(4, 0); - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - target = std::vector{1, 2, 3, 4}; - } else if (my_rank == 1) { - data = std::vector{5, 6, 7, 8}; - target = std::vector{5, 6, 7, 8}; - } else if (my_rank == 2) { - data = std::vector{9, 10, 11, 12}; - target = std::vector{9, 10, 11, 12}; - } else { - data = std::vector{0, 0, 0, 0}; - target = std::vector{0, 
0, 0, 0}; - } - gko::experimental::mpi::request req; - - { - auto win = window(this->ref, target.data(), 4, comm); - - if (my_rank == 2) { - win.lock_all(); - req = win.r_get_accumulate(this->ref, data.data(), 4, result.data(), - 4, 0, 0, 4, MPI_SUM); - win.unlock_all(); - } - } - - req.wait(); - std::vector ref; - std::vector ref2; - if (my_rank == 0) { - ref = std::vector{10, 12, 14, 16}; - ref2 = std::vector{1, 2, 3, 4}; - EXPECT_EQ(target, ref); - EXPECT_EQ(data, ref2); - } else if (my_rank == 2) { - ref = std::vector{1, 2, 3, 4}; - ref2 = std::vector{9, 10, 11, 12}; - EXPECT_EQ(result, ref); - EXPECT_EQ(target, ref2); - EXPECT_EQ(data, ref2); - } -} - - -TYPED_TEST(MpiBindings, CanFetchAndOperate) -{ - using window = gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - std::vector target; - std::vector result(4, 0); - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - target = std::vector{1, 2, 3, 4}; - } else if (my_rank == 1) { - data = std::vector{5, 6, 7, 8}; - target = std::vector{5, 6, 7, 8}; - } else if (my_rank == 2) { - data = std::vector{9, 10, 11, 12}; - target = std::vector{9, 10, 11, 12}; - } else { - data = std::vector{0, 0, 0, 0}; - target = std::vector{0, 0, 0, 0}; - } - - { - auto win = window(this->ref, target.data(), 4, comm); - - if (my_rank == 2) { - win.lock_all(); - win.fetch_and_op(this->ref, data.data(), result.data(), 0, 1, - MPI_SUM); - win.unlock_all(); - } - } - - std::vector ref; - std::vector ref2; - if (my_rank == 0) { - ref = std::vector{1, 11, 3, 4}; - EXPECT_EQ(target, ref); - } else if (my_rank == 2) { - ref = std::vector{2, 0, 0, 0}; - EXPECT_EQ(result, ref); - } -} - - -TYPED_TEST(MpiBindings, CanBroadcastValues) +void half_sum(void* input, void* output, int* len, MPI_Datatype* datatype) { - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - auto array = gko::array{this->ref, 8}; - if (my_rank == 0) { - array = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); + gko::half* input_ptr = static_cast(input); + gko::half* output_ptr = static_cast(output); + for (int i = 0; i < *len; i++) { + output_ptr[i] += input_ptr[i]; } - - comm.broadcast(this->ref, array.get_data(), 8, 0); - - auto ref = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); - GKO_ASSERT_ARRAY_EQ(ref, array); } - -TYPED_TEST(MpiBindings, CanNonBlockingBroadcastValues) -{ - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - auto array = gko::array{this->ref, 8}; - if (my_rank == 0) { - array = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); - } - - auto req = comm.i_broadcast(this->ref, array.get_data(), 8, 0); - - req.wait(); - auto ref = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); - GKO_ASSERT_ARRAY_EQ(ref, array); -} - - -TYPED_TEST(MpiBindings, CanReduceValues) -{ - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - TypeParam data, sum, max, min; - if (my_rank == 0) { - data = 3; - } else if (my_rank == 1) { - data = 5; - } else if (my_rank == 2) { - data = 2; - } else if (my_rank == 3) { - data = 6; - } - - comm.reduce(this->ref, &data, &sum, 1, MPI_SUM, 0); - comm.reduce(this->ref, &data, &max, 1, MPI_MAX, 0); - comm.reduce(this->ref, &data, &min, 1, MPI_MIN, 0); - - if (my_rank == 0) { - EXPECT_EQ(sum, 
TypeParam{16}); - EXPECT_EQ(max, TypeParam{6}); - EXPECT_EQ(min, TypeParam{2}); - } -} - - -TYPED_TEST(MpiBindings, CanNonBlockingReduceValues) -{ - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - TypeParam data, sum, max, min; - if (my_rank == 0) { - data = 3; - } else if (my_rank == 1) { - data = 5; - } else if (my_rank == 2) { - data = 2; - } else if (my_rank == 3) { - data = 6; - } - - auto req1 = comm.i_reduce(this->ref, &data, &sum, 1, MPI_SUM, 0); - auto req2 = comm.i_reduce(this->ref, &data, &max, 1, MPI_MAX, 0); - auto req3 = comm.i_reduce(this->ref, &data, &min, 1, MPI_MIN, 0); - - req1.wait(); - req2.wait(); - req3.wait(); - if (my_rank == 0) { - EXPECT_EQ(sum, TypeParam{16}); - EXPECT_EQ(max, TypeParam{6}); - EXPECT_EQ(min, TypeParam{2}); - } -} +// TYPED_TEST(MpiBindings, CanAccumulateValues) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// } else if (my_rank == 1) { +// data = std::vector{5, 6, 7, 8}; +// } else if (my_rank == 2) { +// data = std::vector{9, 10, 11, 12}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// } +// MPI_Op operation; +// MPI_Op_create(&half_sum, 1, &operation); +// { +// auto win = window(this->ref, data.data(), 4, comm); +// if (my_rank == 0) { +// win.lock_all(); +// for (auto rank = 0; rank < num_ranks; ++rank) { +// if (rank != my_rank) { +// if (std::is_same_v) { +// win.accumulate(this->ref, data.data(), 4, rank, 0, 4, +// operation); +// } else { +// win.accumulate(this->ref, data.data(), 4, rank, 0, 4, +// MPI_SUM); +// } +// } +// } +// win.unlock_all(); +// } +// } +// MPI_Op_free(&operation); + +// std::vector ref; +// if (my_rank == 0) { +// ref = std::vector{1, 2, 3, 4}; +// ASSERT_EQ(data, ref); +// } else if (my_rank == 1) { +// ref = std::vector{6, 8, 10, 12}; +// ASSERT_EQ(data, ref); +// } else if (my_rank == 2) { +// ref = std::vector{10, 12, 14, 16}; +// ASSERT_EQ(data, ref); +// } else { +// ref = std::vector{1, 2, 3, 4}; +// ASSERT_EQ(data, ref); +// } +// } + + +// TYPED_TEST(MpiBindings, CanNonBlockingAccumulateValues) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// } else if (my_rank == 1) { +// data = std::vector{5, 6, 7, 8}; +// } else if (my_rank == 2) { +// data = std::vector{9, 10, 11, 12}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// } + +// gko::experimental::mpi::request req; +// { +// auto win = window(this->ref, data.data(), 4, comm); +// if (my_rank == 0) { +// win.lock_all(); +// for (auto rank = 0; rank < num_ranks; ++rank) { +// if (rank != my_rank) { +// req = win.r_accumulate(this->ref, data.data(), 4, rank, +// 0, +// 4, +// gko::experimental::mpi::sum()); +// } +// } +// win.unlock_all(); +// } +// } + +// req.wait(); +// std::vector ref; +// if (my_rank == 0) { +// ref = std::vector{1, 2, 3, 4}; +// ASSERT_EQ(data, ref); +// } else if (my_rank == 1) { +// ref = std::vector{6, 8, 10, 12}; +// ASSERT_EQ(data, ref); +// } else if (my_rank == 2) { +// ref = std::vector{10, 12, 14, 16}; +// ASSERT_EQ(data, ref); +// } else { +// ref = std::vector{1, 2, 3, 4}; +// 
ASSERT_EQ(data, ref); +// } +// } + + +// TYPED_TEST(MpiBindings, CanGetValuesWithLockAll) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// } +// auto win = window(this->ref, data.data(), 4, comm); + +// if (my_rank != 0) { +// win.lock_all(); +// win.get(this->ref, data.data(), 4, 0, 0, 4); +// win.unlock_all(); +// } + +// auto ref = std::vector{1, 2, 3, 4}; +// ASSERT_EQ(data, ref); +// } + + +// TYPED_TEST(MpiBindings, CanNonBlockingGetValuesWithLockAll) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// } +// gko::experimental::mpi::request req; +// auto win = window(this->ref, data.data(), 4, comm); + +// if (my_rank != 0) { +// win.lock_all(); +// req = win.r_get(this->ref, data.data(), 4, 0, 0, 4); +// win.unlock_all(); +// } + +// req.wait(); +// auto ref = std::vector{1, 2, 3, 4}; +// ASSERT_EQ(data, ref); +// } + + +// TYPED_TEST(MpiBindings, CanGetValuesWithExclusiveLock) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// } +// auto win = window(this->ref, data.data(), 4, comm); + +// if (my_rank != 0) { +// win.lock(0, window::lock_type::exclusive); +// win.get(this->ref, data.data(), 4, 0, 0, 4); +// win.unlock(0); +// } + +// auto ref = std::vector{1, 2, 3, 4}; +// ASSERT_EQ(data, ref); +// } + + +// TYPED_TEST(MpiBindings, CanGetValuesWithSharedLock) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// } +// auto win = window(this->ref, data.data(), 4, comm); + +// if (my_rank != 0) { +// win.lock(0); +// win.get(this->ref, data.data(), 4, 0, 0, 4); +// win.unlock(0); +// } + +// auto ref = std::vector{1, 2, 3, 4}; +// ASSERT_EQ(data, ref); +// } + + +// TYPED_TEST(MpiBindings, CanGetValuesWithFence) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// } +// auto win = window(this->ref, data.data(), 4, comm); + +// win.fence(); +// if (my_rank != 0) { +// win.get(this->ref, data.data(), 4, 0, 0, 4); +// } +// win.fence(); + +// auto ref = std::vector{1, 2, 3, 4}; +// ASSERT_EQ(data, ref); +// } + + +// TYPED_TEST(MpiBindings, CanGetAccumulateValuesWithLockAll) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); 
+// auto num_ranks = comm.size(); +// std::vector data; +// std::vector target; +// std::vector result(4, 0); +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// target = std::vector{1, 2, 3, 4}; +// } else if (my_rank == 1) { +// data = std::vector{5, 6, 7, 8}; +// target = std::vector{5, 6, 7, 8}; +// } else if (my_rank == 2) { +// data = std::vector{9, 10, 11, 12}; +// target = std::vector{9, 10, 11, 12}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// target = std::vector{0, 0, 0, 0}; +// } + +// { +// auto win = window(this->ref, target.data(), 4, comm); + +// if (my_rank == 2) { +// win.lock_all(); +// win.get_accumulate(this->ref, data.data(), 4, result.data(), 4, +// 0, +// 0, 4, +// gko::experimental::mpi::sum()); +// win.unlock_all(); +// } +// } + +// std::vector ref; +// std::vector ref2; +// if (my_rank == 0) { +// ref = std::vector{10, 12, 14, 16}; +// EXPECT_EQ(target, ref); +// } else if (my_rank == 2) { +// ref = std::vector{1, 2, 3, 4}; +// EXPECT_EQ(result, ref); +// } +// } + + +// TYPED_TEST(MpiBindings, CanNonBlockingGetAccumulateValuesWithLockAll) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// std::vector target; +// std::vector result(4, 0); +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// target = std::vector{1, 2, 3, 4}; +// } else if (my_rank == 1) { +// data = std::vector{5, 6, 7, 8}; +// target = std::vector{5, 6, 7, 8}; +// } else if (my_rank == 2) { +// data = std::vector{9, 10, 11, 12}; +// target = std::vector{9, 10, 11, 12}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// target = std::vector{0, 0, 0, 0}; +// } +// gko::experimental::mpi::request req; + +// { +// auto win = window(this->ref, target.data(), 4, comm); + +// if (my_rank == 2) { +// win.lock_all(); +// req = win.r_get_accumulate(this->ref, data.data(), 4, +// result.data(), +// 4, 0, 0, 4, +// gko::experimental::mpi::sum()); +// win.unlock_all(); +// } +// } + +// req.wait(); +// std::vector ref; +// std::vector ref2; +// if (my_rank == 0) { +// ref = std::vector{10, 12, 14, 16}; +// ref2 = std::vector{1, 2, 3, 4}; +// EXPECT_EQ(target, ref); +// EXPECT_EQ(data, ref2); +// } else if (my_rank == 2) { +// ref = std::vector{1, 2, 3, 4}; +// ref2 = std::vector{9, 10, 11, 12}; +// EXPECT_EQ(result, ref); +// EXPECT_EQ(target, ref2); +// EXPECT_EQ(data, ref2); +// } +// } + + +// TYPED_TEST(MpiBindings, CanFetchAndOperate) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// std::vector target; +// std::vector result(4, 0); +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// target = std::vector{1, 2, 3, 4}; +// } else if (my_rank == 1) { +// data = std::vector{5, 6, 7, 8}; +// target = std::vector{5, 6, 7, 8}; +// } else if (my_rank == 2) { +// data = std::vector{9, 10, 11, 12}; +// target = std::vector{9, 10, 11, 12}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// target = std::vector{0, 0, 0, 0}; +// } + +// { +// auto win = window(this->ref, target.data(), 4, comm); + +// if (my_rank == 2) { +// win.lock_all(); +// win.fetch_and_op(this->ref, data.data(), result.data(), 0, 1, +// gko::experimental::mpi::sum()); +// win.unlock_all(); +// } +// } + +// std::vector ref; +// std::vector ref2; +// if (my_rank == 0) 
{ +// ref = std::vector{1, 11, 3, 4}; +// EXPECT_EQ(target, ref); +// } else if (my_rank == 2) { +// ref = std::vector{2, 0, 0, 0}; +// EXPECT_EQ(result, ref); +// } +// } + + +// TYPED_TEST(MpiBindings, CanBroadcastValues) +// { +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// auto array = gko::array{this->ref, 8}; +// if (my_rank == 0) { +// array = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); +// } + +// comm.broadcast(this->ref, array.get_data(), 8, 0); + +// auto ref = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); +// GKO_ASSERT_ARRAY_EQ(ref, array); +// } + + +// TYPED_TEST(MpiBindings, CanNonBlockingBroadcastValues) +// { +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// auto array = gko::array{this->ref, 8}; +// if (my_rank == 0) { +// array = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); +// } + +// auto req = comm.i_broadcast(this->ref, array.get_data(), 8, 0); + +// req.wait(); +// auto ref = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); +// GKO_ASSERT_ARRAY_EQ(ref, array); +// } + + +// TYPED_TEST(MpiBindings, CanReduceValues) +// { +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// TypeParam data, sum, max, min; +// if (my_rank == 0) { +// data = 3; +// } else if (my_rank == 1) { +// data = 5; +// } else if (my_rank == 2) { +// data = 2; +// } else if (my_rank == 3) { +// data = 6; +// } + +// comm.reduce(this->ref, &data, &sum, 1, +// gko::experimental::mpi::sum(), 0); comm.reduce(this->ref, +// &data, &max, 1, MPI_MAX, 0); comm.reduce(this->ref, &data, &min, 1, +// MPI_MIN, 0); + +// if (my_rank == 0) { +// EXPECT_EQ(sum, TypeParam{16}); +// EXPECT_EQ(max, TypeParam{6}); +// EXPECT_EQ(min, TypeParam{2}); +// } +// } + + +// TYPED_TEST(MpiBindings, CanNonBlockingReduceValues) +// { +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// TypeParam data, sum, max, min; +// if (my_rank == 0) { +// data = 3; +// } else if (my_rank == 1) { +// data = 5; +// } else if (my_rank == 2) { +// data = 2; +// } else if (my_rank == 3) { +// data = 6; +// } + +// auto req1 = comm.i_reduce(this->ref, &data, &sum, 1, +// gko::experimental::mpi::sum(), 0); auto req2 = +// comm.i_reduce(this->ref, &data, &max, 1, MPI_MAX, 0); auto req3 = +// comm.i_reduce(this->ref, &data, &min, 1, MPI_MIN, 0); + +// req1.wait(); +// req2.wait(); +// req3.wait(); +// if (my_rank == 0) { +// EXPECT_EQ(sum, TypeParam{16}); +// EXPECT_EQ(max, TypeParam{6}); +// EXPECT_EQ(min, TypeParam{2}); +// } +// } TYPED_TEST(MpiBindings, CanAllReduceValues) @@ -754,9 +778,16 @@ TYPED_TEST(MpiBindings, CanAllReduceValues) } else if (my_rank == 3) { data = 6; } - - comm.all_reduce(this->ref, &data, &sum, 1, MPI_SUM); - + MPI_Op operation; + MPI_Op_create(&half_sum, 1, &operation); + if (std::is_same_v) { + comm.all_reduce(this->ref, &data, &sum, 1, operation); + } else { + comm.all_reduce(this->ref, &data, &sum, 1, MPI_SUM); + } + // comm.all_reduce(this->ref, &data, &sum, 1, + // gko::experimental::mpi::sum()); + MPI_Op_free(&operation); ASSERT_EQ(sum, TypeParam{16}); } @@ -777,7 +808,8 @@ TYPED_TEST(MpiBindings, CanAllReduceValuesInPlace) data = 6; } - comm.all_reduce(this->ref, &data, 1, MPI_SUM); + comm.all_reduce(this->ref, &data, 1, + 
gko::experimental::mpi::sum<TypeParam>());
 
     ASSERT_EQ(data, TypeParam{16});
 }
 
@@ -799,7 +831,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValues)
         data = 6;
     }
 
-    auto req = comm.i_all_reduce(this->ref, &data, &sum, 1, MPI_SUM);
+    auto req = comm.i_all_reduce(this->ref, &data, &sum, 1,
+                                 gko::experimental::mpi::sum<TypeParam>());
 
     req.wait();
     ASSERT_EQ(sum, TypeParam{16});
@@ -1438,7 +1471,8 @@ TYPED_TEST(MpiBindings, CanScanValues)
         data = 6;
     }
 
-    comm.scan(this->ref, &data, &sum, 1, MPI_SUM);
+    comm.scan(this->ref, &data, &sum, 1,
+              gko::experimental::mpi::sum<TypeParam>());
     comm.scan(this->ref, &data, &max, 1, MPI_MAX);
     comm.scan(this->ref, &data, &min, 1, MPI_MIN);
 
@@ -1478,7 +1512,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingScanValues)
         data = 6;
     }
 
-    auto req1 = comm.i_scan(this->ref, &data, &sum, 1, MPI_SUM);
+    auto req1 = comm.i_scan(this->ref, &data, &sum, 1,
+                            gko::experimental::mpi::sum<TypeParam>());
     auto req2 = comm.i_scan(this->ref, &data, &max, 1, MPI_MAX);
     auto req3 = comm.i_scan(this->ref, &data, &min, 1, MPI_MIN);
 
diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp
index 64c04e1805a..94c5f0263c1 100644
--- a/include/ginkgo/core/base/mpi.hpp
+++ b/include/ginkgo/core/base/mpi.hpp
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -88,10 +89,51 @@ GKO_REGISTER_MPI_TYPE(unsigned long long, MPI_UNSIGNED_LONG_LONG);
 GKO_REGISTER_MPI_TYPE(float, MPI_FLOAT);
 GKO_REGISTER_MPI_TYPE(double, MPI_DOUBLE);
 GKO_REGISTER_MPI_TYPE(long double, MPI_LONG_DOUBLE);
+#if GINKGO_ENABLE_HALF
+// OpenMPI 5.0 has support for MPIX_C_FLOAT16, and MPICH v3.4a1 provides
+// MPIX_C_FLOAT16 as well.
+// TODO: mapping half to MPI_UNSIGNED_SHORT only works for transferring the
+// data; reductions need the custom operations defined below.
+GKO_REGISTER_MPI_TYPE(half, MPI_UNSIGNED_SHORT);
+GKO_REGISTER_MPI_TYPE(std::complex<half>, MPI_FLOAT);
+#endif  // GINKGO_ENABLE_HALF
 GKO_REGISTER_MPI_TYPE(std::complex<float>, MPI_C_FLOAT_COMPLEX);
 GKO_REGISTER_MPI_TYPE(std::complex<double>, MPI_C_DOUBLE_COMPLEX);
 
 
+namespace detail {
+
+
+inline void half_sum(void* input, void* output, int* len,
+                     MPI_Datatype* datatype)
+{
+    gko::half* input_ptr = static_cast<gko::half*>(input);
+    gko::half* output_ptr = static_cast<gko::half*>(output);
+    for (int i = 0; i < *len; i++) {
+        output_ptr[i] += input_ptr[i];
+    }
+}
+
+
+}  // namespace detail
+
+
+template <typename ValueType>
+inline MPI_Op sum()
+{
+    return MPI_SUM;
+}
+
+template <>
+inline MPI_Op sum<half>()
+{
+    using handle_manager =
+        std::unique_ptr<MPI_Op, std::function<void(MPI_Op*)>>;
+    static handle_manager mpi_op(
+        []() {
+            MPI_Op* operation = new MPI_Op;
+            MPI_Op_create(&detail::half_sum, 1, operation);
+            return operation;
+        }(),
+        [](MPI_Op* op) { MPI_Op_free(op); });
+    return *mpi_op.get();
+}
+
+
 /**
  * A move-only wrapper for a contiguous MPI_Datatype.
* diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index 4aca13df237..86c94fc74fb 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -259,14 +259,19 @@ template class Matrix : public EnableLinOp>, - public ConvertibleTo, + public ConvertibleTo< + Matrix, LocalIndexType, GlobalIndexType>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo>, LocalIndexType, GlobalIndexType>>, +#endif public DistributedBase { friend class EnablePolymorphicObject; - friend class Matrix, LocalIndexType, + friend class Matrix, LocalIndexType, GlobalIndexType>; friend class multigrid::Pgm; + public: using value_type = ValueType; using index_type = GlobalIndexType; @@ -288,7 +293,23 @@ class Matrix void move_to(Matrix, local_index_type, global_index_type>* result) override; +#if GINKGO_ENABLE_HALF + friend class Matrix>, + LocalIndexType, GlobalIndexType>; + using ConvertibleTo< + Matrix>, local_index_type, + global_index_type>>::convert_to; + using ConvertibleTo>, + local_index_type, global_index_type>>::move_to; + + void convert_to( + Matrix>, local_index_type, + global_index_type>* result) const override; + + void move_to(Matrix>, + local_index_type, global_index_type>* result) override; +#endif /** * Reads a square matrix from the device_matrix_data structure and a global * partition. diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index c5d039f7e30..80d59beca7d 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -66,13 +66,16 @@ class Partition; template class Vector : public EnableLinOp>, - public ConvertibleTo>>, + public ConvertibleTo>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo>>>, +#endif public EnableAbsoluteComputation>>, public DistributedBase { friend class EnablePolymorphicObject; friend class Vector>; friend class Vector>; - friend class Vector>; + friend class Vector>; friend class detail::VectorCache; public: @@ -173,6 +176,20 @@ class Vector void move_to(Vector>* result) override; +#if GINKGO_ENABLE_HALF + friend class Vector>>; + using ConvertibleTo< + Vector>>>::convert_to; + using ConvertibleTo< + Vector>>>::move_to; + + void convert_to(Vector>>* result) + const override; + + void move_to( + Vector>>* result) override; +#endif + std::unique_ptr compute_absolute() const override; void compute_absolute_inplace() override; @@ -680,6 +697,27 @@ struct conversion_target_helper> { return target_type::create(source->get_executor(), source->get_communicator()); } + + // Allow to create_empty of the same type + // For distributed case, next> will be V in the candicated list. 
+ // TODO: decide to whether to add this or add condition to the list + static std::unique_ptr create_empty(const target_type* source) + { + return target_type::create(source->get_executor(), + source->get_communicator()); + } + +#if GINKGO_ENABLE_HALF + using snd_source_type = experimental::distributed::Vector< + previous_precision>>; + + static std::unique_ptr create_empty( + const snd_source_type* source) + { + return target_type::create(source->get_executor(), + source->get_communicator()); + } +#endif }; diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index e761aab159e..8e4eeb3921c 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -367,12 +367,10 @@ class Matrix : public CommonMpiTestFixture { alpha = gko::test::generate_random_matrix( 1, 1, std::uniform_int_distribution(1, 1), - std::normal_distribution>(), - this->engine, this->exec); + std::normal_distribution<>(), this->engine, this->exec); beta = gko::test::generate_random_matrix( 1, 1, std::uniform_int_distribution(1, 1), - std::normal_distribution>(), - this->engine, this->exec); + std::normal_distribution<>(), this->engine, this->exec); } void SetUp() override { ASSERT_EQ(comm.size(), 3); } @@ -412,14 +410,12 @@ class Matrix : public CommonMpiTestFixture { num_rows, num_cols, std::uniform_int_distribution(static_cast(num_cols), static_cast(num_cols)), - std::normal_distribution>(), - engine); + std::normal_distribution<>(), engine); auto mat_md = gko::test::generate_random_matrix_data( num_rows, num_rows, std::uniform_int_distribution(0, static_cast(num_rows)), - std::normal_distribution>(), - engine); + std::normal_distribution<>(), engine); auto row_mapping = gko::test::generate_random_array< gko::experimental::distributed::comm_index_type>( diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp index 392bf9990b8..23ee506d51a 100644 --- a/test/mpi/solver/solver.cpp +++ b/test/mpi/solver/solver.cpp @@ -264,10 +264,7 @@ class Solver : public CommonMpiTestFixture { template gko::matrix_data gen_dense_data(gko::dim<2> size) { - return { - size, - std::normal_distribution>(0.0, 1.0), - rand_engine}; + return {size, std::normal_distribution<>(0.0, 1.0), rand_engine}; } template @@ -294,10 +291,7 @@ class Solver : public CommonMpiTestFixture { { return gko::share(gko::initialize( {gko::test::detail::get_rand_value( - std::normal_distribution< - gko::remove_complex>(0.0, - 1.0), - rand_engine)}, + std::normal_distribution<>(0.0, 1.0), rand_engine)}, exec)); } diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp index 752342a8e64..a65bbc7fd36 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -762,8 +762,7 @@ class VectorLocalOps : public CommonMpiTestFixture { local_size[0], local_size[1], std::uniform_int_distribution(local_size[1], local_size[1]), - std::normal_distribution>(), engine, - exec); + std::normal_distribution<>(), engine, exec); dist = DistVectorType::create(exec, comm, size, gko::clone(local)); } @@ -775,8 +774,7 @@ class VectorLocalOps : public CommonMpiTestFixture { alpha = gko::test::generate_random_matrix( 1, size[1], std::uniform_int_distribution(size[1], size[1]), - std::normal_distribution>(), engine, - exec); + std::normal_distribution<>(), engine, exec); } void init_complex_vectors() @@ -971,8 +969,7 @@ TYPED_TEST(VectorLocalOps, FillSameAsLocal) { using value_type = typename TestFixture::value_type; auto value = gko::test::detail::get_rand_value( - std::normal_distribution>(), - this->engine); + std::normal_distribution<>(), this->engine); this->init_vectors(); 
this->x->fill(value); From b8ea909640770f600cc36087f64b6044e22dc4c9 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Mon, 30 Dec 2024 14:56:25 +0100 Subject: [PATCH 2/5] WIP: change the instantiate type and implement the draft --- .../cuda_hip/distributed/assembly_kernels.cpp | 2 +- .../cuda_hip/distributed/matrix_kernels.cpp | 2 +- .../cuda_hip/distributed/vector_kernels.cpp | 2 +- .../unified/distributed/assembly_kernels.cpp | 2 +- core/device_hooks/common_kernels.inc.cpp | 2 +- core/distributed/assembly.cpp | 2 +- core/distributed/matrix.cpp | 10 +- core/distributed/preconditioner/schwarz.cpp | 3 +- core/distributed/vector.cpp | 6 +- core/test/mpi/base/bindings.cpp | 43 ++-- dpcpp/distributed/assembly_kernels.dp.cpp | 2 +- dpcpp/distributed/matrix_kernels.dp.cpp | 2 +- dpcpp/distributed/vector_kernels.dp.cpp | 2 +- include/ginkgo/core/base/mpi.hpp | 209 ++++++++++++++++-- include/ginkgo/core/distributed/matrix.hpp | 10 +- include/ginkgo/core/distributed/vector.hpp | 11 +- omp/distributed/assembly_kernels.cpp | 2 +- omp/distributed/matrix_kernels.cpp | 2 +- omp/distributed/vector_kernels.cpp | 2 +- reference/distributed/assembly_kernels.cpp | 4 +- reference/distributed/matrix_kernels.cpp | 2 +- reference/distributed/vector_kernels.cpp | 2 +- 22 files changed, 249 insertions(+), 75 deletions(-) diff --git a/common/cuda_hip/distributed/assembly_kernels.cpp b/common/cuda_hip/distributed/assembly_kernels.cpp index fb1a8dbc75d..81478538477 100644 --- a/common/cuda_hip/distributed/assembly_kernels.cpp +++ b/common/cuda_hip/distributed/assembly_kernels.cpp @@ -90,7 +90,7 @@ void count_non_owning_entries( num_parts, local_part, row_part_ptrs.get_data(), send_count.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp index cf3ac70822b..551eafe6c8c 100644 --- a/common/cuda_hip/distributed/matrix_kernels.cpp +++ b/common/cuda_hip/distributed/matrix_kernels.cpp @@ -194,7 +194,7 @@ void separate_local_nonlocal( }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); diff --git a/common/cuda_hip/distributed/vector_kernels.cpp b/common/cuda_hip/distributed/vector_kernels.cpp index 668a721d249..1bacc93489a 100644 --- a/common/cuda_hip/distributed/vector_kernels.cpp +++ b/common/cuda_hip/distributed/vector_kernels.cpp @@ -83,7 +83,7 @@ void build_local( range_id.get_data(), local_mtx->get_values(), is_local_row); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); diff --git a/common/unified/distributed/assembly_kernels.cpp b/common/unified/distributed/assembly_kernels.cpp index a33fca28796..a3ac5207f17 100644 --- a/common/unified/distributed/assembly_kernels.cpp +++ b/common/unified/distributed/assembly_kernels.cpp @@ -48,7 +48,7 @@ void fill_send_buffers( send_values.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_FILL_SEND_BUFFERS); diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 0240dabc7e4..40c7102fe9a 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ 
b/core/device_hooks/common_kernels.inc.cpp @@ -183,7 +183,7 @@ typename GlobalIndexType> \ _macro(ValueType, LocalIndexType, GlobalIndexType) \ GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ - GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(_macro) + GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) #define GKO_STUB_TEMPLATE_TYPE_BASE(_macro) \ template \ diff --git a/core/distributed/assembly.cpp b/core/distributed/assembly.cpp index 116cf83ee94..424e641f845 100644 --- a/core/distributed/assembly.cpp +++ b/core/distributed/assembly.cpp @@ -135,7 +135,7 @@ device_matrix_data assemble_rows_from_neighbors( mpi::communicator comm, \ const device_matrix_data<_value_type, _global_type>& input, \ ptr_param> partition) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_ASSEMBLE_ROWS_FROM_NEIGHBORS); diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index b25b4d2bba8..7320bc27914 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -200,8 +200,8 @@ Matrix::create( template void Matrix::convert_to( - Matrix, local_index_type, - global_index_type>* result) const + Matrix, local_index_type, global_index_type>* + result) const { GKO_ASSERT(this->get_communicator().size() == result->get_communicator().size()); @@ -219,8 +219,8 @@ void Matrix::convert_to( template void Matrix::move_to( - Matrix, local_index_type, - global_index_type>* result) + Matrix, local_index_type, global_index_type>* + result) { GKO_ASSERT(this->get_communicator().size() == result->get_communicator().size()); @@ -701,7 +701,7 @@ Matrix::operator=(Matrix&& other) #define GKO_DECLARE_DISTRIBUTED_MATRIX(ValueType, LocalIndexType, \ GlobalIndexType) \ class Matrix -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_DISTRIBUTED_MATRIX); diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp index 901d2ee1527..965414349d6 100644 --- a/core/distributed/preconditioner/schwarz.cpp +++ b/core/distributed/preconditioner/schwarz.cpp @@ -144,8 +144,7 @@ void Schwarz::generate( #define GKO_DECLARE_SCHWARZ(ValueType, LocalIndexType, GlobalIndexType) \ class Schwarz -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( - GKO_DECLARE_SCHWARZ); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_SCHWARZ); } // namespace preconditioner diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 3c9c77f1b4a..86eb450b888 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -279,7 +279,7 @@ void Vector::fill(const ValueType value) template void Vector::convert_to( - Vector>* result) const + Vector>* result) const { GKO_ASSERT(this->get_communicator().size() == result->get_communicator().size()); @@ -289,7 +289,7 @@ void Vector::convert_to( template -void Vector::move_to(Vector>* result) +void Vector::move_to(Vector>* result) { this->convert_to(result); } @@ -740,7 +740,7 @@ std::unique_ptr> Vector::create_with_type_of_impl( #define GKO_DECLARE_DISTRIBUTED_VECTOR(ValueType) class Vector -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(GKO_DECLARE_DISTRIBUTED_VECTOR); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DISTRIBUTED_VECTOR); } // namespace distributed diff --git a/core/test/mpi/base/bindings.cpp b/core/test/mpi/base/bindings.cpp index cbd13820050..f914075ce60 100644 --- 
a/core/test/mpi/base/bindings.cpp +++ b/core/test/mpi/base/bindings.cpp @@ -24,9 +24,8 @@ class MpiBindings : public ::testing::Test { std::shared_ptr ref; }; -using TestTypes = - gko::test::merge_type_list_t; +using TestTypes = gko::test::merge_type_list_t; TYPED_TEST_SUITE(MpiBindings, TestTypes, TypenameNameGenerator); @@ -762,7 +761,17 @@ void half_sum(void* input, void* output, int* len, MPI_Datatype* datatype) // } // } - +struct sum_op { + template + void operator()(void* input, void* output, int* len, MPI_Datatype* datatype) + { + ValueType* input_ptr = static_cast(input); + ValueType* output_ptr = static_cast(output); + for (int i = 0; i < *len; i++) { + output_ptr[i] += input_ptr[i]; + } + } +}; TYPED_TEST(MpiBindings, CanAllReduceValues) { auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); @@ -780,11 +789,15 @@ TYPED_TEST(MpiBindings, CanAllReduceValues) } MPI_Op operation; MPI_Op_create(&half_sum, 1, &operation); - if (std::is_same_v) { - comm.all_reduce(this->ref, &data, &sum, 1, operation); - } else { - comm.all_reduce(this->ref, &data, &sum, 1, MPI_SUM); - } + // if (std::is_same_v) { + // comm.all_reduce(this->ref, &data, &sum, 1, operation); + // } else { + // gko::experimental::mpi::op_type op(1, MPI_SUM, + // gko::experimental::mpi::detail::sum); + // gko::experimental::mpi::op_type op(1, MPI_SUM, sum_op()); + auto op = gko::experimental::mpi::sum(); + comm.all_reduce(this->ref, &data, &sum, 1, op.get()); + // } // comm.all_reduce(this->ref, &data, &sum, 1, // gko::experimental::mpi::sum()); MPI_Op_free(&operation); @@ -808,8 +821,7 @@ TYPED_TEST(MpiBindings, CanAllReduceValuesInPlace) data = 6; } - comm.all_reduce(this->ref, &data, 1, - gko::experimental::mpi::sum()); + comm.all_reduce(this->ref, &data, 1, MPI_SUM); ASSERT_EQ(data, TypeParam{16}); } @@ -831,8 +843,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValues) data = 6; } - auto req = comm.i_all_reduce(this->ref, &data, &sum, 1, - gko::experimental::mpi::sum()); + auto req = comm.i_all_reduce(this->ref, &data, &sum, 1, MPI_SUM); req.wait(); ASSERT_EQ(sum, TypeParam{16}); @@ -1471,8 +1482,7 @@ TYPED_TEST(MpiBindings, CanScanValues) data = 6; } - comm.scan(this->ref, &data, &sum, 1, - gko::experimental::mpi::sum()); + comm.scan(this->ref, &data, &sum, 1, MPI_SUM); comm.scan(this->ref, &data, &max, 1, MPI_MAX); comm.scan(this->ref, &data, &min, 1, MPI_MIN); @@ -1512,8 +1522,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingScanValues) data = 6; } - auto req1 = comm.i_scan(this->ref, &data, &sum, 1, - gko::experimental::mpi::sum()); + auto req1 = comm.i_scan(this->ref, &data, &sum, 1, MPI_SUM); auto req2 = comm.i_scan(this->ref, &data, &max, 1, MPI_MAX); auto req3 = comm.i_scan(this->ref, &data, &min, 1, MPI_MIN); diff --git a/dpcpp/distributed/assembly_kernels.dp.cpp b/dpcpp/distributed/assembly_kernels.dp.cpp index 3f89c45ff1f..e0cc872b783 100644 --- a/dpcpp/distributed/assembly_kernels.dp.cpp +++ b/dpcpp/distributed/assembly_kernels.dp.cpp @@ -23,7 +23,7 @@ void count_non_owning_entries( array& send_positions, array& original_positions) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp index ec9bc367e5a..47adaaeca59 100644 --- a/dpcpp/distributed/matrix_kernels.dp.cpp +++ b/dpcpp/distributed/matrix_kernels.dp.cpp @@ -27,7 +27,7 @@ void separate_local_nonlocal( array& 
non_local_col_idxs, array& non_local_values) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); diff --git a/dpcpp/distributed/vector_kernels.dp.cpp b/dpcpp/distributed/vector_kernels.dp.cpp index 4f451e2f76b..fdc5dd2e52d 100644 --- a/dpcpp/distributed/vector_kernels.dp.cpp +++ b/dpcpp/distributed/vector_kernels.dp.cpp @@ -22,7 +22,7 @@ void build_local( comm_index_type local_part, matrix::Dense* local_mtx) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 94c5f0263c1..82c6319f6f9 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -111,29 +111,140 @@ inline void half_sum(void* input, void* output, int* len, } } +template +inline void sum(void* input, void* output, int* len, MPI_Datatype* datatype) +{ + ValueType* input_ptr = static_cast(input); + ValueType* output_ptr = static_cast(output); + for (int i = 0; i < *len; i++) { + output_ptr[i] += input_ptr[i]; + } +} + } // namespace detail -template -inline MPI_Op sum() + +using op_manager = std::unique_ptr>; + +template >* = nullptr> +inline op_manager sum() { - return MPI_SUM; + return op_manager([]() { return MPI_SUM; }(), [](MPI_Op op) {}); } -template <> -inline MPI_Op sum() +template >* = nullptr> +inline op_manager sum() { - using handle_manager = - std::unique_ptr>; - static handle_manager mpi_op( + // MPI_Op is MPI_ABI_Op* + return op_manager( []() { - MPI_Op* operation = new MPI_Op; - MPI_Op_create(&detail::half_sum, 1, operation); + MPI_Op operation; + MPI_Op_create(&detail::sum, 1, &operation); + // MPI_Op_create(&detail::half_sum, 1, operation); + std::cout << "custom operator" << std::endl; return operation; }(), - [](MPI_Op* op) { MPI_Op_free(op); }); - return *mpi_op.get(); + [](MPI_Op op) { MPI_Op_free(&op); }); } +/** + * A move-only wrapper for a contiguous MPI_Datatype. + * + * The underlying MPI_Datatype is automatically created and committed when an + * object of this type is constructed, and freed when it is destructed. + */ +template +class op_type { +public: + template + struct mpi_native_type + : std::conditional_t, std::true_type, + std::false_type> {}; + + /** + * Constructs a wrapper for a contiguous MPI_Datatype. + * + * @param count the number of old_type elements the new datatype contains. + * @param old_type the MPI_Datatype that is contained. + */ + template + op_type(int commutativity, MPI_Op default_op, AltFunc&& alt_func) + : custom_(false), handle_(MPI_OP_NULL) + { + custom_ = mpi_native_type::value; + if constexpr (mpi_native_type::value) { + auto op = alt_func.template operator(); + GKO_ASSERT_NO_MPI_ERRORS( + MPI_Op_create(&op, commutativity, &handle_)); + } else { + handle_ = default_op; + } + } + + /** + * Constructs empty wrapper with MPI_OP_NULL. + */ + op_type() : handle_(MPI_OP_NULL) {} + + /** + * Disallow copying of wrapper type. + */ + op_type(const op_type&) = delete; + + /** + * Disallow copying of wrapper type. + */ + op_type& operator=(const op_type&) = delete; + + /** + * Move constructor, leaves other with MPI_OP_NULL. + * + * @param other to be moved from object. 
+ */ + op_type(op_type&& other) noexcept : handle_(MPI_OP_NULL) + { + *this = std::move(other); + } + + /** + * Move assignment, leaves other with MPI_OP_NULL. + * + * @param other to be moved from object. + * + * @return this object. + */ + op_type& operator=(op_type&& other) noexcept + { + if (this != &other) { + this->handle_ = std::exchange(other.handle_, MPI_OP_NULL); + } + return *this; + } + + /** + * Destructs object by freeing wrapped MPI_Datatype. + */ + ~op_type() + { + if (custom_ && handle_ != MPI_OP_NULL) { + MPI_Op_free(&handle_); + } + } + + /** + * Access the underlying MPI_Op. + * + * @return the underlying MPI_Op. + */ + MPI_Op get() const { return handle_; } + +private: + bool custom_; + MPI_Op handle_; +}; + /** * A move-only wrapper for a contiguous MPI_Datatype. * @@ -782,9 +893,22 @@ class communicator { ReduceType* recv_buffer, int count, MPI_Op operation) const { auto guard = exec->get_scoped_device_id_guard(); - GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( - MPI_IN_PLACE, recv_buffer, count, type_impl::get_type(), - operation, this->get())); + if constexpr (std::is_same_v) { + if (operation == MPI_SUM) { + MPI_Op op; + MPI_Op_create(&detail::half_sum, 1, &op); + GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( + MPI_IN_PLACE, recv_buffer, count, + type_impl::get_type(), op, this->get())); + MPI_Op_free(&op); + } else { + GKO_NOT_IMPLEMENTED; + } + } else { + GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( + MPI_IN_PLACE, recv_buffer, count, + type_impl::get_type(), operation, this->get())); + } } /** @@ -809,9 +933,24 @@ class communicator { { auto guard = exec->get_scoped_device_id_guard(); request req; - GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallreduce( - MPI_IN_PLACE, recv_buffer, count, type_impl::get_type(), - operation, this->get(), req.get())); + if constexpr (std::is_same_v) { + if (operation == MPI_SUM) { + MPI_Op op; + MPI_Op_create(&detail::half_sum, 1, &op); + GKO_ASSERT_NO_MPI_ERRORS( + MPI_Iallreduce(MPI_IN_PLACE, recv_buffer, count, + type_impl::get_type(), op, + this->get(), req.get())); + MPI_Op_free(&op); + } else { + GKO_NOT_IMPLEMENTED; + } + } else { + GKO_ASSERT_NO_MPI_ERRORS( + MPI_Iallreduce(MPI_IN_PLACE, recv_buffer, count, + type_impl::get_type(), operation, + this->get(), req.get())); + } return req; } @@ -835,9 +974,22 @@ class communicator { int count, MPI_Op operation) const { auto guard = exec->get_scoped_device_id_guard(); + // if constexpr (std::is_same_v) { + // if (operation == MPI_SUM) { + // MPI_Op op; + // MPI_Op_create(&detail::half_sum, 1, &op); + // GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( + // send_buffer, recv_buffer, count, + // type_impl::get_type(), op, this->get())); + // MPI_Op_free(&op); + // } else { + // GKO_NOT_IMPLEMENTED; + // } + // } else { GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( send_buffer, recv_buffer, count, type_impl::get_type(), operation, this->get())); + // } } /** @@ -863,9 +1015,24 @@ class communicator { { auto guard = exec->get_scoped_device_id_guard(); request req; - GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallreduce( - send_buffer, recv_buffer, count, type_impl::get_type(), - operation, this->get(), req.get())); + if constexpr (std::is_same_v) { + if (operation == MPI_SUM) { + MPI_Op op; + MPI_Op_create(&detail::half_sum, 1, &op); + GKO_ASSERT_NO_MPI_ERRORS( + MPI_Iallreduce(send_buffer, recv_buffer, count, + type_impl::get_type(), op, + this->get(), req.get())); + MPI_Op_free(&op); + } else { + GKO_NOT_IMPLEMENTED; + } + } else { + GKO_ASSERT_NO_MPI_ERRORS( + MPI_Iallreduce(send_buffer, recv_buffer, count, + type_impl::get_type(), 
operation, + this->get(), req.get())); + } return req; } diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index 86c94fc74fb..ceda97c0427 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -267,7 +267,7 @@ class Matrix #endif public DistributedBase { friend class EnablePolymorphicObject; - friend class Matrix, LocalIndexType, + friend class Matrix, LocalIndexType, GlobalIndexType>; friend class multigrid::Pgm; @@ -283,15 +283,15 @@ class Matrix using EnableLinOp::convert_to; using EnableLinOp::move_to; - using ConvertibleTo, LocalIndexType, + using ConvertibleTo, LocalIndexType, GlobalIndexType>>::convert_to; - using ConvertibleTo, LocalIndexType, + using ConvertibleTo, LocalIndexType, GlobalIndexType>>::move_to; - void convert_to(Matrix, local_index_type, + void convert_to(Matrix, local_index_type, global_index_type>* result) const override; - void move_to(Matrix, local_index_type, + void move_to(Matrix, local_index_type, global_index_type>* result) override; #if GINKGO_ENABLE_HALF friend class Matrix>, diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index 80d59beca7d..20ccfb6435e 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -81,8 +81,8 @@ class Vector public: using EnableLinOp::convert_to; using EnableLinOp::move_to; - using ConvertibleTo>>::convert_to; - using ConvertibleTo>>::move_to; + using ConvertibleTo>>::convert_to; + using ConvertibleTo>>::move_to; using value_type = ValueType; using absolute_type = remove_complex; @@ -171,10 +171,9 @@ class Vector void read_distributed(const matrix_data& data, ptr_param> partition); - void convert_to( - Vector>* result) const override; + void convert_to(Vector>* result) const override; - void move_to(Vector>* result) override; + void move_to(Vector>* result) override; #if GINKGO_ENABLE_HALF friend class Vector>>; @@ -690,7 +689,7 @@ template struct conversion_target_helper> { using target_type = experimental::distributed::Vector; using source_type = - experimental::distributed::Vector>; + experimental::distributed::Vector>; static std::unique_ptr create_empty(const source_type* source) { diff --git a/omp/distributed/assembly_kernels.cpp b/omp/distributed/assembly_kernels.cpp index 44c9c908c52..9fa9976e607 100644 --- a/omp/distributed/assembly_kernels.cpp +++ b/omp/distributed/assembly_kernels.cpp @@ -73,7 +73,7 @@ void count_non_owning_entries( num_input_elements); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp index d60b31ac6a8..2f36ec4a778 100644 --- a/omp/distributed/matrix_kernels.cpp +++ b/omp/distributed/matrix_kernels.cpp @@ -149,7 +149,7 @@ void separate_local_nonlocal( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); diff --git a/omp/distributed/vector_kernels.cpp b/omp/distributed/vector_kernels.cpp index 007509f50fd..1ae60ed108e 100644 --- a/omp/distributed/vector_kernels.cpp +++ b/omp/distributed/vector_kernels.cpp @@ -42,7 +42,7 @@ void build_local( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( 
GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); diff --git a/reference/distributed/assembly_kernels.cpp b/reference/distributed/assembly_kernels.cpp index 36c44ca4022..e38680243a0 100644 --- a/reference/distributed/assembly_kernels.cpp +++ b/reference/distributed/assembly_kernels.cpp @@ -67,7 +67,7 @@ void count_non_owning_entries( num_input_elements); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); @@ -97,7 +97,7 @@ void fill_send_buffers( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_FILL_SEND_BUFFERS); diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp index ab0e07070ff..95176b34656 100644 --- a/reference/distributed/matrix_kernels.cpp +++ b/reference/distributed/matrix_kernels.cpp @@ -86,7 +86,7 @@ void separate_local_nonlocal( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); diff --git a/reference/distributed/vector_kernels.cpp b/reference/distributed/vector_kernels.cpp index 1425f1dc9ab..76a8be06a0f 100644 --- a/reference/distributed/vector_kernels.cpp +++ b/reference/distributed/vector_kernels.cpp @@ -40,7 +40,7 @@ void build_local( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); From 1df60284b4583e71f51c69c41d9bd55d7fb9895f Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Mon, 30 Dec 2024 15:47:47 +0100 Subject: [PATCH 3/5] implement and test --- core/test/mpi/base/bindings.cpp | 1046 +++++++++++++++--------------- include/ginkgo/core/base/mpi.hpp | 206 ++---- 2 files changed, 570 insertions(+), 682 deletions(-) diff --git a/core/test/mpi/base/bindings.cpp b/core/test/mpi/base/bindings.cpp index f914075ce60..c7e9e6fc294 100644 --- a/core/test/mpi/base/bindings.cpp +++ b/core/test/mpi/base/bindings.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: BSD-3-Clause #include +#include #include @@ -15,13 +16,62 @@ #include "core/test/utils.hpp" +namespace detail { + + +template +inline void min(void* input, void* output, int* len, MPI_Datatype* datatype) +{ + ValueType* input_ptr = static_cast(input); + ValueType* output_ptr = static_cast(output); + for (int i = 0; i < *len; i++) { + if (input_ptr[i] < output_ptr[i]) { + output_ptr[i] = input_ptr[i]; + } + } +} + + +} // namespace detail + + +using gko::experimental::mpi::op_manager; + +template >* = nullptr> +inline op_manager min() +{ + return op_manager([]() { return MPI_MIN; }(), [](MPI_Op op) {}); +} + +template >* = nullptr> +inline op_manager min() +{ + return op_manager( + []() { + MPI_Op operation; + MPI_Op_create(&detail::min, 1, &operation); + return operation; + }(), + [](MPI_Op op) { MPI_Op_free(&op); }); +} + template class MpiBindings : public ::testing::Test { protected: using value_type = T; - MpiBindings() : ref(gko::ReferenceExecutor::create()) {} + MpiBindings() + : ref(gko::ReferenceExecutor::create()), + sum_op(gko::experimental::mpi::sum()), + max_op(gko::experimental::mpi::max()), + min_op(min()) + {} std::shared_ptr ref; + gko::experimental::mpi::op_manager sum_op; + gko::experimental::mpi::op_manager max_op; + gko::experimental::mpi::op_manager min_op; }; using TestTypes = gko::test::merge_type_list_t(input); 
-    gko::half* output_ptr = static_cast<gko::half*>(output);
-    for (int i = 0; i < *len; i++) {
-        output_ptr[i] += input_ptr[i];
+    // one-sided accumulation only supports native types
+    SKIP_IF_HALF(TypeParam);
+    using window = gko::experimental::mpi::window<TypeParam>;
+    auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    auto my_rank = comm.rank();
+    auto num_ranks = comm.size();
+    std::vector<TypeParam> data;
+    if (my_rank == 0) {
+        data = std::vector<TypeParam>{1, 2, 3, 4};
+    } else if (my_rank == 1) {
+        data = std::vector<TypeParam>{5, 6, 7, 8};
+    } else if (my_rank == 2) {
+        data = std::vector<TypeParam>{9, 10, 11, 12};
+    } else {
+        data = std::vector<TypeParam>{0, 0, 0, 0};
+    }
+
+    {
+        auto win = window(this->ref, data.data(), 4, comm);
+        if (my_rank == 0) {
+            win.lock_all();
+            for (auto rank = 0; rank < num_ranks; ++rank) {
+                if (rank != my_rank) {
+                    win.accumulate(this->ref, data.data(), 4, rank, 0, 4,
+                                   MPI_SUM);
+                }
+            }
+            win.unlock_all();
+        }
+    }
+
+    std::vector<TypeParam> ref;
+    if (my_rank == 0) {
+        ref = std::vector<TypeParam>{1, 2, 3, 4};
+        ASSERT_EQ(data, ref);
+    } else if (my_rank == 1) {
+        ref = std::vector<TypeParam>{6, 8, 10, 12};
+        ASSERT_EQ(data, ref);
+    } else if (my_rank == 2) {
+        ref = std::vector<TypeParam>{10, 12, 14, 16};
+        ASSERT_EQ(data, ref);
+    } else {
+        ref = std::vector<TypeParam>{1, 2, 3, 4};
+        ASSERT_EQ(data, ref);
     }
 }
-// TYPED_TEST(MpiBindings, CanAccumulateValues)
-// {
-//     using window = gko::experimental::mpi::window<TypeParam>;
-//     auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
-//     auto my_rank = comm.rank();
-//     auto num_ranks = comm.size();
-//     std::vector<TypeParam> data;
-//     if (my_rank == 0) {
-//         data = std::vector<TypeParam>{1, 2, 3, 4};
-//     } else if (my_rank == 1) {
-//         data = std::vector<TypeParam>{5, 6, 7, 8};
-//     } else if (my_rank == 2) {
-//         data = std::vector<TypeParam>{9, 10, 11, 12};
-//     } else {
-//         data = std::vector<TypeParam>{0, 0, 0, 0};
-//     }
-//     MPI_Op operation;
-//     MPI_Op_create(&half_sum, 1, &operation);
-//     {
-//         auto win = window(this->ref, data.data(), 4, comm);
-//         if (my_rank == 0) {
-//             win.lock_all();
-//             for (auto rank = 0; rank < num_ranks; ++rank) {
-//                 if (rank != my_rank) {
-//                     if (std::is_same_v<TypeParam, gko::half>) {
-//                         win.accumulate(this->ref, data.data(), 4, rank, 0, 4,
-//                         operation);
-//                     } else {
-//                         win.accumulate(this->ref, data.data(), 4, rank, 0, 4,
-//                         MPI_SUM);
-//                     }
-//                 }
-//             }
-//             win.unlock_all();
-//         }
-//     }
-//     MPI_Op_free(&operation);
-//     std::vector<TypeParam> ref;
-//     if (my_rank == 0) {
-//         ref = std::vector<TypeParam>{1, 2, 3, 4};
-//         ASSERT_EQ(data, ref);
-//     } else if (my_rank == 1) {
-//         ref = std::vector<TypeParam>{6, 8, 10, 12};
-//         ASSERT_EQ(data, ref);
-//     } else if (my_rank == 2) {
-//         ref = std::vector<TypeParam>{10, 12, 14, 16};
-//         ASSERT_EQ(data, ref);
-//     } else {
-//         ref = std::vector<TypeParam>{1, 2, 3, 4};
-//         ASSERT_EQ(data, ref);
-//     }
-// }
-// TYPED_TEST(MpiBindings, CanNonBlockingAccumulateValues)
-// {
-//     using window = gko::experimental::mpi::window<TypeParam>;
-//     auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
-//     auto my_rank = comm.rank();
-//     auto num_ranks = comm.size();
-//     std::vector<TypeParam> data;
-//     if (my_rank == 0) {
-//         data = std::vector<TypeParam>{1, 2, 3, 4};
-//     } else if (my_rank == 1) {
-//         data = std::vector<TypeParam>{5, 6, 7, 8};
-//     } else if (my_rank == 2) {
-//         data = std::vector<TypeParam>{9, 10, 11, 12};
-//     } else {
-//         data = std::vector<TypeParam>{0, 0, 0, 0};
-//     }
-//     gko::experimental::mpi::request req;
-//     {
-//         auto win = window(this->ref, data.data(), 4, comm);
-//         if (my_rank == 0) {
-//             win.lock_all();
-//             for (auto rank = 0; rank < num_ranks; ++rank) {
-//                 if (rank != my_rank) {
-//                     req = win.r_accumulate(this->ref, data.data(), 4, rank,
-//                     0,
-//                     4,
-//                     gko::experimental::mpi::sum<TypeParam>());
-//                 }
-//             }
-// win.unlock_all(); -// } -// } - -// req.wait(); -// std::vector ref; -// if (my_rank == 0) { -// ref = std::vector{1, 2, 3, 4}; -// ASSERT_EQ(data, ref); -// } else if (my_rank == 1) { -// ref = std::vector{6, 8, 10, 12}; -// ASSERT_EQ(data, ref); -// } else if (my_rank == 2) { -// ref = std::vector{10, 12, 14, 16}; -// ASSERT_EQ(data, ref); -// } else { -// ref = std::vector{1, 2, 3, 4}; -// ASSERT_EQ(data, ref); -// } -// } - - -// TYPED_TEST(MpiBindings, CanGetValuesWithLockAll) -// { -// using window = gko::experimental::mpi::window; -// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); -// auto my_rank = comm.rank(); -// auto num_ranks = comm.size(); -// std::vector data; -// if (my_rank == 0) { -// data = std::vector{1, 2, 3, 4}; -// } else { -// data = std::vector{0, 0, 0, 0}; -// } -// auto win = window(this->ref, data.data(), 4, comm); - -// if (my_rank != 0) { -// win.lock_all(); -// win.get(this->ref, data.data(), 4, 0, 0, 4); -// win.unlock_all(); -// } - -// auto ref = std::vector{1, 2, 3, 4}; -// ASSERT_EQ(data, ref); -// } - - -// TYPED_TEST(MpiBindings, CanNonBlockingGetValuesWithLockAll) -// { -// using window = gko::experimental::mpi::window; -// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); -// auto my_rank = comm.rank(); -// auto num_ranks = comm.size(); -// std::vector data; -// if (my_rank == 0) { -// data = std::vector{1, 2, 3, 4}; -// } else { -// data = std::vector{0, 0, 0, 0}; -// } -// gko::experimental::mpi::request req; -// auto win = window(this->ref, data.data(), 4, comm); - -// if (my_rank != 0) { -// win.lock_all(); -// req = win.r_get(this->ref, data.data(), 4, 0, 0, 4); -// win.unlock_all(); -// } - -// req.wait(); -// auto ref = std::vector{1, 2, 3, 4}; -// ASSERT_EQ(data, ref); -// } - - -// TYPED_TEST(MpiBindings, CanGetValuesWithExclusiveLock) -// { -// using window = gko::experimental::mpi::window; -// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); -// auto my_rank = comm.rank(); -// auto num_ranks = comm.size(); -// std::vector data; -// if (my_rank == 0) { -// data = std::vector{1, 2, 3, 4}; -// } else { -// data = std::vector{0, 0, 0, 0}; -// } -// auto win = window(this->ref, data.data(), 4, comm); - -// if (my_rank != 0) { -// win.lock(0, window::lock_type::exclusive); -// win.get(this->ref, data.data(), 4, 0, 0, 4); -// win.unlock(0); -// } - -// auto ref = std::vector{1, 2, 3, 4}; -// ASSERT_EQ(data, ref); -// } - - -// TYPED_TEST(MpiBindings, CanGetValuesWithSharedLock) -// { -// using window = gko::experimental::mpi::window; -// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); -// auto my_rank = comm.rank(); -// auto num_ranks = comm.size(); -// std::vector data; -// if (my_rank == 0) { -// data = std::vector{1, 2, 3, 4}; -// } else { -// data = std::vector{0, 0, 0, 0}; -// } -// auto win = window(this->ref, data.data(), 4, comm); - -// if (my_rank != 0) { -// win.lock(0); -// win.get(this->ref, data.data(), 4, 0, 0, 4); -// win.unlock(0); -// } - -// auto ref = std::vector{1, 2, 3, 4}; -// ASSERT_EQ(data, ref); -// } - - -// TYPED_TEST(MpiBindings, CanGetValuesWithFence) -// { -// using window = gko::experimental::mpi::window; -// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); -// auto my_rank = comm.rank(); -// auto num_ranks = comm.size(); -// std::vector data; -// if (my_rank == 0) { -// data = std::vector{1, 2, 3, 4}; -// } else { -// data = std::vector{0, 0, 0, 0}; -// } -// auto win = window(this->ref, data.data(), 4, comm); - -// 
win.fence(); -// if (my_rank != 0) { -// win.get(this->ref, data.data(), 4, 0, 0, 4); -// } -// win.fence(); - -// auto ref = std::vector{1, 2, 3, 4}; -// ASSERT_EQ(data, ref); -// } - - -// TYPED_TEST(MpiBindings, CanGetAccumulateValuesWithLockAll) -// { -// using window = gko::experimental::mpi::window; -// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); -// auto my_rank = comm.rank(); -// auto num_ranks = comm.size(); -// std::vector data; -// std::vector target; -// std::vector result(4, 0); -// if (my_rank == 0) { -// data = std::vector{1, 2, 3, 4}; -// target = std::vector{1, 2, 3, 4}; -// } else if (my_rank == 1) { -// data = std::vector{5, 6, 7, 8}; -// target = std::vector{5, 6, 7, 8}; -// } else if (my_rank == 2) { -// data = std::vector{9, 10, 11, 12}; -// target = std::vector{9, 10, 11, 12}; -// } else { -// data = std::vector{0, 0, 0, 0}; -// target = std::vector{0, 0, 0, 0}; -// } - -// { -// auto win = window(this->ref, target.data(), 4, comm); - -// if (my_rank == 2) { -// win.lock_all(); -// win.get_accumulate(this->ref, data.data(), 4, result.data(), 4, -// 0, -// 0, 4, -// gko::experimental::mpi::sum()); -// win.unlock_all(); -// } -// } - -// std::vector ref; -// std::vector ref2; -// if (my_rank == 0) { -// ref = std::vector{10, 12, 14, 16}; -// EXPECT_EQ(target, ref); -// } else if (my_rank == 2) { -// ref = std::vector{1, 2, 3, 4}; -// EXPECT_EQ(result, ref); -// } -// } - - -// TYPED_TEST(MpiBindings, CanNonBlockingGetAccumulateValuesWithLockAll) -// { -// using window = gko::experimental::mpi::window; -// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); -// auto my_rank = comm.rank(); -// auto num_ranks = comm.size(); -// std::vector data; -// std::vector target; -// std::vector result(4, 0); -// if (my_rank == 0) { -// data = std::vector{1, 2, 3, 4}; -// target = std::vector{1, 2, 3, 4}; -// } else if (my_rank == 1) { -// data = std::vector{5, 6, 7, 8}; -// target = std::vector{5, 6, 7, 8}; -// } else if (my_rank == 2) { -// data = std::vector{9, 10, 11, 12}; -// target = std::vector{9, 10, 11, 12}; -// } else { -// data = std::vector{0, 0, 0, 0}; -// target = std::vector{0, 0, 0, 0}; -// } -// gko::experimental::mpi::request req; - -// { -// auto win = window(this->ref, target.data(), 4, comm); - -// if (my_rank == 2) { -// win.lock_all(); -// req = win.r_get_accumulate(this->ref, data.data(), 4, -// result.data(), -// 4, 0, 0, 4, -// gko::experimental::mpi::sum()); -// win.unlock_all(); -// } -// } - -// req.wait(); -// std::vector ref; -// std::vector ref2; -// if (my_rank == 0) { -// ref = std::vector{10, 12, 14, 16}; -// ref2 = std::vector{1, 2, 3, 4}; -// EXPECT_EQ(target, ref); -// EXPECT_EQ(data, ref2); -// } else if (my_rank == 2) { -// ref = std::vector{1, 2, 3, 4}; -// ref2 = std::vector{9, 10, 11, 12}; -// EXPECT_EQ(result, ref); -// EXPECT_EQ(target, ref2); -// EXPECT_EQ(data, ref2); -// } -// } - - -// TYPED_TEST(MpiBindings, CanFetchAndOperate) -// { -// using window = gko::experimental::mpi::window; -// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); -// auto my_rank = comm.rank(); -// auto num_ranks = comm.size(); -// std::vector data; -// std::vector target; -// std::vector result(4, 0); -// if (my_rank == 0) { -// data = std::vector{1, 2, 3, 4}; -// target = std::vector{1, 2, 3, 4}; -// } else if (my_rank == 1) { -// data = std::vector{5, 6, 7, 8}; -// target = std::vector{5, 6, 7, 8}; -// } else if (my_rank == 2) { -// data = std::vector{9, 10, 11, 12}; -// target = std::vector{9, 10, 11, 
12};
-//     } else {
-//         data = std::vector<TypeParam>{0, 0, 0, 0};
-//         target = std::vector<TypeParam>{0, 0, 0, 0};
-//     }
-//     {
-//         auto win = window(this->ref, target.data(), 4, comm);
-//         if (my_rank == 2) {
-//             win.lock_all();
-//             win.fetch_and_op(this->ref, data.data(), result.data(), 0, 1,
-//                              gko::experimental::mpi::sum<TypeParam>());
-//             win.unlock_all();
-//         }
-//     }
-//     std::vector<TypeParam> ref;
-//     std::vector<TypeParam> ref2;
-//     if (my_rank == 0) {
-//         ref = std::vector<TypeParam>{1, 11, 3, 4};
-//         EXPECT_EQ(target, ref);
-//     } else if (my_rank == 2) {
-//         ref = std::vector<TypeParam>{2, 0, 0, 0};
-//         EXPECT_EQ(result, ref);
-//     }
-// }
-// TYPED_TEST(MpiBindings, CanBroadcastValues)
-// {
-//     auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
-//     auto my_rank = comm.rank();
-//     auto num_ranks = comm.size();
-//     auto array = gko::array<TypeParam>{this->ref, 8};
-//     if (my_rank == 0) {
-//         array = gko::array<TypeParam>(this->ref, {2, 3, 1, 3, -1, 0, 3, 1});
-//     }
-//     comm.broadcast(this->ref, array.get_data(), 8, 0);
-//     auto ref = gko::array<TypeParam>(this->ref, {2, 3, 1, 3, -1, 0, 3, 1});
-//     GKO_ASSERT_ARRAY_EQ(ref, array);
-// }
-// TYPED_TEST(MpiBindings, CanNonBlockingBroadcastValues)
-// {
-//     auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
-//     auto my_rank = comm.rank();
-//     auto num_ranks = comm.size();
-//     auto array = gko::array<TypeParam>{this->ref, 8};
-//     if (my_rank == 0) {
-//         array = gko::array<TypeParam>(this->ref, {2, 3, 1, 3, -1, 0, 3, 1});
-//     }
-//     auto req = comm.i_broadcast(this->ref, array.get_data(), 8, 0);
-//     req.wait();
-//     auto ref = gko::array<TypeParam>(this->ref, {2, 3, 1, 3, -1, 0, 3, 1});
-//     GKO_ASSERT_ARRAY_EQ(ref, array);
-// }
-// TYPED_TEST(MpiBindings, CanReduceValues)
-// {
-//     auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
-//     auto my_rank = comm.rank();
-//     auto num_ranks = comm.size();
-//     TypeParam data, sum, max, min;
-//     if (my_rank == 0) {
-//         data = 3;
-//     } else if (my_rank == 1) {
-//         data = 5;
-//     } else if (my_rank == 2) {
-//         data = 2;
-//     } else if (my_rank == 3) {
-//         data = 6;
-//     }
-//     comm.reduce(this->ref, &data, &sum, 1,
-//     gko::experimental::mpi::sum<TypeParam>(), 0); comm.reduce(this->ref,
-//     &data, &max, 1, MPI_MAX, 0); comm.reduce(this->ref, &data, &min, 1,
-//     MPI_MIN, 0);
-//     if (my_rank == 0) {
-//         EXPECT_EQ(sum, TypeParam{16});
-//         EXPECT_EQ(max, TypeParam{6});
-//         EXPECT_EQ(min, TypeParam{2});
-//     }
-// }
-// TYPED_TEST(MpiBindings, CanNonBlockingReduceValues)
-// {
-//     auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
-//     auto my_rank = comm.rank();
-//     auto num_ranks = comm.size();
-//     TypeParam data, sum, max, min;
-//     if (my_rank == 0) {
-//         data = 3;
-//     } else if (my_rank == 1) {
-//         data = 5;
-//     } else if (my_rank == 2) {
-//         data = 2;
-//     } else if (my_rank == 3) {
-//         data = 6;
-//     }
-//     auto req1 = comm.i_reduce(this->ref, &data, &sum, 1,
-//     gko::experimental::mpi::sum<TypeParam>(), 0); auto req2 =
-//     comm.i_reduce(this->ref, &data, &max, 1, MPI_MAX, 0); auto req3 =
-//     comm.i_reduce(this->ref, &data, &min, 1, MPI_MIN, 0);
-//     req1.wait();
-//     req2.wait();
-//     req3.wait();
-//     if (my_rank == 0) {
-//         EXPECT_EQ(sum, TypeParam{16});
-//         EXPECT_EQ(max, TypeParam{6});
-//         EXPECT_EQ(min, TypeParam{2});
-//     }
-// }
-struct sum_op {
-    template <typename ValueType>
-    void operator()(void* input, void* output, int* len, MPI_Datatype* datatype)
+
+TYPED_TEST(MpiBindings, CanNonBlockingAccumulateValues)
+{
+    // one-sided accumulation only supports native types
+    SKIP_IF_HALF(TypeParam);
+    using window = gko::experimental::mpi::window<TypeParam>;
+    auto comm =
gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + std::vector data; + if (my_rank == 0) { + data = std::vector{1, 2, 3, 4}; + } else if (my_rank == 1) { + data = std::vector{5, 6, 7, 8}; + } else if (my_rank == 2) { + data = std::vector{9, 10, 11, 12}; + } else { + data = std::vector{0, 0, 0, 0}; + } + + gko::experimental::mpi::request req; { - ValueType* input_ptr = static_cast(input); - ValueType* output_ptr = static_cast(output); - for (int i = 0; i < *len; i++) { - output_ptr[i] += input_ptr[i]; + auto win = window(this->ref, data.data(), 4, comm); + if (my_rank == 0) { + win.lock_all(); + for (auto rank = 0; rank < num_ranks; ++rank) { + if (rank != my_rank) { + req = win.r_accumulate(this->ref, data.data(), 4, rank, 0, + 4, MPI_SUM); + } + } + win.unlock_all(); } } -}; + + req.wait(); + std::vector ref; + if (my_rank == 0) { + ref = std::vector{1, 2, 3, 4}; + ASSERT_EQ(data, ref); + } else if (my_rank == 1) { + ref = std::vector{6, 8, 10, 12}; + ASSERT_EQ(data, ref); + } else if (my_rank == 2) { + ref = std::vector{10, 12, 14, 16}; + ASSERT_EQ(data, ref); + } else { + ref = std::vector{1, 2, 3, 4}; + ASSERT_EQ(data, ref); + } +} + + +TYPED_TEST(MpiBindings, CanGetValuesWithLockAll) +{ + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + std::vector data; + if (my_rank == 0) { + data = std::vector{1, 2, 3, 4}; + } else { + data = std::vector{0, 0, 0, 0}; + } + auto win = window(this->ref, data.data(), 4, comm); + + if (my_rank != 0) { + win.lock_all(); + win.get(this->ref, data.data(), 4, 0, 0, 4); + win.unlock_all(); + } + + auto ref = std::vector{1, 2, 3, 4}; + ASSERT_EQ(data, ref); +} + + +TYPED_TEST(MpiBindings, CanNonBlockingGetValuesWithLockAll) +{ + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + std::vector data; + if (my_rank == 0) { + data = std::vector{1, 2, 3, 4}; + } else { + data = std::vector{0, 0, 0, 0}; + } + gko::experimental::mpi::request req; + auto win = window(this->ref, data.data(), 4, comm); + + if (my_rank != 0) { + win.lock_all(); + req = win.r_get(this->ref, data.data(), 4, 0, 0, 4); + win.unlock_all(); + } + + req.wait(); + auto ref = std::vector{1, 2, 3, 4}; + ASSERT_EQ(data, ref); +} + + +TYPED_TEST(MpiBindings, CanGetValuesWithExclusiveLock) +{ + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + std::vector data; + if (my_rank == 0) { + data = std::vector{1, 2, 3, 4}; + } else { + data = std::vector{0, 0, 0, 0}; + } + auto win = window(this->ref, data.data(), 4, comm); + + if (my_rank != 0) { + win.lock(0, window::lock_type::exclusive); + win.get(this->ref, data.data(), 4, 0, 0, 4); + win.unlock(0); + } + + auto ref = std::vector{1, 2, 3, 4}; + ASSERT_EQ(data, ref); +} + + +TYPED_TEST(MpiBindings, CanGetValuesWithSharedLock) +{ + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + std::vector data; + if (my_rank == 0) { + data = std::vector{1, 2, 3, 4}; + } else { + data = std::vector{0, 0, 0, 0}; + } + auto win = window(this->ref, data.data(), 4, comm); + + if (my_rank != 0) { + 
win.lock(0);
+        win.get(this->ref, data.data(), 4, 0, 0, 4);
+        win.unlock(0);
+    }
+
+    auto ref = std::vector<TypeParam>{1, 2, 3, 4};
+    ASSERT_EQ(data, ref);
+}
+
+
+TYPED_TEST(MpiBindings, CanGetValuesWithFence)
+{
+    using window = gko::experimental::mpi::window<TypeParam>;
+    auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    auto my_rank = comm.rank();
+    auto num_ranks = comm.size();
+    std::vector<TypeParam> data;
+    if (my_rank == 0) {
+        data = std::vector<TypeParam>{1, 2, 3, 4};
+    } else {
+        data = std::vector<TypeParam>{0, 0, 0, 0};
+    }
+    auto win = window(this->ref, data.data(), 4, comm);
+
+    win.fence();
+    if (my_rank != 0) {
+        win.get(this->ref, data.data(), 4, 0, 0, 4);
+    }
+    win.fence();
+
+    auto ref = std::vector<TypeParam>{1, 2, 3, 4};
+    ASSERT_EQ(data, ref);
+}
+
+
+TYPED_TEST(MpiBindings, CanGetAccumulateValuesWithLockAll)
+{
+    // one-sided accumulation only supports native types
+    SKIP_IF_HALF(TypeParam);
+    using window = gko::experimental::mpi::window<TypeParam>;
+    auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    auto my_rank = comm.rank();
+    auto num_ranks = comm.size();
+    std::vector<TypeParam> data;
+    std::vector<TypeParam> target;
+    std::vector<TypeParam> result(4, 0);
+    if (my_rank == 0) {
+        data = std::vector<TypeParam>{1, 2, 3, 4};
+        target = std::vector<TypeParam>{1, 2, 3, 4};
+    } else if (my_rank == 1) {
+        data = std::vector<TypeParam>{5, 6, 7, 8};
+        target = std::vector<TypeParam>{5, 6, 7, 8};
+    } else if (my_rank == 2) {
+        data = std::vector<TypeParam>{9, 10, 11, 12};
+        target = std::vector<TypeParam>{9, 10, 11, 12};
+    } else {
+        data = std::vector<TypeParam>{0, 0, 0, 0};
+        target = std::vector<TypeParam>{0, 0, 0, 0};
+    }
+
+    {
+        auto win = window(this->ref, target.data(), 4, comm);
+
+        if (my_rank == 2) {
+            win.lock_all();
+            win.get_accumulate(this->ref, data.data(), 4, result.data(), 4, 0,
+                               0, 4, MPI_SUM);
+            win.unlock_all();
+        }
+    }
+
+    std::vector<TypeParam> ref;
+    std::vector<TypeParam> ref2;
+    if (my_rank == 0) {
+        ref = std::vector<TypeParam>{10, 12, 14, 16};
+        EXPECT_EQ(target, ref);
+    } else if (my_rank == 2) {
+        ref = std::vector<TypeParam>{1, 2, 3, 4};
+        EXPECT_EQ(result, ref);
+    }
+}
+
+
+TYPED_TEST(MpiBindings, CanNonBlockingGetAccumulateValuesWithLockAll)
+{
+    // one-sided accumulation only supports native types
+    SKIP_IF_HALF(TypeParam);
+    using window = gko::experimental::mpi::window<TypeParam>;
+    auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    auto my_rank = comm.rank();
+    auto num_ranks = comm.size();
+    std::vector<TypeParam> data;
+    std::vector<TypeParam> target;
+    std::vector<TypeParam> result(4, 0);
+    if (my_rank == 0) {
+        data = std::vector<TypeParam>{1, 2, 3, 4};
+        target = std::vector<TypeParam>{1, 2, 3, 4};
+    } else if (my_rank == 1) {
+        data = std::vector<TypeParam>{5, 6, 7, 8};
+        target = std::vector<TypeParam>{5, 6, 7, 8};
+    } else if (my_rank == 2) {
+        data = std::vector<TypeParam>{9, 10, 11, 12};
+        target = std::vector<TypeParam>{9, 10, 11, 12};
+    } else {
+        data = std::vector<TypeParam>{0, 0, 0, 0};
+        target = std::vector<TypeParam>{0, 0, 0, 0};
+    }
+    gko::experimental::mpi::request req;
+
+    {
+        auto win = window(this->ref, target.data(), 4, comm);
+
+        if (my_rank == 2) {
+            win.lock_all();
+            req = win.r_get_accumulate(this->ref, data.data(), 4, result.data(),
+                                       4, 0, 0, 4, MPI_SUM);
+            win.unlock_all();
+        }
+    }
+
+    req.wait();
+    std::vector<TypeParam> ref;
+    std::vector<TypeParam> ref2;
+    if (my_rank == 0) {
+        ref = std::vector<TypeParam>{10, 12, 14, 16};
+        ref2 = std::vector<TypeParam>{1, 2, 3, 4};
+        EXPECT_EQ(target, ref);
+        EXPECT_EQ(data, ref2);
+    } else if (my_rank == 2) {
+        ref = std::vector<TypeParam>{1, 2, 3, 4};
+        ref2 = std::vector<TypeParam>{9, 10, 11, 12};
+        EXPECT_EQ(result, ref);
+        EXPECT_EQ(target, ref2);
+        EXPECT_EQ(data, ref2);
+    }
+}
+
+
+TYPED_TEST(MpiBindings, CanFetchAndOperate)
+{
+    // one-sided operation only supports native types
+    SKIP_IF_HALF(TypeParam);
+    using window =
gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + std::vector data; + std::vector target; + std::vector result(4, 0); + if (my_rank == 0) { + data = std::vector{1, 2, 3, 4}; + target = std::vector{1, 2, 3, 4}; + } else if (my_rank == 1) { + data = std::vector{5, 6, 7, 8}; + target = std::vector{5, 6, 7, 8}; + } else if (my_rank == 2) { + data = std::vector{9, 10, 11, 12}; + target = std::vector{9, 10, 11, 12}; + } else { + data = std::vector{0, 0, 0, 0}; + target = std::vector{0, 0, 0, 0}; + } + + { + auto win = window(this->ref, target.data(), 4, comm); + + if (my_rank == 2) { + win.lock_all(); + win.fetch_and_op(this->ref, data.data(), result.data(), 0, 1, + MPI_SUM); + win.unlock_all(); + } + } + + std::vector ref; + std::vector ref2; + if (my_rank == 0) { + ref = std::vector{1, 11, 3, 4}; + EXPECT_EQ(target, ref); + } else if (my_rank == 2) { + ref = std::vector{2, 0, 0, 0}; + EXPECT_EQ(result, ref); + } +} + + +TYPED_TEST(MpiBindings, CanBroadcastValues) +{ + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + auto array = gko::array{this->ref, 8}; + if (my_rank == 0) { + array = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); + } + + comm.broadcast(this->ref, array.get_data(), 8, 0); + + auto ref = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); + GKO_ASSERT_ARRAY_EQ(ref, array); +} + + +TYPED_TEST(MpiBindings, CanNonBlockingBroadcastValues) +{ + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + auto array = gko::array{this->ref, 8}; + if (my_rank == 0) { + array = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); + } + + auto req = comm.i_broadcast(this->ref, array.get_data(), 8, 0); + + req.wait(); + auto ref = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); + GKO_ASSERT_ARRAY_EQ(ref, array); +} + + +TYPED_TEST(MpiBindings, CanReduceValues) +{ + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + TypeParam data, sum, max, min; + if (my_rank == 0) { + data = 3; + } else if (my_rank == 1) { + data = 5; + } else if (my_rank == 2) { + data = 2; + } else if (my_rank == 3) { + data = 6; + } + + comm.reduce(this->ref, &data, &sum, 1, this->sum_op.get(), 0); + comm.reduce(this->ref, &data, &max, 1, this->max_op.get(), 0); + comm.reduce(this->ref, &data, &min, 1, this->min_op.get(), 0); + + if (my_rank == 0) { + EXPECT_EQ(sum, TypeParam{16}); + EXPECT_EQ(max, TypeParam{6}); + EXPECT_EQ(min, TypeParam{2}); + } +} + + +TYPED_TEST(MpiBindings, CanNonBlockingReduceValues) +{ + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + TypeParam data, sum, max, min; + if (my_rank == 0) { + data = 3; + } else if (my_rank == 1) { + data = 5; + } else if (my_rank == 2) { + data = 2; + } else if (my_rank == 3) { + data = 6; + } + + auto req1 = comm.i_reduce(this->ref, &data, &sum, 1, this->sum_op.get(), 0); + auto req2 = comm.i_reduce(this->ref, &data, &max, 1, this->max_op.get(), 0); + auto req3 = comm.i_reduce(this->ref, &data, &min, 1, this->min_op.get(), 0); + + req1.wait(); + req2.wait(); + req3.wait(); + if (my_rank == 0) { + EXPECT_EQ(sum, TypeParam{16}); + EXPECT_EQ(max, TypeParam{6}); + EXPECT_EQ(min, TypeParam{2}); + } +} + + TYPED_TEST(MpiBindings, CanAllReduceValues) { 
auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); @@ -787,20 +814,9 @@ TYPED_TEST(MpiBindings, CanAllReduceValues) } else if (my_rank == 3) { data = 6; } - MPI_Op operation; - MPI_Op_create(&half_sum, 1, &operation); - // if (std::is_same_v) { - // comm.all_reduce(this->ref, &data, &sum, 1, operation); - // } else { - // gko::experimental::mpi::op_type op(1, MPI_SUM, - // gko::experimental::mpi::detail::sum); - // gko::experimental::mpi::op_type op(1, MPI_SUM, sum_op()); - auto op = gko::experimental::mpi::sum(); - comm.all_reduce(this->ref, &data, &sum, 1, op.get()); - // } - // comm.all_reduce(this->ref, &data, &sum, 1, - // gko::experimental::mpi::sum()); - MPI_Op_free(&operation); + + comm.all_reduce(this->ref, &data, &sum, 1, this->sum_op.get()); + ASSERT_EQ(sum, TypeParam{16}); } @@ -821,7 +837,7 @@ TYPED_TEST(MpiBindings, CanAllReduceValuesInPlace) data = 6; } - comm.all_reduce(this->ref, &data, 1, MPI_SUM); + comm.all_reduce(this->ref, &data, 1, this->sum_op.get()); ASSERT_EQ(data, TypeParam{16}); } @@ -843,7 +859,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValues) data = 6; } - auto req = comm.i_all_reduce(this->ref, &data, &sum, 1, MPI_SUM); + auto req = comm.i_all_reduce(this->ref, &data, &sum, 1, this->sum_op.get()); req.wait(); ASSERT_EQ(sum, TypeParam{16}); @@ -866,7 +882,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValuesInPlace) data = 6; } - auto req = comm.i_all_reduce(this->ref, &data, 1, MPI_SUM); + auto req = comm.i_all_reduce(this->ref, &data, 1, this->sum_op.get()); req.wait(); ASSERT_EQ(data, TypeParam{16}); @@ -1482,9 +1498,9 @@ TYPED_TEST(MpiBindings, CanScanValues) data = 6; } - comm.scan(this->ref, &data, &sum, 1, MPI_SUM); - comm.scan(this->ref, &data, &max, 1, MPI_MAX); - comm.scan(this->ref, &data, &min, 1, MPI_MIN); + comm.scan(this->ref, &data, &sum, 1, this->sum_op.get()); + comm.scan(this->ref, &data, &max, 1, this->max_op.get()); + comm.scan(this->ref, &data, &min, 1, this->min_op.get()); if (my_rank == 0) { EXPECT_EQ(sum, TypeParam{3}); @@ -1522,9 +1538,9 @@ TYPED_TEST(MpiBindings, CanNonBlockingScanValues) data = 6; } - auto req1 = comm.i_scan(this->ref, &data, &sum, 1, MPI_SUM); - auto req2 = comm.i_scan(this->ref, &data, &max, 1, MPI_MAX); - auto req3 = comm.i_scan(this->ref, &data, &min, 1, MPI_MIN); + auto req1 = comm.i_scan(this->ref, &data, &sum, 1, this->sum_op.get()); + auto req2 = comm.i_scan(this->ref, &data, &max, 1, this->max_op.get()); + auto req3 = comm.i_scan(this->ref, &data, &min, 1, this->min_op.get()); req1.wait(); req2.wait(); diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 82c6319f6f9..5642abbd4d7 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -101,30 +101,35 @@ GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_DOUBLE_COMPLEX); namespace detail { -inline void half_sum(void* input, void* output, int* len, - MPI_Datatype* datatype) + +template +inline void sum(void* input, void* output, int* len, MPI_Datatype* datatype) { - gko::half* input_ptr = static_cast(input); - gko::half* output_ptr = static_cast(output); + ValueType* input_ptr = static_cast(input); + ValueType* output_ptr = static_cast(output); for (int i = 0; i < *len; i++) { output_ptr[i] += input_ptr[i]; } } template -inline void sum(void* input, void* output, int* len, MPI_Datatype* datatype) +inline void max(void* input, void* output, int* len, MPI_Datatype* datatype) { ValueType* input_ptr = static_cast(input); ValueType* output_ptr = static_cast(output); for (int i = 
0; i < *len; i++) { - output_ptr[i] += input_ptr[i]; + if (input_ptr[i] > output_ptr[i]) { + output_ptr[i] = input_ptr[i]; + } } } + } // namespace detail -using op_manager = std::unique_ptr>; +using op_manager = std::unique_ptr::element_type, + std::function>; template >* = nullptr> @@ -137,113 +142,36 @@ template >* = nullptr> inline op_manager sum() { - // MPI_Op is MPI_ABI_Op* return op_manager( []() { MPI_Op operation; MPI_Op_create(&detail::sum, 1, &operation); - // MPI_Op_create(&detail::half_sum, 1, operation); - std::cout << "custom operator" << std::endl; return operation; }(), [](MPI_Op op) { MPI_Op_free(&op); }); } -/** - * A move-only wrapper for a contiguous MPI_Datatype. - * - * The underlying MPI_Datatype is automatically created and committed when an - * object of this type is constructed, and freed when it is destructed. - */ -template -class op_type { -public: - template - struct mpi_native_type - : std::conditional_t, std::true_type, - std::false_type> {}; - - /** - * Constructs a wrapper for a contiguous MPI_Datatype. - * - * @param count the number of old_type elements the new datatype contains. - * @param old_type the MPI_Datatype that is contained. - */ - template - op_type(int commutativity, MPI_Op default_op, AltFunc&& alt_func) - : custom_(false), handle_(MPI_OP_NULL) - { - custom_ = mpi_native_type::value; - if constexpr (mpi_native_type::value) { - auto op = alt_func.template operator(); - GKO_ASSERT_NO_MPI_ERRORS( - MPI_Op_create(&op, commutativity, &handle_)); - } else { - handle_ = default_op; - } - } - - /** - * Constructs empty wrapper with MPI_OP_NULL. - */ - op_type() : handle_(MPI_OP_NULL) {} - - /** - * Disallow copying of wrapper type. - */ - op_type(const op_type&) = delete; - - /** - * Disallow copying of wrapper type. - */ - op_type& operator=(const op_type&) = delete; - - /** - * Move constructor, leaves other with MPI_OP_NULL. - * - * @param other to be moved from object. - */ - op_type(op_type&& other) noexcept : handle_(MPI_OP_NULL) - { - *this = std::move(other); - } - - /** - * Move assignment, leaves other with MPI_OP_NULL. - * - * @param other to be moved from object. - * - * @return this object. - */ - op_type& operator=(op_type&& other) noexcept - { - if (this != &other) { - this->handle_ = std::exchange(other.handle_, MPI_OP_NULL); - } - return *this; - } - /** - * Destructs object by freeing wrapped MPI_Datatype. - */ - ~op_type() - { - if (custom_ && handle_ != MPI_OP_NULL) { - MPI_Op_free(&handle_); - } - } +template >* = nullptr> +inline op_manager max() +{ + return op_manager([]() { return MPI_MAX; }(), [](MPI_Op op) {}); +} - /** - * Access the underlying MPI_Op. - * - * @return the underlying MPI_Op. - */ - MPI_Op get() const { return handle_; } +template >* = nullptr> +inline op_manager max() +{ + return op_manager( + []() { + MPI_Op operation; + MPI_Op_create(&detail::max, 1, &operation); + return operation; + }(), + [](MPI_Op op) { MPI_Op_free(&op); }); +} -private: - bool custom_; - MPI_Op handle_; -}; /** * A move-only wrapper for a contiguous MPI_Datatype. 
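
A minimal usage sketch of these factories (hypothetical code, not part of the
diffs above; it assumes a communicator comm and a reference executor exec, as
in the tests): for an MPI-native type the manager simply hands out MPI_SUM
with a no-op deleter, while for gko::half it owns an operation created through
MPI_Op_create and releases it with MPI_Op_free once the manager is destroyed.

    // gko::half is not MPI-native, so this selects the custom operation
    auto sum_op = gko::experimental::mpi::sum<gko::half>();
    gko::half data{1};
    // in-place all-reduce: every rank contributes data and receives the sum
    comm.all_reduce(exec, &data, 1, sum_op.get());
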
@@ -893,22 +821,9 @@ class communicator { ReduceType* recv_buffer, int count, MPI_Op operation) const { auto guard = exec->get_scoped_device_id_guard(); - if constexpr (std::is_same_v) { - if (operation == MPI_SUM) { - MPI_Op op; - MPI_Op_create(&detail::half_sum, 1, &op); - GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( - MPI_IN_PLACE, recv_buffer, count, - type_impl::get_type(), op, this->get())); - MPI_Op_free(&op); - } else { - GKO_NOT_IMPLEMENTED; - } - } else { - GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( - MPI_IN_PLACE, recv_buffer, count, - type_impl::get_type(), operation, this->get())); - } + GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( + MPI_IN_PLACE, recv_buffer, count, type_impl::get_type(), + operation, this->get())); } /** @@ -933,24 +848,9 @@ class communicator { { auto guard = exec->get_scoped_device_id_guard(); request req; - if constexpr (std::is_same_v) { - if (operation == MPI_SUM) { - MPI_Op op; - MPI_Op_create(&detail::half_sum, 1, &op); - GKO_ASSERT_NO_MPI_ERRORS( - MPI_Iallreduce(MPI_IN_PLACE, recv_buffer, count, - type_impl::get_type(), op, - this->get(), req.get())); - MPI_Op_free(&op); - } else { - GKO_NOT_IMPLEMENTED; - } - } else { - GKO_ASSERT_NO_MPI_ERRORS( - MPI_Iallreduce(MPI_IN_PLACE, recv_buffer, count, - type_impl::get_type(), operation, - this->get(), req.get())); - } + GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallreduce( + MPI_IN_PLACE, recv_buffer, count, type_impl::get_type(), + operation, this->get(), req.get())); return req; } @@ -974,22 +874,9 @@ class communicator { int count, MPI_Op operation) const { auto guard = exec->get_scoped_device_id_guard(); - // if constexpr (std::is_same_v) { - // if (operation == MPI_SUM) { - // MPI_Op op; - // MPI_Op_create(&detail::half_sum, 1, &op); - // GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( - // send_buffer, recv_buffer, count, - // type_impl::get_type(), op, this->get())); - // MPI_Op_free(&op); - // } else { - // GKO_NOT_IMPLEMENTED; - // } - // } else { GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( send_buffer, recv_buffer, count, type_impl::get_type(), operation, this->get())); - // } } /** @@ -1015,24 +902,9 @@ class communicator { { auto guard = exec->get_scoped_device_id_guard(); request req; - if constexpr (std::is_same_v) { - if (operation == MPI_SUM) { - MPI_Op op; - MPI_Op_create(&detail::half_sum, 1, &op); - GKO_ASSERT_NO_MPI_ERRORS( - MPI_Iallreduce(send_buffer, recv_buffer, count, - type_impl::get_type(), op, - this->get(), req.get())); - MPI_Op_free(&op); - } else { - GKO_NOT_IMPLEMENTED; - } - } else { - GKO_ASSERT_NO_MPI_ERRORS( - MPI_Iallreduce(send_buffer, recv_buffer, count, - type_impl::get_type(), operation, - this->get(), req.get())); - } + GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallreduce( + send_buffer, recv_buffer, count, type_impl::get_type(), + operation, this->get(), req.get())); return req; } From bed8415c8571df5c0bd14fc7f98513e52cfa5258 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Mon, 30 Dec 2024 16:56:17 +0100 Subject: [PATCH 4/5] enable vector half --- core/distributed/vector.cpp | 38 ++++++++++++++-------- core/test/utils.hpp | 3 ++ include/ginkgo/core/base/mpi.hpp | 21 ++++++++---- include/ginkgo/core/distributed/vector.hpp | 2 ++ test/mpi/vector.cpp | 19 +++++------ 5 files changed, 53 insertions(+), 30 deletions(-) diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 86eb450b888..732f8d5b3ef 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -64,7 +64,9 @@ Vector::Vector(std::shared_ptr exec, dim<2> local_size, size_type stride) : EnableLinOp{exec, global_size}, DistributedBase{comm}, - local_{exec, local_size, stride} + local_{exec, local_size, stride}, + sum_op_(mpi::sum()), + norm_sum_op_(mpi::sum>()) { GKO_ASSERT_EQUAL_COLS(global_size, local_size); } @@ -75,7 +77,9 @@ Vector::Vector(std::shared_ptr exec, std::unique_ptr local_vector) : EnableLinOp{exec, global_size}, DistributedBase{comm}, - local_{exec} + local_{exec}, + sum_op_(mpi::sum()), + norm_sum_op_(mpi::sum>()) { local_vector->move_to(&local_); } @@ -85,7 +89,11 @@ template Vector::Vector(std::shared_ptr exec, mpi::communicator comm, std::unique_ptr local_vector) - : EnableLinOp{exec, {}}, DistributedBase{comm}, local_{exec} + : EnableLinOp{exec, {}}, + DistributedBase{comm}, + local_{exec}, + sum_op_(mpi::sum()), + norm_sum_op_(mpi::sum>()) { this->set_size(compute_global_size(exec, comm, local_vector->get_size())); local_vector->move_to(&local_); @@ -467,11 +475,11 @@ void Vector::compute_dot(ptr_param b, host_reduction_buffer_->copy_from(dense_res.get()); comm.all_reduce(exec->get_master(), host_reduction_buffer_->get_values(), - static_cast(this->get_size()[1]), MPI_SUM); + static_cast(this->get_size()[1]), sum_op_.get()); dense_res->copy_from(host_reduction_buffer_.get()); } else { comm.all_reduce(exec, dense_res->get_values(), - static_cast(this->get_size()[1]), MPI_SUM); + static_cast(this->get_size()[1]), sum_op_.get()); } } @@ -503,11 +511,11 @@ void Vector::compute_conj_dot(ptr_param b, host_reduction_buffer_->copy_from(dense_res.get()); comm.all_reduce(exec->get_master(), host_reduction_buffer_->get_values(), - static_cast(this->get_size()[1]), MPI_SUM); + static_cast(this->get_size()[1]), sum_op_.get()); dense_res->copy_from(host_reduction_buffer_.get()); } else { comm.all_reduce(exec, dense_res->get_values(), - static_cast(this->get_size()[1]), MPI_SUM); + static_cast(this->get_size()[1]), sum_op_.get()); } } @@ -556,11 +564,13 @@ void Vector::compute_norm1(ptr_param result, host_norm_buffer_.init(exec->get_master(), dense_res->get_size()); host_norm_buffer_->copy_from(dense_res.get()); comm.all_reduce(exec->get_master(), host_norm_buffer_->get_values(), - static_cast(this->get_size()[1]), MPI_SUM); + static_cast(this->get_size()[1]), + norm_sum_op_.get()); dense_res->copy_from(host_norm_buffer_.get()); } else { comm.all_reduce(exec, dense_res->get_values(), - static_cast(this->get_size()[1]), MPI_SUM); + static_cast(this->get_size()[1]), + norm_sum_op_.get()); } } @@ -589,11 +599,13 @@ void Vector::compute_squared_norm2(ptr_param result, host_norm_buffer_.init(exec->get_master(), dense_res->get_size()); host_norm_buffer_->copy_from(dense_res.get()); comm.all_reduce(exec->get_master(), host_norm_buffer_->get_values(), - static_cast(this->get_size()[1]), MPI_SUM); + static_cast(this->get_size()[1]), + norm_sum_op_.get()); dense_res->copy_from(host_norm_buffer_.get()); } else { comm.all_reduce(exec, 
dense_res->get_values(), - static_cast(this->get_size()[1]), MPI_SUM); + static_cast(this->get_size()[1]), + norm_sum_op_.get()); } } @@ -632,10 +644,10 @@ void Vector::compute_mean(ptr_param result, host_reduction_buffer_->copy_from(dense_res.get()); comm.all_reduce(exec->get_master(), host_reduction_buffer_->get_values(), num_vecs, - MPI_SUM); + sum_op_.get()); dense_res->copy_from(host_reduction_buffer_.get()); } else { - comm.all_reduce(exec, dense_res->get_values(), num_vecs, MPI_SUM); + comm.all_reduce(exec, dense_res->get_values(), num_vecs, sum_op_.get()); } } diff --git a/core/test/utils.hpp b/core/test/utils.hpp index e9c4c5e0c99..2c9a570b7fe 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -396,6 +396,9 @@ using TwoValueIndexTypes = add_to_cartesian_type_product_t< using ValueLocalGlobalIndexTypesBase = add_to_cartesian_type_product_left_t; +using ValueLocalGlobalIndexTypes = + add_to_cartesian_type_product_left_t; + template struct reduction_factor { diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 5642abbd4d7..555ab7099b1 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -124,22 +124,31 @@ inline void max(void* input, void* output, int* len, MPI_Datatype* datatype) } } +template +struct is_mpi_native { + constexpr static bool value = + std::is_arithmetic_v || + std::is_same_v> || + std::is_same_v>; +}; + } // namespace detail -using op_manager = std::unique_ptr::element_type, - std::function>; +// using op_manager = std::unique_ptr::element_type, +// std::function>; +using op_manager = std::shared_ptr::element_type>; template >* = nullptr> + std::enable_if_t::value>* = nullptr> inline op_manager sum() { return op_manager([]() { return MPI_SUM; }(), [](MPI_Op op) {}); } template >* = nullptr> + std::enable_if_t::value>* = nullptr> inline op_manager sum() { return op_manager( @@ -153,14 +162,14 @@ inline op_manager sum() template >* = nullptr> + std::enable_if_t::value>* = nullptr> inline op_manager max() { return op_manager([]() { return MPI_MAX; }(), [](MPI_Op op) {}); } template >* = nullptr> + std::enable_if_t::value>* = nullptr> inline op_manager max() { return op_manager( diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index 20ccfb6435e..181a7de3460 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -662,6 +662,8 @@ class Vector local_vector_type local_; ::gko::detail::DenseCache host_reduction_buffer_; ::gko::detail::DenseCache> host_norm_buffer_; + mpi::op_manager sum_op_; + mpi::op_manager norm_sum_op_; }; diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp index a65bbc7fd36..01e58ddd517 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -95,7 +95,7 @@ class VectorCreation : public CommonMpiTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(VectorCreation, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(VectorCreation, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); @@ -361,7 +361,7 @@ class VectorCreationHelpers : public CommonMpiTestFixture { std::unique_ptr dst; }; -TYPED_TEST_SUITE(VectorCreationHelpers, gko::test::ValueTypesBase, +TYPED_TEST_SUITE(VectorCreationHelpers, gko::test::ValueTypes, TypenameNameGenerator); @@ -462,8 +462,7 @@ class VectorReductions : public CommonMpiTestFixture { global_index_type>( size[0], size[1], std::uniform_int_distribution(size[1], size[1]), - std::normal_distribution>(), 
- engine); + std::normal_distribution<>(), engine); dense_x->read(md_x); auto tmp_x = dist_vec_type::create(ref, comm); tmp_x->read_distributed(md_x, part); @@ -473,8 +472,7 @@ class VectorReductions : public CommonMpiTestFixture { global_index_type>( size[0], size[1], std::uniform_int_distribution(size[1], size[1]), - std::normal_distribution>(), - engine); + std::normal_distribution<>(), engine); dense_y->read(md_y); auto tmp_y = dist_vec_type::create(ref, comm); tmp_y->read_distributed(md_y, part); @@ -513,7 +511,7 @@ class VectorReductions : public CommonMpiTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(VectorReductions, gko::test::ValueTypesBase, +TYPED_TEST_SUITE(VectorReductions, gko::test::ValueTypes, TypenameNameGenerator); @@ -799,8 +797,7 @@ class VectorLocalOps : public CommonMpiTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(VectorLocalOps, gko::test::ValueTypesBase, - TypenameNameGenerator); +TYPED_TEST_SUITE(VectorLocalOps, gko::test::ValueTypes, TypenameNameGenerator); TYPED_TEST(VectorLocalOps, ApplyNotSupported) @@ -838,7 +835,7 @@ TYPED_TEST(VectorLocalOps, AdvancedApplyNotSupported) TYPED_TEST(VectorLocalOps, ConvertsToPrecision) { using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision_base; + using OtherT = typename gko::next_precision; using OtherVector = typename gko::experimental::distributed::Vector; auto local_tmp = OtherVector::local_vector_type::create(this->exec); auto tmp = OtherVector::create(this->exec, this->comm); @@ -854,7 +851,7 @@ TYPED_TEST(VectorLocalOps, ConvertsToPrecision) TYPED_TEST(VectorLocalOps, MovesToPrecision) { using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision_base; + using OtherT = typename gko::next_precision; using OtherVector = typename gko::experimental::distributed::Vector; auto local_tmp = OtherVector::local_vector_type::create(this->exec); auto tmp = OtherVector::create(this->exec, this->comm); From f50a7d127cf17ac9157c93ee4cfcb3d182001d47 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 2 Jan 2025 09:29:24 +0100 Subject: [PATCH 5/5] enable half in distributed matrix/vector/pgm/gmres --- core/distributed/vector_cache.cpp | 4 +- core/multigrid/pgm.cpp | 269 +++++++++--------- core/solver/gmres.cpp | 7 +- core/test/mpi/distributed/matrix.cpp | 4 +- .../distributed/preconditioner/schwarz.cpp | 4 +- .../ginkgo/core/base/precision_dispatch.hpp | 151 +++++----- .../test/distributed/assembly_kernels.cpp | 4 +- reference/test/distributed/matrix_kernels.cpp | 4 +- reference/test/distributed/vector_kernels.cpp | 4 +- test/distributed/assembly_kernels.cpp | 4 +- test/distributed/matrix_kernels.cpp | 4 +- test/distributed/vector_kernels.cpp | 4 +- test/mpi/assembly.cpp | 4 +- test/mpi/matrix.cpp | 10 +- test/mpi/multigrid/pgm.cpp | 4 +- test/mpi/preconditioner/schwarz.cpp | 5 +- 16 files changed, 229 insertions(+), 257 deletions(-) diff --git a/core/distributed/vector_cache.cpp b/core/distributed/vector_cache.cpp index 683d18dfd98..acc23ed0fef 100644 --- a/core/distributed/vector_cache.cpp +++ b/core/distributed/vector_cache.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -48,7 +48,7 @@ void VectorCache::init_from( #define GKO_DECLARE_VECTOR_CACHE(_type) class VectorCache<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(GKO_DECLARE_VECTOR_CACHE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_VECTOR_CACHE); } // namespace detail diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index 468a27e8ce4..a6fac943db9 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -389,147 +389,138 @@ void Pgm::generate() #if GINKGO_BUILD_MPI if (std::dynamic_pointer_cast< const experimental::distributed::DistributedBase>(system_matrix_)) { - if constexpr (std::is_same_v, half>) { - GKO_NOT_SUPPORTED(nullptr); - } else { - auto convert_fine_op = [&](auto matrix) { - using global_index_type = typename std::decay_t< - decltype(*matrix)>::result_type::global_index_type; - auto exec = as(matrix)->get_executor(); - auto comm = - as(matrix) - ->get_communicator(); - auto fine = share( - experimental::distributed:: - Matrix::create( - exec, comm, - matrix::Csr::create(exec), - matrix::Csr::create(exec))); - matrix->convert_to(fine); - this->set_fine_op(fine); - }; - auto setup_fine_op = [&](auto matrix) { - // Only support csr matrix currently. - auto local_csr = std::dynamic_pointer_cast( - matrix->get_local_matrix()); - auto non_local_csr = std::dynamic_pointer_cast( - matrix->get_non_local_matrix()); - // If system matrix is not csr or need sorting, generate the - // csr. - if (!parameters_.skip_sorting || !local_csr || !non_local_csr) { - using global_index_type = typename std::decay_t< - decltype(*matrix)>::global_index_type; - convert_fine_op( - as>>(matrix)); - } - }; - - using fst_mtx_type = - experimental::distributed::Matrix; - using snd_mtx_type = - experimental::distributed::Matrix; - // setup the fine op using Csr with current ValueType - // we do not use dispatcher run in the first place because we have - // the fallback option for that. 
- if (auto obj = std::dynamic_pointer_cast( - system_matrix_)) { - setup_fine_op(obj); - } else if (auto obj = std::dynamic_pointer_cast( - system_matrix_)) { - setup_fine_op(obj); - } else { - // handle other ValueTypes. - run(system_matrix_, - convert_fine_op); - } - - auto distributed_setup = [&](auto matrix) { - auto exec = gko::as(matrix)->get_executor(); - auto comm = - gko::as(matrix) - ->get_communicator(); - auto num_rank = comm.size(); - auto pgm_local_op = - gko::as(matrix->get_local_matrix()); - auto result = this->generate_local(pgm_local_op); - - auto non_local_csr = - as(matrix->get_non_local_matrix()); - auto non_local_size = non_local_csr->get_size()[1]; - array non_local_agg(exec, non_local_size); - // get agg information (prolong_row_gather row idx) - communicate(matrix, agg_, non_local_agg); - // generate non_local_col_map - non_local_agg.set_executor(exec->get_master()); - array non_local_col_map(exec->get_master(), - non_local_size); - // add additional entry in tail such that the offset easily - // handle it. - array renumber(exec->get_master(), - non_local_size + 1); - auto recv_offsets = matrix->recv_offsets_; - generate_non_local_map(recv_offsets, non_local_agg, - non_local_col_map, renumber); - - // get new recv_size and recv_offsets - std::vector - new_recv_size(num_rank); - std::vector - new_recv_offsets(num_rank + 1); - array new_recv_gather_idxs(exec->get_master()); - compute_communication(recv_offsets, non_local_agg, renumber, - new_recv_size, new_recv_offsets, - new_recv_gather_idxs); - - non_local_col_map.set_executor(exec); - IndexType non_local_num_agg = new_recv_gather_idxs.get_size(); - // build csr from row and col map - // unlike non-distributed version, generate_coarse uses - // different row and col maps. - auto result_non_local_csr = generate_coarse( - exec, non_local_csr.get(), - static_cast(std::get<1>(result)->get_size()[0]), - agg_, non_local_num_agg, non_local_col_map); - // use local and non-local to build coarse matrix - // also restriction and prolongation (Local-only-global matrix) - auto coarse_size = - static_cast(std::get<1>(result)->get_size()[0]); - comm.all_reduce(exec->get_master(), &coarse_size, 1, MPI_SUM); - new_recv_gather_idxs.set_executor(exec); - - // setup the generated linop. + auto convert_fine_op = [&](auto matrix) { + using global_index_type = typename std::decay_t< + decltype(*matrix)>::result_type::global_index_type; + auto exec = as(matrix)->get_executor(); + auto comm = as(matrix) + ->get_communicator(); + auto fine = share( + experimental::distributed:: + Matrix::create( + exec, comm, + matrix::Csr::create(exec), + matrix::Csr::create(exec))); + matrix->convert_to(fine); + this->set_fine_op(fine); + }; + auto setup_fine_op = [&](auto matrix) { + // Only support csr matrix currently. + auto local_csr = std::dynamic_pointer_cast( + matrix->get_local_matrix()); + auto non_local_csr = std::dynamic_pointer_cast( + matrix->get_non_local_matrix()); + // If system matrix is not csr or need sorting, generate the + // csr. 
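+            // Note: convert_fine_op (above) always rebuilds the fine
+            // operator with Csr local and non-local matrices, so after this
+            // fallback the remaining setup can rely on Csr storage.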
+ if (!parameters_.skip_sorting || !local_csr || !non_local_csr) { using global_index_type = typename std::decay_t::global_index_type; - auto coarse = share( - experimental::distributed:: - Matrix::create( - exec, comm, gko::dim<2>(coarse_size, coarse_size), - std::get<1>(result), result_non_local_csr, - new_recv_size, new_recv_offsets, - new_recv_gather_idxs)); - auto restrict_op = share( - experimental::distributed:: - Matrix::create( - exec, comm, - dim<2>(coarse_size, - gko::as(matrix)->get_size()[0]), - std::get<2>(result))); - auto prolong_op = share( - experimental::distributed:: - Matrix::create( - exec, comm, - dim<2>(gko::as(matrix)->get_size()[0], - coarse_size), - std::get<0>(result))); - this->set_multigrid_level(prolong_op, coarse, restrict_op); - }; - - // the fine op is using csr with the current ValueType - run(this->get_fine_op(), - distributed_setup); + convert_fine_op( + as>>(matrix)); + } + }; + + using fst_mtx_type = + experimental::distributed::Matrix; + using snd_mtx_type = + experimental::distributed::Matrix; + // setup the fine op using Csr with current ValueType + // we do not use dispatcher run in the first place because we have + // the fallback option for that. + if (auto obj = + std::dynamic_pointer_cast(system_matrix_)) { + setup_fine_op(obj); + } else if (auto obj = std::dynamic_pointer_cast( + system_matrix_)) { + setup_fine_op(obj); + } else { + // handle other ValueTypes. + run(system_matrix_, + convert_fine_op); } + + auto distributed_setup = [&](auto matrix) { + auto exec = gko::as(matrix)->get_executor(); + auto comm = + gko::as(matrix) + ->get_communicator(); + auto num_rank = comm.size(); + auto pgm_local_op = + gko::as(matrix->get_local_matrix()); + auto result = this->generate_local(pgm_local_op); + + auto non_local_csr = + as(matrix->get_non_local_matrix()); + auto non_local_size = non_local_csr->get_size()[1]; + array non_local_agg(exec, non_local_size); + // get agg information (prolong_row_gather row idx) + communicate(matrix, agg_, non_local_agg); + // generate non_local_col_map + non_local_agg.set_executor(exec->get_master()); + array non_local_col_map(exec->get_master(), + non_local_size); + // add additional entry in tail such that the offset easily + // handle it. + array renumber(exec->get_master(), non_local_size + 1); + auto recv_offsets = matrix->recv_offsets_; + generate_non_local_map(recv_offsets, non_local_agg, + non_local_col_map, renumber); + + // get new recv_size and recv_offsets + std::vector + new_recv_size(num_rank); + std::vector + new_recv_offsets(num_rank + 1); + array new_recv_gather_idxs(exec->get_master()); + compute_communication(recv_offsets, non_local_agg, renumber, + new_recv_size, new_recv_offsets, + new_recv_gather_idxs); + + non_local_col_map.set_executor(exec); + IndexType non_local_num_agg = new_recv_gather_idxs.get_size(); + // build csr from row and col map + // unlike non-distributed version, generate_coarse uses + // different row and col maps. + auto result_non_local_csr = generate_coarse( + exec, non_local_csr.get(), + static_cast(std::get<1>(result)->get_size()[0]), + agg_, non_local_num_agg, non_local_col_map); + // use local and non-local to build coarse matrix + // also restriction and prolongation (Local-only-global matrix) + auto coarse_size = + static_cast(std::get<1>(result)->get_size()[0]); + comm.all_reduce(exec->get_master(), &coarse_size, 1, MPI_SUM); + new_recv_gather_idxs.set_executor(exec); + + // setup the generated linop. 
+ using global_index_type = + typename std::decay_t::global_index_type; + auto coarse = share( + experimental::distributed:: + Matrix::create( + exec, comm, gko::dim<2>(coarse_size, coarse_size), + std::get<1>(result), result_non_local_csr, + new_recv_size, new_recv_offsets, new_recv_gather_idxs)); + auto restrict_op = share( + experimental::distributed:: + Matrix::create( + exec, comm, + dim<2>(coarse_size, + gko::as(matrix)->get_size()[0]), + std::get<2>(result))); + auto prolong_op = share( + experimental::distributed:: + Matrix::create( + exec, comm, + dim<2>(gko::as(matrix)->get_size()[0], + coarse_size), + std::get<0>(result))); + this->set_multigrid_level(prolong_op, coarse, restrict_op); + }; + + // the fine op is using csr with the current ValueType + run(this->get_fine_op(), distributed_setup); } else #endif // GINKGO_BUILD_MPI { diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index e066fc696a1..067d7d7aad2 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -196,17 +196,18 @@ void finish_reduce(matrix::Dense* hessenberg_iter, auto hessenberg_reduce = hessenberg_iter->create_submatrix( span{0, restart_iter + 1}, span{0, num_rhs}); int message_size = static_cast((restart_iter + 1) * num_rhs); + auto sum_op = gko::experimental::mpi::sum(); if (experimental::mpi::requires_host_buffer(exec, comm)) { ::gko::detail::DenseCache host_reduction_buffer; host_reduction_buffer.init(exec->get_master(), hessenberg_reduce->get_size()); host_reduction_buffer->copy_from(hessenberg_reduce); comm.all_reduce(exec->get_master(), host_reduction_buffer->get_values(), - message_size, MPI_SUM); + message_size, sum_op.get()); hessenberg_reduce->copy_from(host_reduction_buffer.get()); } else { comm.all_reduce(exec, hessenberg_reduce->get_values(), message_size, - MPI_SUM); + sum_op.get()); } } #endif diff --git a/core/test/mpi/distributed/matrix.cpp b/core/test/mpi/distributed/matrix.cpp index 26a551b5758..efc929a19c5 100644 --- a/core/test/mpi/distributed/matrix.cpp +++ b/core/test/mpi/distributed/matrix.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -177,7 +177,7 @@ class MatrixBuilder : public ::testing::Test { gko::experimental::mpi::communicator comm; }; -TYPED_TEST_SUITE(MatrixBuilder, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(MatrixBuilder, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/core/test/mpi/distributed/preconditioner/schwarz.cpp b/core/test/mpi/distributed/preconditioner/schwarz.cpp index b55ec6a80ce..1cf3f04d311 100644 --- a/core/test/mpi/distributed/preconditioner/schwarz.cpp +++ b/core/test/mpi/distributed/preconditioner/schwarz.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -65,7 +65,7 @@ class SchwarzFactory : public ::testing::Test { std::shared_ptr mtx; }; -TYPED_TEST_SUITE(SchwarzFactory, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(SchwarzFactory, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index 
29aa4bfcab1..ffbe0645484 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -340,7 +340,8 @@ gko::detail::temporary_conversion> make_temporary_conversion( { auto result = gko::detail::temporary_conversion>::template create< - Vector>>(matrix); + Vector>, + Vector>>>(matrix); if (!result) { GKO_NOT_SUPPORTED(matrix); } @@ -356,7 +357,9 @@ gko::detail::temporary_conversion> make_temporary_conversion(const LinOp* matrix) { auto result = gko::detail::temporary_conversion>:: - template create>>(matrix); + template create>, + Vector>>>( + matrix); if (!result) { GKO_NOT_SUPPORTED(matrix); } @@ -381,11 +384,7 @@ make_temporary_conversion(const LinOp* matrix) template void precision_dispatch(Function fn, Args*... linops) { - if constexpr (std::is_same_v, half>) { - GKO_NOT_SUPPORTED(nullptr); - } else { - fn(distributed::make_temporary_conversion(linops).get()...); - } + fn(distributed::make_temporary_conversion(linops).get()...); } @@ -401,29 +400,23 @@ void precision_dispatch(Function fn, Args*... linops) template void precision_dispatch_real_complex(Function fn, const LinOp* in, LinOp* out) { - if constexpr (std::is_same_v, half>) { - GKO_NOT_SUPPORTED(nullptr); + auto complex_to_real = !( + is_complex() || + dynamic_cast>*>( + in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>(in); + auto dense_out = + distributed::make_temporary_conversion>(out); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dynamic_cast(dense_in->create_real_view().get()), + dynamic_cast(dense_out->create_real_view().get())); } else { - auto complex_to_real = !( - is_complex() || - dynamic_cast< - const ConvertibleTo>*>(in)); - if (complex_to_real) { - auto dense_in = - distributed::make_temporary_conversion>( - in); - auto dense_out = - distributed::make_temporary_conversion>( - out); - using Vector = experimental::distributed::Vector; - // These dynamic_casts are only needed to make the code compile - // If ValueType is complex, this branch will never be taken - // If ValueType is real, the cast is a no-op - fn(dynamic_cast(dense_in->create_real_view().get()), - dynamic_cast(dense_out->create_real_view().get())); - } else { - distributed::precision_dispatch(fn, in, out); - } + distributed::precision_dispatch(fn, in, out); } } @@ -435,33 +428,27 @@ template void precision_dispatch_real_complex(Function fn, const LinOp* alpha, const LinOp* in, LinOp* out) { - if constexpr (std::is_same_v, half>) { - GKO_NOT_SUPPORTED(nullptr); + auto complex_to_real = !( + is_complex() || + dynamic_cast>*>( + in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>(in); + auto dense_out = + distributed::make_temporary_conversion>(out); + auto dense_alpha = gko::make_temporary_conversion(alpha); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dense_alpha.get(), + dynamic_cast(dense_in->create_real_view().get()), + dynamic_cast(dense_out->create_real_view().get())); } 
else { - auto complex_to_real = !( - is_complex() || - dynamic_cast< - const ConvertibleTo>*>(in)); - if (complex_to_real) { - auto dense_in = - distributed::make_temporary_conversion>( - in); - auto dense_out = - distributed::make_temporary_conversion>( - out); - auto dense_alpha = gko::make_temporary_conversion(alpha); - using Vector = experimental::distributed::Vector; - // These dynamic_casts are only needed to make the code compile - // If ValueType is complex, this branch will never be taken - // If ValueType is real, the cast is a no-op - fn(dense_alpha.get(), - dynamic_cast(dense_in->create_real_view().get()), - dynamic_cast(dense_out->create_real_view().get())); - } else { - fn(gko::make_temporary_conversion(alpha).get(), - distributed::make_temporary_conversion(in).get(), - distributed::make_temporary_conversion(out).get()); - } + fn(gko::make_temporary_conversion(alpha).get(), + distributed::make_temporary_conversion(in).get(), + distributed::make_temporary_conversion(out).get()); } } @@ -474,36 +461,30 @@ void precision_dispatch_real_complex(Function fn, const LinOp* alpha, const LinOp* in, const LinOp* beta, LinOp* out) { - if constexpr (std::is_same_v, half>) { - GKO_NOT_SUPPORTED(nullptr); + auto complex_to_real = !( + is_complex() || + dynamic_cast>*>( + in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>(in); + auto dense_out = + distributed::make_temporary_conversion>(out); + auto dense_alpha = gko::make_temporary_conversion(alpha); + auto dense_beta = gko::make_temporary_conversion(beta); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dense_alpha.get(), + dynamic_cast(dense_in->create_real_view().get()), + dense_beta.get(), + dynamic_cast(dense_out->create_real_view().get())); } else { - auto complex_to_real = !( - is_complex() || - dynamic_cast< - const ConvertibleTo>*>(in)); - if (complex_to_real) { - auto dense_in = - distributed::make_temporary_conversion>( - in); - auto dense_out = - distributed::make_temporary_conversion>( - out); - auto dense_alpha = gko::make_temporary_conversion(alpha); - auto dense_beta = gko::make_temporary_conversion(beta); - using Vector = experimental::distributed::Vector; - // These dynamic_casts are only needed to make the code compile - // If ValueType is complex, this branch will never be taken - // If ValueType is real, the cast is a no-op - fn(dense_alpha.get(), - dynamic_cast(dense_in->create_real_view().get()), - dense_beta.get(), - dynamic_cast(dense_out->create_real_view().get())); - } else { - fn(gko::make_temporary_conversion(alpha).get(), - distributed::make_temporary_conversion(in).get(), - gko::make_temporary_conversion(beta).get(), - distributed::make_temporary_conversion(out).get()); - } + fn(gko::make_temporary_conversion(alpha).get(), + distributed::make_temporary_conversion(in).get(), + gko::make_temporary_conversion(beta).get(), + distributed::make_temporary_conversion(out).get()); } } diff --git a/reference/test/distributed/assembly_kernels.cpp b/reference/test/distributed/assembly_kernels.cpp index 4823f465a31..57981d254e5 100644 --- a/reference/test/distributed/assembly_kernels.cpp +++ b/reference/test/distributed/assembly_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: 
BSD-3-Clause @@ -48,7 +48,7 @@ class AssemblyHelpers : public ::testing::Test { gko::array mapping; }; -TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp index b5e2e3f5fb9..40a9b15e0ec 100644 --- a/reference/test/distributed/matrix_kernels.cpp +++ b/reference/test/distributed/matrix_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -182,7 +182,7 @@ class Matrix : public ::testing::Test { gko::array non_local_values; }; -TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/reference/test/distributed/vector_kernels.cpp b/reference/test/distributed/vector_kernels.cpp index 43f11967488..4e03cc6995e 100644 --- a/reference/test/distributed/vector_kernels.cpp +++ b/reference/test/distributed/vector_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -69,7 +69,7 @@ class Vector : public ::testing::Test { std::shared_ptr ref; }; -TYPED_TEST_SUITE(Vector, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(Vector, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/test/distributed/assembly_kernels.cpp b/test/distributed/assembly_kernels.cpp index 4ab4c9173ac..f27682cc883 100644 --- a/test/distributed/assembly_kernels.cpp +++ b/test/distributed/assembly_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -100,7 +100,7 @@ class AssemblyHelpers : public CommonTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp index 9a5d4f2cf7b..afae8c130dc 100644 --- a/test/distributed/matrix_kernels.cpp +++ b/test/distributed/matrix_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -87,7 +87,7 @@ class Matrix : public CommonTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/test/distributed/vector_kernels.cpp b/test/distributed/vector_kernels.cpp index a212699a2ca..c195318dde2 100644 --- a/test/distributed/vector_kernels.cpp +++ b/test/distributed/vector_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -67,7 +67,7 @@ class Vector : public CommonTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(Vector, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(Vector, 
gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/test/mpi/assembly.cpp b/test/mpi/assembly.cpp index 3ad47565d44..819f4a47515 100644 --- a/test/mpi/assembly.cpp +++ b/test/mpi/assembly.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -83,7 +83,7 @@ class AssemblyHelpers : public CommonMpiTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index 8e4eeb3921c..80d7aaea364 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -101,7 +101,7 @@ class MatrixCreation : public CommonMpiTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(MatrixCreation, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(MatrixCreation, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); @@ -463,7 +463,7 @@ class Matrix : public CommonMpiTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(Matrix, gko::test::ValueTypesBase, TypenameNameGenerator); +TYPED_TEST_SUITE(Matrix, gko::test::ValueTypes, TypenameNameGenerator); TYPED_TEST(Matrix, CanApplyToSingleVector) @@ -729,7 +729,7 @@ TYPED_TEST(Matrix, CanConvertToNextPrecision) using csr = typename TestFixture::local_matrix_type; using local_index_type = typename TestFixture::local_index_type; using global_index_type = typename TestFixture::global_index_type; - using OtherT = typename gko::next_precision_base; + using OtherT = typename gko::next_precision; using OtherDist = typename gko::experimental::distributed::Matrix< OtherT, local_index_type, global_index_type>; auto tmp = OtherDist::create(this->ref, this->comm); @@ -755,7 +755,7 @@ TYPED_TEST(Matrix, CanMoveToNextPrecision) using csr = typename TestFixture::local_matrix_type; using local_index_type = typename TestFixture::local_index_type; using global_index_type = typename TestFixture::global_index_type; - using OtherT = typename gko::next_precision_base; + using OtherT = typename gko::next_precision; using OtherDist = typename gko::experimental::distributed::Matrix< OtherT, local_index_type, global_index_type>; auto tmp = OtherDist::create(this->ref, this->comm); diff --git a/test/mpi/multigrid/pgm.cpp b/test/mpi/multigrid/pgm.cpp index df198f235c3..3f3638ff7e1 100644 --- a/test/mpi/multigrid/pgm.cpp +++ b/test/mpi/multigrid/pgm.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -80,7 +80,7 @@ class Pgm : public CommonMpiTestFixture { std::shared_ptr dist_mat; }; -TYPED_TEST_SUITE(Pgm, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(Pgm, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp index 113f8922aae..e1e740c787a 100644 --- a/test/mpi/preconditioner/schwarz.cpp +++ b/test/mpi/preconditioner/schwarz.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The 
Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -143,8 +143,7 @@ class SchwarzPreconditioner : public CommonMpiTestFixture { } }; -TYPED_TEST_SUITE(SchwarzPreconditioner, - gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(SchwarzPreconditioner, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); TYPED_TEST(SchwarzPreconditioner, GenerateFailsIfInvalidState)
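
Note on the custom reduction in the core/solver/gmres.cpp hunk above: the all_reduce calls switch from MPI_SUM to gko::experimental::mpi::sum<ValueType>() because MPI's predefined reduction operations are only defined for MPI's predefined datatypes, while a 16-bit value type such as gko::half is exchanged as a user-defined datatype. The plain-MPI sketch below illustrates the general mechanism such a wrapper presumably builds on; the fixed16 element type, fixed16_sum, and all other names are illustrative assumptions, not Ginkgo's implementation.

    // sketch: user-defined sum reduction for an element type MPI does not
    // know about (compile with mpic++, launch with mpirun)
    #include <mpi.h>
    #include <cstdint>
    #include <cstdio>

    // hypothetical 16-bit element type; value = raw / 256.0
    struct fixed16 {
        std::int16_t raw;
    };

    // element-wise sum with the exact signature MPI_Op_create requires
    void fixed16_sum(void* invec, void* inoutvec, int* len, MPI_Datatype*)
    {
        auto in = static_cast<fixed16*>(invec);
        auto inout = static_cast<fixed16*>(inoutvec);
        for (int i = 0; i < *len; ++i) {
            inout[i].raw = static_cast<std::int16_t>(inout[i].raw + in[i].raw);
        }
    }

    int main(int argc, char** argv)
    {
        MPI_Init(&argc, &argv);
        int rank;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        // describe the 16-bit payload to MPI as an opaque 2-byte datatype
        MPI_Datatype fixed16_type;
        MPI_Type_contiguous(sizeof(fixed16), MPI_BYTE, &fixed16_type);
        MPI_Type_commit(&fixed16_type);

        // MPI_SUM is not defined for this datatype, so register a custom
        // operation; the second argument declares it commutative
        MPI_Op sum_op;
        MPI_Op_create(&fixed16_sum, 1, &sum_op);

        fixed16 local{static_cast<std::int16_t>(256 * (rank + 1))};
        fixed16 total{};
        MPI_Allreduce(&local, &total, 1, fixed16_type, sum_op, MPI_COMM_WORLD);

        if (rank == 0) {
            // prints 1 + 2 + ... + number of ranks
            std::printf("sum = %g\n", total.raw / 256.0);
        }

        MPI_Op_free(&sum_op);
        MPI_Type_free(&fixed16_type);
        MPI_Finalize();
    }

Declaring the operation commutative mirrors what MPI_SUM guarantees for predefined types and lets the library reassociate partial results across ranks; the matching create/free pairing is exactly what a RAII wrapper in the style of mpi::sum() would manage automatically.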