From e59dd3503ed9ee29f6ba620a9ce5ae4ed4cad125 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 12 Dec 2024 09:08:55 +0100 Subject: [PATCH 1/5] WIP: half distributed --- .../cuda_hip/distributed/matrix_kernels.cpp | 14 +- core/distributed/helpers.hpp | 14 +- core/distributed/matrix.cpp | 40 + core/distributed/vector.cpp | 20 + core/test/mpi/base/bindings.cpp | 971 +++++++++--------- include/ginkgo/core/base/mpi.hpp | 42 + include/ginkgo/core/distributed/matrix.hpp | 25 +- include/ginkgo/core/distributed/vector.hpp | 42 +- test/mpi/matrix.cpp | 12 +- test/mpi/solver/solver.cpp | 10 +- test/mpi/vector.cpp | 9 +- 11 files changed, 689 insertions(+), 510 deletions(-) diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp index bdf189d9785..cf3ac70822b 100644 --- a/common/cuda_hip/distributed/matrix_kernels.cpp +++ b/common/cuda_hip/distributed/matrix_kernels.cpp @@ -137,11 +137,11 @@ void separate_local_nonlocal( col_range_starting_indices[range_id]; }; - using input_type = input_type; + using input_type = input_type, GlobalIndexType>; auto input_it = thrust::make_zip_iterator(thrust::make_tuple( input.get_const_row_idxs(), input.get_const_col_idxs(), - input.get_const_values(), row_range_ids.get_const_data(), - col_range_ids.get_const_data())); + as_device_type(input.get_const_values()), + row_range_ids.get_const_data(), col_range_ids.get_const_data())); // copy and transform local entries into arrays local_row_idxs.resize_and_reset(num_local_elements); @@ -157,9 +157,9 @@ void separate_local_nonlocal( thrust::copy_if( policy, local_it, local_it + input.get_num_stored_elements(), range_ids_it, - thrust::make_zip_iterator(thrust::make_tuple(local_row_idxs.get_data(), - local_col_idxs.get_data(), - local_values.get_data())), + thrust::make_zip_iterator(thrust::make_tuple( + local_row_idxs.get_data(), local_col_idxs.get_data(), + as_device_type(local_values.get_data()))), [local_part, row_part_ids, col_part_ids] __host__ __device__( const thrust::tuple& tuple) { auto row_part = row_part_ids[thrust::get<0>(tuple)]; @@ -185,7 +185,7 @@ void separate_local_nonlocal( range_ids_it, thrust::make_zip_iterator(thrust::make_tuple( non_local_row_idxs.get_data(), non_local_col_idxs.get_data(), - non_local_values.get_data())), + as_device_type(non_local_values.get_data()))), [local_part, row_part_ids, col_part_ids] __host__ __device__( const thrust::tuple& tuple) { auto row_part = row_part_ids[thrust::get<0>(tuple)]; diff --git a/core/distributed/helpers.hpp b/core/distributed/helpers.hpp index 9ce7d3b6ab4..5536dbe32f0 100644 --- a/core/distributed/helpers.hpp +++ b/core/distributed/helpers.hpp @@ -122,15 +122,11 @@ void vector_dispatch(T* linop, F&& f, Args&&... 
args) { #if GINKGO_BUILD_MPI if (is_distributed(linop)) { - if constexpr (std::is_same_v, half>) { - GKO_NOT_SUPPORTED(linop); - } else { - using type = std::conditional_t< - std::is_const::value, - const experimental::distributed::Vector, - experimental::distributed::Vector>; - f(dynamic_cast(linop), std::forward(args)...); - } + using type = std::conditional_t< + std::is_const::value, + const experimental::distributed::Vector, + experimental::distributed::Vector>; + f(dynamic_cast(linop), std::forward(args)...); } else #endif { diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index fd0ff9565b9..b25b4d2bba8 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -237,6 +237,46 @@ void Matrix::move_to( } +#if GINKGO_ENABLE_HALF +template +void Matrix::convert_to( + Matrix>, local_index_type, + global_index_type>* result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->copy_from(this->local_mtx_.get()); + result->non_local_mtx_->copy_from(this->non_local_mtx_.get()); + result->gather_idxs_ = this->gather_idxs_; + result->send_offsets_ = this->send_offsets_; + result->recv_offsets_ = this->recv_offsets_; + result->recv_sizes_ = this->recv_sizes_; + result->send_sizes_ = this->send_sizes_; + result->non_local_to_global_ = this->non_local_to_global_; + result->set_size(this->get_size()); +} + + +template +void Matrix::move_to( + Matrix>, local_index_type, + global_index_type>* result) +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->move_from(this->local_mtx_.get()); + result->non_local_mtx_->move_from(this->non_local_mtx_.get()); + result->gather_idxs_ = std::move(this->gather_idxs_); + result->send_offsets_ = std::move(this->send_offsets_); + result->recv_offsets_ = std::move(this->recv_offsets_); + result->recv_sizes_ = std::move(this->recv_sizes_); + result->send_sizes_ = std::move(this->send_sizes_); + result->non_local_to_global_ = std::move(this->non_local_to_global_); + result->set_size(this->get_size()); + this->set_size({}); +} +#endif + template void Matrix::read_distributed( const device_matrix_data& data, diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 4a79eb64a9b..3c9c77f1b4a 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -295,6 +295,26 @@ void Vector::move_to(Vector>* result) } +#if GINKGO_ENABLE_HALF +template +void Vector::convert_to( + Vector>>* result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->set_size(this->get_size()); + this->get_local_vector()->convert_to(&result->local_); +} + + +template +void Vector::move_to( + Vector>>* result) +{ + this->convert_to(result); +} +#endif + template std::unique_ptr::absolute_type> Vector::compute_absolute() const diff --git a/core/test/mpi/base/bindings.cpp b/core/test/mpi/base/bindings.cpp index 9feebb76ca9..cbd13820050 100644 --- a/core/test/mpi/base/bindings.cpp +++ b/core/test/mpi/base/bindings.cpp @@ -24,8 +24,9 @@ class MpiBindings : public ::testing::Test { std::shared_ptr ref; }; -using TestTypes = gko::test::merge_type_list_t; +using TestTypes = + gko::test::merge_type_list_t; TYPED_TEST_SUITE(MpiBindings, TestTypes, TypenameNameGenerator); @@ -274,469 +275,492 @@ TYPED_TEST(MpiBindings, CanPutValuesWithFence) ASSERT_EQ(data, ref); } - -TYPED_TEST(MpiBindings, CanAccumulateValues) -{ - using window = gko::experimental::mpi::window; - 
auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - } else if (my_rank == 1) { - data = std::vector{5, 6, 7, 8}; - } else if (my_rank == 2) { - data = std::vector{9, 10, 11, 12}; - } else { - data = std::vector{0, 0, 0, 0}; - } - - { - auto win = window(this->ref, data.data(), 4, comm); - if (my_rank == 0) { - win.lock_all(); - for (auto rank = 0; rank < num_ranks; ++rank) { - if (rank != my_rank) { - win.accumulate(this->ref, data.data(), 4, rank, 0, 4, - MPI_SUM); - } - } - win.unlock_all(); - } - } - - std::vector ref; - if (my_rank == 0) { - ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); - } else if (my_rank == 1) { - ref = std::vector{6, 8, 10, 12}; - ASSERT_EQ(data, ref); - } else if (my_rank == 2) { - ref = std::vector{10, 12, 14, 16}; - ASSERT_EQ(data, ref); - } else { - ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); - } -} - - -TYPED_TEST(MpiBindings, CanNonBlockingAccumulateValues) -{ - using window = gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - } else if (my_rank == 1) { - data = std::vector{5, 6, 7, 8}; - } else if (my_rank == 2) { - data = std::vector{9, 10, 11, 12}; - } else { - data = std::vector{0, 0, 0, 0}; - } - - gko::experimental::mpi::request req; - { - auto win = window(this->ref, data.data(), 4, comm); - if (my_rank == 0) { - win.lock_all(); - for (auto rank = 0; rank < num_ranks; ++rank) { - if (rank != my_rank) { - req = win.r_accumulate(this->ref, data.data(), 4, rank, 0, - 4, MPI_SUM); - } - } - win.unlock_all(); - } - } - - req.wait(); - std::vector ref; - if (my_rank == 0) { - ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); - } else if (my_rank == 1) { - ref = std::vector{6, 8, 10, 12}; - ASSERT_EQ(data, ref); - } else if (my_rank == 2) { - ref = std::vector{10, 12, 14, 16}; - ASSERT_EQ(data, ref); - } else { - ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); - } -} - - -TYPED_TEST(MpiBindings, CanGetValuesWithLockAll) -{ - using window = gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - } else { - data = std::vector{0, 0, 0, 0}; - } - auto win = window(this->ref, data.data(), 4, comm); - - if (my_rank != 0) { - win.lock_all(); - win.get(this->ref, data.data(), 4, 0, 0, 4); - win.unlock_all(); - } - - auto ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); -} - - -TYPED_TEST(MpiBindings, CanNonBlockingGetValuesWithLockAll) -{ - using window = gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - } else { - data = std::vector{0, 0, 0, 0}; - } - gko::experimental::mpi::request req; - auto win = window(this->ref, data.data(), 4, comm); - - if (my_rank != 0) { - win.lock_all(); - req = win.r_get(this->ref, data.data(), 4, 0, 0, 4); - win.unlock_all(); - } - - req.wait(); - auto ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); -} - - -TYPED_TEST(MpiBindings, CanGetValuesWithExclusiveLock) -{ - using window = 
gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - } else { - data = std::vector{0, 0, 0, 0}; - } - auto win = window(this->ref, data.data(), 4, comm); - - if (my_rank != 0) { - win.lock(0, window::lock_type::exclusive); - win.get(this->ref, data.data(), 4, 0, 0, 4); - win.unlock(0); - } - - auto ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); -} - - -TYPED_TEST(MpiBindings, CanGetValuesWithSharedLock) -{ - using window = gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - } else { - data = std::vector{0, 0, 0, 0}; - } - auto win = window(this->ref, data.data(), 4, comm); - - if (my_rank != 0) { - win.lock(0); - win.get(this->ref, data.data(), 4, 0, 0, 4); - win.unlock(0); - } - - auto ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); -} - - -TYPED_TEST(MpiBindings, CanGetValuesWithFence) -{ - using window = gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - } else { - data = std::vector{0, 0, 0, 0}; - } - auto win = window(this->ref, data.data(), 4, comm); - - win.fence(); - if (my_rank != 0) { - win.get(this->ref, data.data(), 4, 0, 0, 4); - } - win.fence(); - - auto ref = std::vector{1, 2, 3, 4}; - ASSERT_EQ(data, ref); -} - - -TYPED_TEST(MpiBindings, CanGetAccumulateValuesWithLockAll) -{ - using window = gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - std::vector target; - std::vector result(4, 0); - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - target = std::vector{1, 2, 3, 4}; - } else if (my_rank == 1) { - data = std::vector{5, 6, 7, 8}; - target = std::vector{5, 6, 7, 8}; - } else if (my_rank == 2) { - data = std::vector{9, 10, 11, 12}; - target = std::vector{9, 10, 11, 12}; - } else { - data = std::vector{0, 0, 0, 0}; - target = std::vector{0, 0, 0, 0}; - } - - { - auto win = window(this->ref, target.data(), 4, comm); - - if (my_rank == 2) { - win.lock_all(); - win.get_accumulate(this->ref, data.data(), 4, result.data(), 4, 0, - 0, 4, MPI_SUM); - win.unlock_all(); - } - } - - std::vector ref; - std::vector ref2; - if (my_rank == 0) { - ref = std::vector{10, 12, 14, 16}; - EXPECT_EQ(target, ref); - } else if (my_rank == 2) { - ref = std::vector{1, 2, 3, 4}; - EXPECT_EQ(result, ref); - } -} - - -TYPED_TEST(MpiBindings, CanNonBlockingGetAccumulateValuesWithLockAll) -{ - using window = gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - std::vector target; - std::vector result(4, 0); - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - target = std::vector{1, 2, 3, 4}; - } else if (my_rank == 1) { - data = std::vector{5, 6, 7, 8}; - target = std::vector{5, 6, 7, 8}; - } else if (my_rank == 2) { - data = std::vector{9, 10, 11, 12}; - target = std::vector{9, 10, 11, 12}; - } else { - data = std::vector{0, 0, 0, 0}; - target = std::vector{0, 
0, 0, 0}; - } - gko::experimental::mpi::request req; - - { - auto win = window(this->ref, target.data(), 4, comm); - - if (my_rank == 2) { - win.lock_all(); - req = win.r_get_accumulate(this->ref, data.data(), 4, result.data(), - 4, 0, 0, 4, MPI_SUM); - win.unlock_all(); - } - } - - req.wait(); - std::vector ref; - std::vector ref2; - if (my_rank == 0) { - ref = std::vector{10, 12, 14, 16}; - ref2 = std::vector{1, 2, 3, 4}; - EXPECT_EQ(target, ref); - EXPECT_EQ(data, ref2); - } else if (my_rank == 2) { - ref = std::vector{1, 2, 3, 4}; - ref2 = std::vector{9, 10, 11, 12}; - EXPECT_EQ(result, ref); - EXPECT_EQ(target, ref2); - EXPECT_EQ(data, ref2); - } -} - - -TYPED_TEST(MpiBindings, CanFetchAndOperate) -{ - using window = gko::experimental::mpi::window; - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - std::vector data; - std::vector target; - std::vector result(4, 0); - if (my_rank == 0) { - data = std::vector{1, 2, 3, 4}; - target = std::vector{1, 2, 3, 4}; - } else if (my_rank == 1) { - data = std::vector{5, 6, 7, 8}; - target = std::vector{5, 6, 7, 8}; - } else if (my_rank == 2) { - data = std::vector{9, 10, 11, 12}; - target = std::vector{9, 10, 11, 12}; - } else { - data = std::vector{0, 0, 0, 0}; - target = std::vector{0, 0, 0, 0}; - } - - { - auto win = window(this->ref, target.data(), 4, comm); - - if (my_rank == 2) { - win.lock_all(); - win.fetch_and_op(this->ref, data.data(), result.data(), 0, 1, - MPI_SUM); - win.unlock_all(); - } - } - - std::vector ref; - std::vector ref2; - if (my_rank == 0) { - ref = std::vector{1, 11, 3, 4}; - EXPECT_EQ(target, ref); - } else if (my_rank == 2) { - ref = std::vector{2, 0, 0, 0}; - EXPECT_EQ(result, ref); - } -} - - -TYPED_TEST(MpiBindings, CanBroadcastValues) +void half_sum(void* input, void* output, int* len, MPI_Datatype* datatype) { - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - auto array = gko::array{this->ref, 8}; - if (my_rank == 0) { - array = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); + gko::half* input_ptr = static_cast(input); + gko::half* output_ptr = static_cast(output); + for (int i = 0; i < *len; i++) { + output_ptr[i] += input_ptr[i]; } - - comm.broadcast(this->ref, array.get_data(), 8, 0); - - auto ref = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); - GKO_ASSERT_ARRAY_EQ(ref, array); } - -TYPED_TEST(MpiBindings, CanNonBlockingBroadcastValues) -{ - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - auto array = gko::array{this->ref, 8}; - if (my_rank == 0) { - array = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); - } - - auto req = comm.i_broadcast(this->ref, array.get_data(), 8, 0); - - req.wait(); - auto ref = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); - GKO_ASSERT_ARRAY_EQ(ref, array); -} - - -TYPED_TEST(MpiBindings, CanReduceValues) -{ - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - TypeParam data, sum, max, min; - if (my_rank == 0) { - data = 3; - } else if (my_rank == 1) { - data = 5; - } else if (my_rank == 2) { - data = 2; - } else if (my_rank == 3) { - data = 6; - } - - comm.reduce(this->ref, &data, &sum, 1, MPI_SUM, 0); - comm.reduce(this->ref, &data, &max, 1, MPI_MAX, 0); - comm.reduce(this->ref, &data, &min, 1, MPI_MIN, 0); - - if (my_rank == 0) { - EXPECT_EQ(sum, 
TypeParam{16}); - EXPECT_EQ(max, TypeParam{6}); - EXPECT_EQ(min, TypeParam{2}); - } -} - - -TYPED_TEST(MpiBindings, CanNonBlockingReduceValues) -{ - auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto my_rank = comm.rank(); - auto num_ranks = comm.size(); - TypeParam data, sum, max, min; - if (my_rank == 0) { - data = 3; - } else if (my_rank == 1) { - data = 5; - } else if (my_rank == 2) { - data = 2; - } else if (my_rank == 3) { - data = 6; - } - - auto req1 = comm.i_reduce(this->ref, &data, &sum, 1, MPI_SUM, 0); - auto req2 = comm.i_reduce(this->ref, &data, &max, 1, MPI_MAX, 0); - auto req3 = comm.i_reduce(this->ref, &data, &min, 1, MPI_MIN, 0); - - req1.wait(); - req2.wait(); - req3.wait(); - if (my_rank == 0) { - EXPECT_EQ(sum, TypeParam{16}); - EXPECT_EQ(max, TypeParam{6}); - EXPECT_EQ(min, TypeParam{2}); - } -} +// TYPED_TEST(MpiBindings, CanAccumulateValues) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// } else if (my_rank == 1) { +// data = std::vector{5, 6, 7, 8}; +// } else if (my_rank == 2) { +// data = std::vector{9, 10, 11, 12}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// } +// MPI_Op operation; +// MPI_Op_create(&half_sum, 1, &operation); +// { +// auto win = window(this->ref, data.data(), 4, comm); +// if (my_rank == 0) { +// win.lock_all(); +// for (auto rank = 0; rank < num_ranks; ++rank) { +// if (rank != my_rank) { +// if (std::is_same_v) { +// win.accumulate(this->ref, data.data(), 4, rank, 0, 4, +// operation); +// } else { +// win.accumulate(this->ref, data.data(), 4, rank, 0, 4, +// MPI_SUM); +// } +// } +// } +// win.unlock_all(); +// } +// } +// MPI_Op_free(&operation); + +// std::vector ref; +// if (my_rank == 0) { +// ref = std::vector{1, 2, 3, 4}; +// ASSERT_EQ(data, ref); +// } else if (my_rank == 1) { +// ref = std::vector{6, 8, 10, 12}; +// ASSERT_EQ(data, ref); +// } else if (my_rank == 2) { +// ref = std::vector{10, 12, 14, 16}; +// ASSERT_EQ(data, ref); +// } else { +// ref = std::vector{1, 2, 3, 4}; +// ASSERT_EQ(data, ref); +// } +// } + + +// TYPED_TEST(MpiBindings, CanNonBlockingAccumulateValues) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// } else if (my_rank == 1) { +// data = std::vector{5, 6, 7, 8}; +// } else if (my_rank == 2) { +// data = std::vector{9, 10, 11, 12}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// } + +// gko::experimental::mpi::request req; +// { +// auto win = window(this->ref, data.data(), 4, comm); +// if (my_rank == 0) { +// win.lock_all(); +// for (auto rank = 0; rank < num_ranks; ++rank) { +// if (rank != my_rank) { +// req = win.r_accumulate(this->ref, data.data(), 4, rank, +// 0, +// 4, +// gko::experimental::mpi::sum()); +// } +// } +// win.unlock_all(); +// } +// } + +// req.wait(); +// std::vector ref; +// if (my_rank == 0) { +// ref = std::vector{1, 2, 3, 4}; +// ASSERT_EQ(data, ref); +// } else if (my_rank == 1) { +// ref = std::vector{6, 8, 10, 12}; +// ASSERT_EQ(data, ref); +// } else if (my_rank == 2) { +// ref = std::vector{10, 12, 14, 16}; +// ASSERT_EQ(data, ref); +// } else { +// ref = std::vector{1, 2, 3, 4}; +// 
ASSERT_EQ(data, ref); +// } +// } + + +// TYPED_TEST(MpiBindings, CanGetValuesWithLockAll) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// } +// auto win = window(this->ref, data.data(), 4, comm); + +// if (my_rank != 0) { +// win.lock_all(); +// win.get(this->ref, data.data(), 4, 0, 0, 4); +// win.unlock_all(); +// } + +// auto ref = std::vector{1, 2, 3, 4}; +// ASSERT_EQ(data, ref); +// } + + +// TYPED_TEST(MpiBindings, CanNonBlockingGetValuesWithLockAll) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// } +// gko::experimental::mpi::request req; +// auto win = window(this->ref, data.data(), 4, comm); + +// if (my_rank != 0) { +// win.lock_all(); +// req = win.r_get(this->ref, data.data(), 4, 0, 0, 4); +// win.unlock_all(); +// } + +// req.wait(); +// auto ref = std::vector{1, 2, 3, 4}; +// ASSERT_EQ(data, ref); +// } + + +// TYPED_TEST(MpiBindings, CanGetValuesWithExclusiveLock) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// } +// auto win = window(this->ref, data.data(), 4, comm); + +// if (my_rank != 0) { +// win.lock(0, window::lock_type::exclusive); +// win.get(this->ref, data.data(), 4, 0, 0, 4); +// win.unlock(0); +// } + +// auto ref = std::vector{1, 2, 3, 4}; +// ASSERT_EQ(data, ref); +// } + + +// TYPED_TEST(MpiBindings, CanGetValuesWithSharedLock) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// } +// auto win = window(this->ref, data.data(), 4, comm); + +// if (my_rank != 0) { +// win.lock(0); +// win.get(this->ref, data.data(), 4, 0, 0, 4); +// win.unlock(0); +// } + +// auto ref = std::vector{1, 2, 3, 4}; +// ASSERT_EQ(data, ref); +// } + + +// TYPED_TEST(MpiBindings, CanGetValuesWithFence) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// } +// auto win = window(this->ref, data.data(), 4, comm); + +// win.fence(); +// if (my_rank != 0) { +// win.get(this->ref, data.data(), 4, 0, 0, 4); +// } +// win.fence(); + +// auto ref = std::vector{1, 2, 3, 4}; +// ASSERT_EQ(data, ref); +// } + + +// TYPED_TEST(MpiBindings, CanGetAccumulateValuesWithLockAll) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); 
+// auto num_ranks = comm.size(); +// std::vector data; +// std::vector target; +// std::vector result(4, 0); +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// target = std::vector{1, 2, 3, 4}; +// } else if (my_rank == 1) { +// data = std::vector{5, 6, 7, 8}; +// target = std::vector{5, 6, 7, 8}; +// } else if (my_rank == 2) { +// data = std::vector{9, 10, 11, 12}; +// target = std::vector{9, 10, 11, 12}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// target = std::vector{0, 0, 0, 0}; +// } + +// { +// auto win = window(this->ref, target.data(), 4, comm); + +// if (my_rank == 2) { +// win.lock_all(); +// win.get_accumulate(this->ref, data.data(), 4, result.data(), 4, +// 0, +// 0, 4, +// gko::experimental::mpi::sum()); +// win.unlock_all(); +// } +// } + +// std::vector ref; +// std::vector ref2; +// if (my_rank == 0) { +// ref = std::vector{10, 12, 14, 16}; +// EXPECT_EQ(target, ref); +// } else if (my_rank == 2) { +// ref = std::vector{1, 2, 3, 4}; +// EXPECT_EQ(result, ref); +// } +// } + + +// TYPED_TEST(MpiBindings, CanNonBlockingGetAccumulateValuesWithLockAll) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// std::vector target; +// std::vector result(4, 0); +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// target = std::vector{1, 2, 3, 4}; +// } else if (my_rank == 1) { +// data = std::vector{5, 6, 7, 8}; +// target = std::vector{5, 6, 7, 8}; +// } else if (my_rank == 2) { +// data = std::vector{9, 10, 11, 12}; +// target = std::vector{9, 10, 11, 12}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// target = std::vector{0, 0, 0, 0}; +// } +// gko::experimental::mpi::request req; + +// { +// auto win = window(this->ref, target.data(), 4, comm); + +// if (my_rank == 2) { +// win.lock_all(); +// req = win.r_get_accumulate(this->ref, data.data(), 4, +// result.data(), +// 4, 0, 0, 4, +// gko::experimental::mpi::sum()); +// win.unlock_all(); +// } +// } + +// req.wait(); +// std::vector ref; +// std::vector ref2; +// if (my_rank == 0) { +// ref = std::vector{10, 12, 14, 16}; +// ref2 = std::vector{1, 2, 3, 4}; +// EXPECT_EQ(target, ref); +// EXPECT_EQ(data, ref2); +// } else if (my_rank == 2) { +// ref = std::vector{1, 2, 3, 4}; +// ref2 = std::vector{9, 10, 11, 12}; +// EXPECT_EQ(result, ref); +// EXPECT_EQ(target, ref2); +// EXPECT_EQ(data, ref2); +// } +// } + + +// TYPED_TEST(MpiBindings, CanFetchAndOperate) +// { +// using window = gko::experimental::mpi::window; +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// std::vector data; +// std::vector target; +// std::vector result(4, 0); +// if (my_rank == 0) { +// data = std::vector{1, 2, 3, 4}; +// target = std::vector{1, 2, 3, 4}; +// } else if (my_rank == 1) { +// data = std::vector{5, 6, 7, 8}; +// target = std::vector{5, 6, 7, 8}; +// } else if (my_rank == 2) { +// data = std::vector{9, 10, 11, 12}; +// target = std::vector{9, 10, 11, 12}; +// } else { +// data = std::vector{0, 0, 0, 0}; +// target = std::vector{0, 0, 0, 0}; +// } + +// { +// auto win = window(this->ref, target.data(), 4, comm); + +// if (my_rank == 2) { +// win.lock_all(); +// win.fetch_and_op(this->ref, data.data(), result.data(), 0, 1, +// gko::experimental::mpi::sum()); +// win.unlock_all(); +// } +// } + +// std::vector ref; +// std::vector ref2; +// if (my_rank == 0) 
{ +// ref = std::vector{1, 11, 3, 4}; +// EXPECT_EQ(target, ref); +// } else if (my_rank == 2) { +// ref = std::vector{2, 0, 0, 0}; +// EXPECT_EQ(result, ref); +// } +// } + + +// TYPED_TEST(MpiBindings, CanBroadcastValues) +// { +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// auto array = gko::array{this->ref, 8}; +// if (my_rank == 0) { +// array = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); +// } + +// comm.broadcast(this->ref, array.get_data(), 8, 0); + +// auto ref = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); +// GKO_ASSERT_ARRAY_EQ(ref, array); +// } + + +// TYPED_TEST(MpiBindings, CanNonBlockingBroadcastValues) +// { +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// auto array = gko::array{this->ref, 8}; +// if (my_rank == 0) { +// array = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); +// } + +// auto req = comm.i_broadcast(this->ref, array.get_data(), 8, 0); + +// req.wait(); +// auto ref = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); +// GKO_ASSERT_ARRAY_EQ(ref, array); +// } + + +// TYPED_TEST(MpiBindings, CanReduceValues) +// { +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// TypeParam data, sum, max, min; +// if (my_rank == 0) { +// data = 3; +// } else if (my_rank == 1) { +// data = 5; +// } else if (my_rank == 2) { +// data = 2; +// } else if (my_rank == 3) { +// data = 6; +// } + +// comm.reduce(this->ref, &data, &sum, 1, +// gko::experimental::mpi::sum(), 0); comm.reduce(this->ref, +// &data, &max, 1, MPI_MAX, 0); comm.reduce(this->ref, &data, &min, 1, +// MPI_MIN, 0); + +// if (my_rank == 0) { +// EXPECT_EQ(sum, TypeParam{16}); +// EXPECT_EQ(max, TypeParam{6}); +// EXPECT_EQ(min, TypeParam{2}); +// } +// } + + +// TYPED_TEST(MpiBindings, CanNonBlockingReduceValues) +// { +// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); +// auto my_rank = comm.rank(); +// auto num_ranks = comm.size(); +// TypeParam data, sum, max, min; +// if (my_rank == 0) { +// data = 3; +// } else if (my_rank == 1) { +// data = 5; +// } else if (my_rank == 2) { +// data = 2; +// } else if (my_rank == 3) { +// data = 6; +// } + +// auto req1 = comm.i_reduce(this->ref, &data, &sum, 1, +// gko::experimental::mpi::sum(), 0); auto req2 = +// comm.i_reduce(this->ref, &data, &max, 1, MPI_MAX, 0); auto req3 = +// comm.i_reduce(this->ref, &data, &min, 1, MPI_MIN, 0); + +// req1.wait(); +// req2.wait(); +// req3.wait(); +// if (my_rank == 0) { +// EXPECT_EQ(sum, TypeParam{16}); +// EXPECT_EQ(max, TypeParam{6}); +// EXPECT_EQ(min, TypeParam{2}); +// } +// } TYPED_TEST(MpiBindings, CanAllReduceValues) @@ -754,9 +778,16 @@ TYPED_TEST(MpiBindings, CanAllReduceValues) } else if (my_rank == 3) { data = 6; } - - comm.all_reduce(this->ref, &data, &sum, 1, MPI_SUM); - + MPI_Op operation; + MPI_Op_create(&half_sum, 1, &operation); + if (std::is_same_v) { + comm.all_reduce(this->ref, &data, &sum, 1, operation); + } else { + comm.all_reduce(this->ref, &data, &sum, 1, MPI_SUM); + } + // comm.all_reduce(this->ref, &data, &sum, 1, + // gko::experimental::mpi::sum()); + MPI_Op_free(&operation); ASSERT_EQ(sum, TypeParam{16}); } @@ -777,7 +808,8 @@ TYPED_TEST(MpiBindings, CanAllReduceValuesInPlace) data = 6; } - comm.all_reduce(this->ref, &data, 1, MPI_SUM); + comm.all_reduce(this->ref, &data, 1, + 
gko::experimental::mpi::sum<TypeParam>());
 
     ASSERT_EQ(data, TypeParam{16});
 }
 
@@ -799,7 +831,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValues)
         data = 6;
     }
 
-    auto req = comm.i_all_reduce(this->ref, &data, &sum, 1, MPI_SUM);
+    auto req = comm.i_all_reduce(this->ref, &data, &sum, 1,
+                                 gko::experimental::mpi::sum<TypeParam>());
 
     req.wait();
     ASSERT_EQ(sum, TypeParam{16});
@@ -1438,7 +1471,8 @@ TYPED_TEST(MpiBindings, CanScanValues)
         data = 6;
     }
 
-    comm.scan(this->ref, &data, &sum, 1, MPI_SUM);
+    comm.scan(this->ref, &data, &sum, 1,
+              gko::experimental::mpi::sum<TypeParam>());
     comm.scan(this->ref, &data, &max, 1, MPI_MAX);
     comm.scan(this->ref, &data, &min, 1, MPI_MIN);
 
@@ -1478,7 +1512,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingScanValues)
         data = 6;
     }
 
-    auto req1 = comm.i_scan(this->ref, &data, &sum, 1, MPI_SUM);
+    auto req1 = comm.i_scan(this->ref, &data, &sum, 1,
+                            gko::experimental::mpi::sum<TypeParam>());
     auto req2 = comm.i_scan(this->ref, &data, &max, 1, MPI_MAX);
     auto req3 = comm.i_scan(this->ref, &data, &min, 1, MPI_MIN);
 
diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp
index 64c04e1805a..94c5f0263c1 100644
--- a/include/ginkgo/core/base/mpi.hpp
+++ b/include/ginkgo/core/base/mpi.hpp
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -88,10 +89,51 @@ GKO_REGISTER_MPI_TYPE(unsigned long long, MPI_UNSIGNED_LONG_LONG);
 GKO_REGISTER_MPI_TYPE(float, MPI_FLOAT);
 GKO_REGISTER_MPI_TYPE(double, MPI_DOUBLE);
 GKO_REGISTER_MPI_TYPE(long double, MPI_LONG_DOUBLE);
+#if GINKGO_ENABLE_HALF
+// OpenMPI 5.0 has support for MPIX_C_FLOAT16, and MPICH v3.4a1 provides
+// MPIX_C_FLOAT16 as well.
+// TODO: mapping half to MPI_UNSIGNED_SHORT only works for transferring the
+// data; reductions need the custom operations defined below.
+GKO_REGISTER_MPI_TYPE(half, MPI_UNSIGNED_SHORT);
+GKO_REGISTER_MPI_TYPE(std::complex<half>, MPI_FLOAT);
+#endif  // GINKGO_ENABLE_HALF
 GKO_REGISTER_MPI_TYPE(std::complex<float>, MPI_C_FLOAT_COMPLEX);
 GKO_REGISTER_MPI_TYPE(std::complex<double>, MPI_C_DOUBLE_COMPLEX);
 
 
+namespace detail {
+
+
+inline void half_sum(void* input, void* output, int* len,
+                     MPI_Datatype* datatype)
+{
+    gko::half* input_ptr = static_cast<gko::half*>(input);
+    gko::half* output_ptr = static_cast<gko::half*>(output);
+    for (int i = 0; i < *len; i++) {
+        output_ptr[i] += input_ptr[i];
+    }
+}
+
+
+}  // namespace detail
+
+
+template <typename ValueType>
+inline MPI_Op sum()
+{
+    return MPI_SUM;
+}
+
+template <>
+inline MPI_Op sum<half>()
+{
+    using handle_manager =
+        std::unique_ptr<MPI_Op, std::function<void(MPI_Op*)>>;
+    static handle_manager mpi_op(
+        []() {
+            MPI_Op* operation = new MPI_Op;
+            MPI_Op_create(&detail::half_sum, 1, operation);
+            return operation;
+        }(),
+        [](MPI_Op* op) { MPI_Op_free(op); });
+    return *mpi_op.get();
+}
+
+
 /**
  * A move-only wrapper for a contiguous MPI_Datatype.
* diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index 4aca13df237..86c94fc74fb 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -259,14 +259,19 @@ template class Matrix : public EnableLinOp>, - public ConvertibleTo, + public ConvertibleTo< + Matrix, LocalIndexType, GlobalIndexType>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo>, LocalIndexType, GlobalIndexType>>, +#endif public DistributedBase { friend class EnablePolymorphicObject; - friend class Matrix, LocalIndexType, + friend class Matrix, LocalIndexType, GlobalIndexType>; friend class multigrid::Pgm; + public: using value_type = ValueType; using index_type = GlobalIndexType; @@ -288,7 +293,23 @@ class Matrix void move_to(Matrix, local_index_type, global_index_type>* result) override; +#if GINKGO_ENABLE_HALF + friend class Matrix>, + LocalIndexType, GlobalIndexType>; + using ConvertibleTo< + Matrix>, local_index_type, + global_index_type>>::convert_to; + using ConvertibleTo>, + local_index_type, global_index_type>>::move_to; + + void convert_to( + Matrix>, local_index_type, + global_index_type>* result) const override; + + void move_to(Matrix>, + local_index_type, global_index_type>* result) override; +#endif /** * Reads a square matrix from the device_matrix_data structure and a global * partition. diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index c5d039f7e30..80d59beca7d 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -66,13 +66,16 @@ class Partition; template class Vector : public EnableLinOp>, - public ConvertibleTo>>, + public ConvertibleTo>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo>>>, +#endif public EnableAbsoluteComputation>>, public DistributedBase { friend class EnablePolymorphicObject; friend class Vector>; friend class Vector>; - friend class Vector>; + friend class Vector>; friend class detail::VectorCache; public: @@ -173,6 +176,20 @@ class Vector void move_to(Vector>* result) override; +#if GINKGO_ENABLE_HALF + friend class Vector>>; + using ConvertibleTo< + Vector>>>::convert_to; + using ConvertibleTo< + Vector>>>::move_to; + + void convert_to(Vector>>* result) + const override; + + void move_to( + Vector>>* result) override; +#endif + std::unique_ptr compute_absolute() const override; void compute_absolute_inplace() override; @@ -680,6 +697,27 @@ struct conversion_target_helper> { return target_type::create(source->get_executor(), source->get_communicator()); } + + // Allow to create_empty of the same type + // For distributed case, next> will be V in the candicated list. 
+ // TODO: decide to whether to add this or add condition to the list + static std::unique_ptr create_empty(const target_type* source) + { + return target_type::create(source->get_executor(), + source->get_communicator()); + } + +#if GINKGO_ENABLE_HALF + using snd_source_type = experimental::distributed::Vector< + previous_precision>>; + + static std::unique_ptr create_empty( + const snd_source_type* source) + { + return target_type::create(source->get_executor(), + source->get_communicator()); + } +#endif }; diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index e761aab159e..8e4eeb3921c 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -367,12 +367,10 @@ class Matrix : public CommonMpiTestFixture { alpha = gko::test::generate_random_matrix( 1, 1, std::uniform_int_distribution(1, 1), - std::normal_distribution>(), - this->engine, this->exec); + std::normal_distribution<>(), this->engine, this->exec); beta = gko::test::generate_random_matrix( 1, 1, std::uniform_int_distribution(1, 1), - std::normal_distribution>(), - this->engine, this->exec); + std::normal_distribution<>(), this->engine, this->exec); } void SetUp() override { ASSERT_EQ(comm.size(), 3); } @@ -412,14 +410,12 @@ class Matrix : public CommonMpiTestFixture { num_rows, num_cols, std::uniform_int_distribution(static_cast(num_cols), static_cast(num_cols)), - std::normal_distribution>(), - engine); + std::normal_distribution<>(), engine); auto mat_md = gko::test::generate_random_matrix_data( num_rows, num_rows, std::uniform_int_distribution(0, static_cast(num_rows)), - std::normal_distribution>(), - engine); + std::normal_distribution<>(), engine); auto row_mapping = gko::test::generate_random_array< gko::experimental::distributed::comm_index_type>( diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp index 392bf9990b8..23ee506d51a 100644 --- a/test/mpi/solver/solver.cpp +++ b/test/mpi/solver/solver.cpp @@ -264,10 +264,7 @@ class Solver : public CommonMpiTestFixture { template gko::matrix_data gen_dense_data(gko::dim<2> size) { - return { - size, - std::normal_distribution>(0.0, 1.0), - rand_engine}; + return {size, std::normal_distribution<>(0.0, 1.0), rand_engine}; } template @@ -294,10 +291,7 @@ class Solver : public CommonMpiTestFixture { { return gko::share(gko::initialize( {gko::test::detail::get_rand_value( - std::normal_distribution< - gko::remove_complex>(0.0, - 1.0), - rand_engine)}, + std::normal_distribution<>(0.0, 1.0), rand_engine)}, exec)); } diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp index 752342a8e64..a65bbc7fd36 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -762,8 +762,7 @@ class VectorLocalOps : public CommonMpiTestFixture { local_size[0], local_size[1], std::uniform_int_distribution(local_size[1], local_size[1]), - std::normal_distribution>(), engine, - exec); + std::normal_distribution<>(), engine, exec); dist = DistVectorType::create(exec, comm, size, gko::clone(local)); } @@ -775,8 +774,7 @@ class VectorLocalOps : public CommonMpiTestFixture { alpha = gko::test::generate_random_matrix( 1, size[1], std::uniform_int_distribution(size[1], size[1]), - std::normal_distribution>(), engine, - exec); + std::normal_distribution<>(), engine, exec); } void init_complex_vectors() @@ -971,8 +969,7 @@ TYPED_TEST(VectorLocalOps, FillSameAsLocal) { using value_type = typename TestFixture::value_type; auto value = gko::test::detail::get_rand_value( - std::normal_distribution>(), - this->engine); + std::normal_distribution<>(), this->engine); this->init_vectors(); 
this->x->fill(value); From b8ea909640770f600cc36087f64b6044e22dc4c9 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Mon, 30 Dec 2024 14:56:25 +0100 Subject: [PATCH 2/5] WIP: change the instantiate type and implement the draft --- .../cuda_hip/distributed/assembly_kernels.cpp | 2 +- .../cuda_hip/distributed/matrix_kernels.cpp | 2 +- .../cuda_hip/distributed/vector_kernels.cpp | 2 +- .../unified/distributed/assembly_kernels.cpp | 2 +- core/device_hooks/common_kernels.inc.cpp | 2 +- core/distributed/assembly.cpp | 2 +- core/distributed/matrix.cpp | 10 +- core/distributed/preconditioner/schwarz.cpp | 3 +- core/distributed/vector.cpp | 6 +- core/test/mpi/base/bindings.cpp | 43 ++-- dpcpp/distributed/assembly_kernels.dp.cpp | 2 +- dpcpp/distributed/matrix_kernels.dp.cpp | 2 +- dpcpp/distributed/vector_kernels.dp.cpp | 2 +- include/ginkgo/core/base/mpi.hpp | 209 ++++++++++++++++-- include/ginkgo/core/distributed/matrix.hpp | 10 +- include/ginkgo/core/distributed/vector.hpp | 11 +- omp/distributed/assembly_kernels.cpp | 2 +- omp/distributed/matrix_kernels.cpp | 2 +- omp/distributed/vector_kernels.cpp | 2 +- reference/distributed/assembly_kernels.cpp | 4 +- reference/distributed/matrix_kernels.cpp | 2 +- reference/distributed/vector_kernels.cpp | 2 +- 22 files changed, 249 insertions(+), 75 deletions(-) diff --git a/common/cuda_hip/distributed/assembly_kernels.cpp b/common/cuda_hip/distributed/assembly_kernels.cpp index fb1a8dbc75d..81478538477 100644 --- a/common/cuda_hip/distributed/assembly_kernels.cpp +++ b/common/cuda_hip/distributed/assembly_kernels.cpp @@ -90,7 +90,7 @@ void count_non_owning_entries( num_parts, local_part, row_part_ptrs.get_data(), send_count.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp index cf3ac70822b..551eafe6c8c 100644 --- a/common/cuda_hip/distributed/matrix_kernels.cpp +++ b/common/cuda_hip/distributed/matrix_kernels.cpp @@ -194,7 +194,7 @@ void separate_local_nonlocal( }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); diff --git a/common/cuda_hip/distributed/vector_kernels.cpp b/common/cuda_hip/distributed/vector_kernels.cpp index 668a721d249..1bacc93489a 100644 --- a/common/cuda_hip/distributed/vector_kernels.cpp +++ b/common/cuda_hip/distributed/vector_kernels.cpp @@ -83,7 +83,7 @@ void build_local( range_id.get_data(), local_mtx->get_values(), is_local_row); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); diff --git a/common/unified/distributed/assembly_kernels.cpp b/common/unified/distributed/assembly_kernels.cpp index a33fca28796..a3ac5207f17 100644 --- a/common/unified/distributed/assembly_kernels.cpp +++ b/common/unified/distributed/assembly_kernels.cpp @@ -48,7 +48,7 @@ void fill_send_buffers( send_values.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_FILL_SEND_BUFFERS); diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 0240dabc7e4..40c7102fe9a 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ 
b/core/device_hooks/common_kernels.inc.cpp @@ -183,7 +183,7 @@ typename GlobalIndexType> \ _macro(ValueType, LocalIndexType, GlobalIndexType) \ GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ - GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(_macro) + GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) #define GKO_STUB_TEMPLATE_TYPE_BASE(_macro) \ template \ diff --git a/core/distributed/assembly.cpp b/core/distributed/assembly.cpp index 116cf83ee94..424e641f845 100644 --- a/core/distributed/assembly.cpp +++ b/core/distributed/assembly.cpp @@ -135,7 +135,7 @@ device_matrix_data assemble_rows_from_neighbors( mpi::communicator comm, \ const device_matrix_data<_value_type, _global_type>& input, \ ptr_param> partition) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_ASSEMBLE_ROWS_FROM_NEIGHBORS); diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index b25b4d2bba8..7320bc27914 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -200,8 +200,8 @@ Matrix::create( template void Matrix::convert_to( - Matrix, local_index_type, - global_index_type>* result) const + Matrix, local_index_type, global_index_type>* + result) const { GKO_ASSERT(this->get_communicator().size() == result->get_communicator().size()); @@ -219,8 +219,8 @@ void Matrix::convert_to( template void Matrix::move_to( - Matrix, local_index_type, - global_index_type>* result) + Matrix, local_index_type, global_index_type>* + result) { GKO_ASSERT(this->get_communicator().size() == result->get_communicator().size()); @@ -701,7 +701,7 @@ Matrix::operator=(Matrix&& other) #define GKO_DECLARE_DISTRIBUTED_MATRIX(ValueType, LocalIndexType, \ GlobalIndexType) \ class Matrix -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_DISTRIBUTED_MATRIX); diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp index 901d2ee1527..965414349d6 100644 --- a/core/distributed/preconditioner/schwarz.cpp +++ b/core/distributed/preconditioner/schwarz.cpp @@ -144,8 +144,7 @@ void Schwarz::generate( #define GKO_DECLARE_SCHWARZ(ValueType, LocalIndexType, GlobalIndexType) \ class Schwarz -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( - GKO_DECLARE_SCHWARZ); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_SCHWARZ); } // namespace preconditioner diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 3c9c77f1b4a..86eb450b888 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -279,7 +279,7 @@ void Vector::fill(const ValueType value) template void Vector::convert_to( - Vector>* result) const + Vector>* result) const { GKO_ASSERT(this->get_communicator().size() == result->get_communicator().size()); @@ -289,7 +289,7 @@ void Vector::convert_to( template -void Vector::move_to(Vector>* result) +void Vector::move_to(Vector>* result) { this->convert_to(result); } @@ -740,7 +740,7 @@ std::unique_ptr> Vector::create_with_type_of_impl( #define GKO_DECLARE_DISTRIBUTED_VECTOR(ValueType) class Vector -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(GKO_DECLARE_DISTRIBUTED_VECTOR); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DISTRIBUTED_VECTOR); } // namespace distributed diff --git a/core/test/mpi/base/bindings.cpp b/core/test/mpi/base/bindings.cpp index cbd13820050..f914075ce60 100644 --- 
a/core/test/mpi/base/bindings.cpp +++ b/core/test/mpi/base/bindings.cpp @@ -24,9 +24,8 @@ class MpiBindings : public ::testing::Test { std::shared_ptr ref; }; -using TestTypes = - gko::test::merge_type_list_t; +using TestTypes = gko::test::merge_type_list_t; TYPED_TEST_SUITE(MpiBindings, TestTypes, TypenameNameGenerator); @@ -762,7 +761,17 @@ void half_sum(void* input, void* output, int* len, MPI_Datatype* datatype) // } // } - +struct sum_op { + template + void operator()(void* input, void* output, int* len, MPI_Datatype* datatype) + { + ValueType* input_ptr = static_cast(input); + ValueType* output_ptr = static_cast(output); + for (int i = 0; i < *len; i++) { + output_ptr[i] += input_ptr[i]; + } + } +}; TYPED_TEST(MpiBindings, CanAllReduceValues) { auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); @@ -780,11 +789,15 @@ TYPED_TEST(MpiBindings, CanAllReduceValues) } MPI_Op operation; MPI_Op_create(&half_sum, 1, &operation); - if (std::is_same_v) { - comm.all_reduce(this->ref, &data, &sum, 1, operation); - } else { - comm.all_reduce(this->ref, &data, &sum, 1, MPI_SUM); - } + // if (std::is_same_v) { + // comm.all_reduce(this->ref, &data, &sum, 1, operation); + // } else { + // gko::experimental::mpi::op_type op(1, MPI_SUM, + // gko::experimental::mpi::detail::sum); + // gko::experimental::mpi::op_type op(1, MPI_SUM, sum_op()); + auto op = gko::experimental::mpi::sum(); + comm.all_reduce(this->ref, &data, &sum, 1, op.get()); + // } // comm.all_reduce(this->ref, &data, &sum, 1, // gko::experimental::mpi::sum()); MPI_Op_free(&operation); @@ -808,8 +821,7 @@ TYPED_TEST(MpiBindings, CanAllReduceValuesInPlace) data = 6; } - comm.all_reduce(this->ref, &data, 1, - gko::experimental::mpi::sum()); + comm.all_reduce(this->ref, &data, 1, MPI_SUM); ASSERT_EQ(data, TypeParam{16}); } @@ -831,8 +843,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValues) data = 6; } - auto req = comm.i_all_reduce(this->ref, &data, &sum, 1, - gko::experimental::mpi::sum()); + auto req = comm.i_all_reduce(this->ref, &data, &sum, 1, MPI_SUM); req.wait(); ASSERT_EQ(sum, TypeParam{16}); @@ -1471,8 +1482,7 @@ TYPED_TEST(MpiBindings, CanScanValues) data = 6; } - comm.scan(this->ref, &data, &sum, 1, - gko::experimental::mpi::sum()); + comm.scan(this->ref, &data, &sum, 1, MPI_SUM); comm.scan(this->ref, &data, &max, 1, MPI_MAX); comm.scan(this->ref, &data, &min, 1, MPI_MIN); @@ -1512,8 +1522,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingScanValues) data = 6; } - auto req1 = comm.i_scan(this->ref, &data, &sum, 1, - gko::experimental::mpi::sum()); + auto req1 = comm.i_scan(this->ref, &data, &sum, 1, MPI_SUM); auto req2 = comm.i_scan(this->ref, &data, &max, 1, MPI_MAX); auto req3 = comm.i_scan(this->ref, &data, &min, 1, MPI_MIN); diff --git a/dpcpp/distributed/assembly_kernels.dp.cpp b/dpcpp/distributed/assembly_kernels.dp.cpp index 3f89c45ff1f..e0cc872b783 100644 --- a/dpcpp/distributed/assembly_kernels.dp.cpp +++ b/dpcpp/distributed/assembly_kernels.dp.cpp @@ -23,7 +23,7 @@ void count_non_owning_entries( array& send_positions, array& original_positions) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp index ec9bc367e5a..47adaaeca59 100644 --- a/dpcpp/distributed/matrix_kernels.dp.cpp +++ b/dpcpp/distributed/matrix_kernels.dp.cpp @@ -27,7 +27,7 @@ void separate_local_nonlocal( array& 
non_local_col_idxs, array& non_local_values) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); diff --git a/dpcpp/distributed/vector_kernels.dp.cpp b/dpcpp/distributed/vector_kernels.dp.cpp index 4f451e2f76b..fdc5dd2e52d 100644 --- a/dpcpp/distributed/vector_kernels.dp.cpp +++ b/dpcpp/distributed/vector_kernels.dp.cpp @@ -22,7 +22,7 @@ void build_local( comm_index_type local_part, matrix::Dense* local_mtx) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 94c5f0263c1..82c6319f6f9 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -111,29 +111,140 @@ inline void half_sum(void* input, void* output, int* len, } } +template +inline void sum(void* input, void* output, int* len, MPI_Datatype* datatype) +{ + ValueType* input_ptr = static_cast(input); + ValueType* output_ptr = static_cast(output); + for (int i = 0; i < *len; i++) { + output_ptr[i] += input_ptr[i]; + } +} + } // namespace detail -template -inline MPI_Op sum() + +using op_manager = std::unique_ptr>; + +template >* = nullptr> +inline op_manager sum() { - return MPI_SUM; + return op_manager([]() { return MPI_SUM; }(), [](MPI_Op op) {}); } -template <> -inline MPI_Op sum() +template >* = nullptr> +inline op_manager sum() { - using handle_manager = - std::unique_ptr>; - static handle_manager mpi_op( + // MPI_Op is MPI_ABI_Op* + return op_manager( []() { - MPI_Op* operation = new MPI_Op; - MPI_Op_create(&detail::half_sum, 1, operation); + MPI_Op operation; + MPI_Op_create(&detail::sum, 1, &operation); + // MPI_Op_create(&detail::half_sum, 1, operation); + std::cout << "custom operator" << std::endl; return operation; }(), - [](MPI_Op* op) { MPI_Op_free(op); }); - return *mpi_op.get(); + [](MPI_Op op) { MPI_Op_free(&op); }); } +/** + * A move-only wrapper for a contiguous MPI_Datatype. + * + * The underlying MPI_Datatype is automatically created and committed when an + * object of this type is constructed, and freed when it is destructed. + */ +template +class op_type { +public: + template + struct mpi_native_type + : std::conditional_t, std::true_type, + std::false_type> {}; + + /** + * Constructs a wrapper for a contiguous MPI_Datatype. + * + * @param count the number of old_type elements the new datatype contains. + * @param old_type the MPI_Datatype that is contained. + */ + template + op_type(int commutativity, MPI_Op default_op, AltFunc&& alt_func) + : custom_(false), handle_(MPI_OP_NULL) + { + custom_ = mpi_native_type::value; + if constexpr (mpi_native_type::value) { + auto op = alt_func.template operator(); + GKO_ASSERT_NO_MPI_ERRORS( + MPI_Op_create(&op, commutativity, &handle_)); + } else { + handle_ = default_op; + } + } + + /** + * Constructs empty wrapper with MPI_OP_NULL. + */ + op_type() : handle_(MPI_OP_NULL) {} + + /** + * Disallow copying of wrapper type. + */ + op_type(const op_type&) = delete; + + /** + * Disallow copying of wrapper type. + */ + op_type& operator=(const op_type&) = delete; + + /** + * Move constructor, leaves other with MPI_OP_NULL. + * + * @param other to be moved from object. 
+ */ + op_type(op_type&& other) noexcept : handle_(MPI_OP_NULL) + { + *this = std::move(other); + } + + /** + * Move assignment, leaves other with MPI_OP_NULL. + * + * @param other to be moved from object. + * + * @return this object. + */ + op_type& operator=(op_type&& other) noexcept + { + if (this != &other) { + this->handle_ = std::exchange(other.handle_, MPI_OP_NULL); + } + return *this; + } + + /** + * Destructs object by freeing wrapped MPI_Datatype. + */ + ~op_type() + { + if (custom_ && handle_ != MPI_OP_NULL) { + MPI_Op_free(&handle_); + } + } + + /** + * Access the underlying MPI_Op. + * + * @return the underlying MPI_Op. + */ + MPI_Op get() const { return handle_; } + +private: + bool custom_; + MPI_Op handle_; +}; + /** * A move-only wrapper for a contiguous MPI_Datatype. * @@ -782,9 +893,22 @@ class communicator { ReduceType* recv_buffer, int count, MPI_Op operation) const { auto guard = exec->get_scoped_device_id_guard(); - GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( - MPI_IN_PLACE, recv_buffer, count, type_impl::get_type(), - operation, this->get())); + if constexpr (std::is_same_v) { + if (operation == MPI_SUM) { + MPI_Op op; + MPI_Op_create(&detail::half_sum, 1, &op); + GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( + MPI_IN_PLACE, recv_buffer, count, + type_impl::get_type(), op, this->get())); + MPI_Op_free(&op); + } else { + GKO_NOT_IMPLEMENTED; + } + } else { + GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( + MPI_IN_PLACE, recv_buffer, count, + type_impl::get_type(), operation, this->get())); + } } /** @@ -809,9 +933,24 @@ class communicator { { auto guard = exec->get_scoped_device_id_guard(); request req; - GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallreduce( - MPI_IN_PLACE, recv_buffer, count, type_impl::get_type(), - operation, this->get(), req.get())); + if constexpr (std::is_same_v) { + if (operation == MPI_SUM) { + MPI_Op op; + MPI_Op_create(&detail::half_sum, 1, &op); + GKO_ASSERT_NO_MPI_ERRORS( + MPI_Iallreduce(MPI_IN_PLACE, recv_buffer, count, + type_impl::get_type(), op, + this->get(), req.get())); + MPI_Op_free(&op); + } else { + GKO_NOT_IMPLEMENTED; + } + } else { + GKO_ASSERT_NO_MPI_ERRORS( + MPI_Iallreduce(MPI_IN_PLACE, recv_buffer, count, + type_impl::get_type(), operation, + this->get(), req.get())); + } return req; } @@ -835,9 +974,22 @@ class communicator { int count, MPI_Op operation) const { auto guard = exec->get_scoped_device_id_guard(); + // if constexpr (std::is_same_v) { + // if (operation == MPI_SUM) { + // MPI_Op op; + // MPI_Op_create(&detail::half_sum, 1, &op); + // GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( + // send_buffer, recv_buffer, count, + // type_impl::get_type(), op, this->get())); + // MPI_Op_free(&op); + // } else { + // GKO_NOT_IMPLEMENTED; + // } + // } else { GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( send_buffer, recv_buffer, count, type_impl::get_type(), operation, this->get())); + // } } /** @@ -863,9 +1015,24 @@ class communicator { { auto guard = exec->get_scoped_device_id_guard(); request req; - GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallreduce( - send_buffer, recv_buffer, count, type_impl::get_type(), - operation, this->get(), req.get())); + if constexpr (std::is_same_v) { + if (operation == MPI_SUM) { + MPI_Op op; + MPI_Op_create(&detail::half_sum, 1, &op); + GKO_ASSERT_NO_MPI_ERRORS( + MPI_Iallreduce(send_buffer, recv_buffer, count, + type_impl::get_type(), op, + this->get(), req.get())); + MPI_Op_free(&op); + } else { + GKO_NOT_IMPLEMENTED; + } + } else { + GKO_ASSERT_NO_MPI_ERRORS( + MPI_Iallreduce(send_buffer, recv_buffer, count, + type_impl::get_type(), 
operation, + this->get(), req.get())); + } return req; } diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index 86c94fc74fb..ceda97c0427 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -267,7 +267,7 @@ class Matrix #endif public DistributedBase { friend class EnablePolymorphicObject; - friend class Matrix, LocalIndexType, + friend class Matrix, LocalIndexType, GlobalIndexType>; friend class multigrid::Pgm; @@ -283,15 +283,15 @@ class Matrix using EnableLinOp::convert_to; using EnableLinOp::move_to; - using ConvertibleTo, LocalIndexType, + using ConvertibleTo, LocalIndexType, GlobalIndexType>>::convert_to; - using ConvertibleTo, LocalIndexType, + using ConvertibleTo, LocalIndexType, GlobalIndexType>>::move_to; - void convert_to(Matrix, local_index_type, + void convert_to(Matrix, local_index_type, global_index_type>* result) const override; - void move_to(Matrix, local_index_type, + void move_to(Matrix, local_index_type, global_index_type>* result) override; #if GINKGO_ENABLE_HALF friend class Matrix>, diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index 80d59beca7d..20ccfb6435e 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -81,8 +81,8 @@ class Vector public: using EnableLinOp::convert_to; using EnableLinOp::move_to; - using ConvertibleTo>>::convert_to; - using ConvertibleTo>>::move_to; + using ConvertibleTo>>::convert_to; + using ConvertibleTo>>::move_to; using value_type = ValueType; using absolute_type = remove_complex; @@ -171,10 +171,9 @@ class Vector void read_distributed(const matrix_data& data, ptr_param> partition); - void convert_to( - Vector>* result) const override; + void convert_to(Vector>* result) const override; - void move_to(Vector>* result) override; + void move_to(Vector>* result) override; #if GINKGO_ENABLE_HALF friend class Vector>>; @@ -690,7 +689,7 @@ template struct conversion_target_helper> { using target_type = experimental::distributed::Vector; using source_type = - experimental::distributed::Vector>; + experimental::distributed::Vector>; static std::unique_ptr create_empty(const source_type* source) { diff --git a/omp/distributed/assembly_kernels.cpp b/omp/distributed/assembly_kernels.cpp index 44c9c908c52..9fa9976e607 100644 --- a/omp/distributed/assembly_kernels.cpp +++ b/omp/distributed/assembly_kernels.cpp @@ -73,7 +73,7 @@ void count_non_owning_entries( num_input_elements); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp index d60b31ac6a8..2f36ec4a778 100644 --- a/omp/distributed/matrix_kernels.cpp +++ b/omp/distributed/matrix_kernels.cpp @@ -149,7 +149,7 @@ void separate_local_nonlocal( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); diff --git a/omp/distributed/vector_kernels.cpp b/omp/distributed/vector_kernels.cpp index 007509f50fd..1ae60ed108e 100644 --- a/omp/distributed/vector_kernels.cpp +++ b/omp/distributed/vector_kernels.cpp @@ -42,7 +42,7 @@ void build_local( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( 
GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); diff --git a/reference/distributed/assembly_kernels.cpp b/reference/distributed/assembly_kernels.cpp index 36c44ca4022..e38680243a0 100644 --- a/reference/distributed/assembly_kernels.cpp +++ b/reference/distributed/assembly_kernels.cpp @@ -67,7 +67,7 @@ void count_non_owning_entries( num_input_elements); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); @@ -97,7 +97,7 @@ void fill_send_buffers( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_FILL_SEND_BUFFERS); diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp index ab0e07070ff..95176b34656 100644 --- a/reference/distributed/matrix_kernels.cpp +++ b/reference/distributed/matrix_kernels.cpp @@ -86,7 +86,7 @@ void separate_local_nonlocal( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); diff --git a/reference/distributed/vector_kernels.cpp b/reference/distributed/vector_kernels.cpp index 1425f1dc9ab..76a8be06a0f 100644 --- a/reference/distributed/vector_kernels.cpp +++ b/reference/distributed/vector_kernels.cpp @@ -40,7 +40,7 @@ void build_local( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); From 1df60284b4583e71f51c69c41d9bd55d7fb9895f Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Mon, 30 Dec 2024 15:47:47 +0100 Subject: [PATCH 3/5] implement and test --- core/test/mpi/base/bindings.cpp | 1046 +++++++++++++++--------------- include/ginkgo/core/base/mpi.hpp | 206 ++---- 2 files changed, 570 insertions(+), 682 deletions(-) diff --git a/core/test/mpi/base/bindings.cpp b/core/test/mpi/base/bindings.cpp index f914075ce60..c7e9e6fc294 100644 --- a/core/test/mpi/base/bindings.cpp +++ b/core/test/mpi/base/bindings.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: BSD-3-Clause #include +#include #include @@ -15,13 +16,62 @@ #include "core/test/utils.hpp" +namespace detail { + + +template +inline void min(void* input, void* output, int* len, MPI_Datatype* datatype) +{ + ValueType* input_ptr = static_cast(input); + ValueType* output_ptr = static_cast(output); + for (int i = 0; i < *len; i++) { + if (input_ptr[i] < output_ptr[i]) { + output_ptr[i] = input_ptr[i]; + } + } +} + + +} // namespace detail + + +using gko::experimental::mpi::op_manager; + +template >* = nullptr> +inline op_manager min() +{ + return op_manager([]() { return MPI_MIN; }(), [](MPI_Op op) {}); +} + +template >* = nullptr> +inline op_manager min() +{ + return op_manager( + []() { + MPI_Op operation; + MPI_Op_create(&detail::min, 1, &operation); + return operation; + }(), + [](MPI_Op op) { MPI_Op_free(&op); }); +} + template class MpiBindings : public ::testing::Test { protected: using value_type = T; - MpiBindings() : ref(gko::ReferenceExecutor::create()) {} + MpiBindings() + : ref(gko::ReferenceExecutor::create()), + sum_op(gko::experimental::mpi::sum()), + max_op(gko::experimental::mpi::max()), + min_op(min()) + {} std::shared_ptr ref; + gko::experimental::mpi::op_manager sum_op; + gko::experimental::mpi::op_manager max_op; + gko::experimental::mpi::op_manager min_op; }; using TestTypes = gko::test::merge_type_list_t(input); 
-    gko::half* output_ptr = static_cast<gko::half*>(output);
-    for (int i = 0; i < *len; i++) {
-        output_ptr[i] += input_ptr[i];
+    // one-sided accumulation only supports native types
+    SKIP_IF_HALF(TypeParam);
+    using window = gko::experimental::mpi::window<TypeParam>;
+    auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    auto my_rank = comm.rank();
+    auto num_ranks = comm.size();
+    std::vector<TypeParam> data;
+    if (my_rank == 0) {
+        data = std::vector<TypeParam>{1, 2, 3, 4};
+    } else if (my_rank == 1) {
+        data = std::vector<TypeParam>{5, 6, 7, 8};
+    } else if (my_rank == 2) {
+        data = std::vector<TypeParam>{9, 10, 11, 12};
+    } else {
+        data = std::vector<TypeParam>{0, 0, 0, 0};
+    }
+
+    {
+        auto win = window(this->ref, data.data(), 4, comm);
+        if (my_rank == 0) {
+            win.lock_all();
+            for (auto rank = 0; rank < num_ranks; ++rank) {
+                if (rank != my_rank) {
+                    win.accumulate(this->ref, data.data(), 4, rank, 0, 4,
+                                   MPI_SUM);
+                }
+            }
+            win.unlock_all();
+        }
+    }
+
+    std::vector<TypeParam> ref;
+    if (my_rank == 0) {
+        ref = std::vector<TypeParam>{1, 2, 3, 4};
+        ASSERT_EQ(data, ref);
+    } else if (my_rank == 1) {
+        ref = std::vector<TypeParam>{6, 8, 10, 12};
+        ASSERT_EQ(data, ref);
+    } else if (my_rank == 2) {
+        ref = std::vector<TypeParam>{10, 12, 14, 16};
+        ASSERT_EQ(data, ref);
+    } else {
+        ref = std::vector<TypeParam>{1, 2, 3, 4};
+        ASSERT_EQ(data, ref);
     }
 }
-// TYPED_TEST(MpiBindings, CanAccumulateValues)
-// {
-//     using window = gko::experimental::mpi::window<TypeParam>;
-//     auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
-//     auto my_rank = comm.rank();
-//     auto num_ranks = comm.size();
-//     std::vector<TypeParam> data;
-//     if (my_rank == 0) {
-//         data = std::vector<TypeParam>{1, 2, 3, 4};
-//     } else if (my_rank == 1) {
-//         data = std::vector<TypeParam>{5, 6, 7, 8};
-//     } else if (my_rank == 2) {
-//         data = std::vector<TypeParam>{9, 10, 11, 12};
-//     } else {
-//         data = std::vector<TypeParam>{0, 0, 0, 0};
-//     }
-//     MPI_Op operation;
-//     MPI_Op_create(&half_sum, 1, &operation);
-//     {
-//         auto win = window(this->ref, data.data(), 4, comm);
-//         if (my_rank == 0) {
-//             win.lock_all();
-//             for (auto rank = 0; rank < num_ranks; ++rank) {
-//                 if (rank != my_rank) {
-//                     if (std::is_same_v<TypeParam, gko::half>) {
-//                         win.accumulate(this->ref, data.data(), 4, rank, 0, 4,
-//                         operation);
-//                     } else {
-//                         win.accumulate(this->ref, data.data(), 4, rank, 0, 4,
-//                         MPI_SUM);
-//                     }
-//                 }
-//             }
-//             win.unlock_all();
-//         }
-//     }
-//     MPI_Op_free(&operation);
-//     std::vector<TypeParam> ref;
-//     if (my_rank == 0) {
-//         ref = std::vector<TypeParam>{1, 2, 3, 4};
-//         ASSERT_EQ(data, ref);
-//     } else if (my_rank == 1) {
-//         ref = std::vector<TypeParam>{6, 8, 10, 12};
-//         ASSERT_EQ(data, ref);
-//     } else if (my_rank == 2) {
-//         ref = std::vector<TypeParam>{10, 12, 14, 16};
-//         ASSERT_EQ(data, ref);
-//     } else {
-//         ref = std::vector<TypeParam>{1, 2, 3, 4};
-//         ASSERT_EQ(data, ref);
-//     }
-// }
-// TYPED_TEST(MpiBindings, CanNonBlockingAccumulateValues)
-// {
-//     using window = gko::experimental::mpi::window<TypeParam>;
-//     auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
-//     auto my_rank = comm.rank();
-//     auto num_ranks = comm.size();
-//     std::vector<TypeParam> data;
-//     if (my_rank == 0) {
-//         data = std::vector<TypeParam>{1, 2, 3, 4};
-//     } else if (my_rank == 1) {
-//         data = std::vector<TypeParam>{5, 6, 7, 8};
-//     } else if (my_rank == 2) {
-//         data = std::vector<TypeParam>{9, 10, 11, 12};
-//     } else {
-//         data = std::vector<TypeParam>{0, 0, 0, 0};
-//     }
-//     gko::experimental::mpi::request req;
-//     {
-//         auto win = window(this->ref, data.data(), 4, comm);
-//         if (my_rank == 0) {
-//             win.lock_all();
-//             for (auto rank = 0; rank < num_ranks; ++rank) {
-//                 if (rank != my_rank) {
-//                     req = win.r_accumulate(this->ref, data.data(), 4, rank,
-//                     0,
-//                     4,
-//                     gko::experimental::mpi::sum<TypeParam>());
-//                 }
-//             }
-// win.unlock_all(); -// } -// } - -// req.wait(); -// std::vector ref; -// if (my_rank == 0) { -// ref = std::vector{1, 2, 3, 4}; -// ASSERT_EQ(data, ref); -// } else if (my_rank == 1) { -// ref = std::vector{6, 8, 10, 12}; -// ASSERT_EQ(data, ref); -// } else if (my_rank == 2) { -// ref = std::vector{10, 12, 14, 16}; -// ASSERT_EQ(data, ref); -// } else { -// ref = std::vector{1, 2, 3, 4}; -// ASSERT_EQ(data, ref); -// } -// } - - -// TYPED_TEST(MpiBindings, CanGetValuesWithLockAll) -// { -// using window = gko::experimental::mpi::window; -// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); -// auto my_rank = comm.rank(); -// auto num_ranks = comm.size(); -// std::vector data; -// if (my_rank == 0) { -// data = std::vector{1, 2, 3, 4}; -// } else { -// data = std::vector{0, 0, 0, 0}; -// } -// auto win = window(this->ref, data.data(), 4, comm); - -// if (my_rank != 0) { -// win.lock_all(); -// win.get(this->ref, data.data(), 4, 0, 0, 4); -// win.unlock_all(); -// } - -// auto ref = std::vector{1, 2, 3, 4}; -// ASSERT_EQ(data, ref); -// } - - -// TYPED_TEST(MpiBindings, CanNonBlockingGetValuesWithLockAll) -// { -// using window = gko::experimental::mpi::window; -// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); -// auto my_rank = comm.rank(); -// auto num_ranks = comm.size(); -// std::vector data; -// if (my_rank == 0) { -// data = std::vector{1, 2, 3, 4}; -// } else { -// data = std::vector{0, 0, 0, 0}; -// } -// gko::experimental::mpi::request req; -// auto win = window(this->ref, data.data(), 4, comm); - -// if (my_rank != 0) { -// win.lock_all(); -// req = win.r_get(this->ref, data.data(), 4, 0, 0, 4); -// win.unlock_all(); -// } - -// req.wait(); -// auto ref = std::vector{1, 2, 3, 4}; -// ASSERT_EQ(data, ref); -// } - - -// TYPED_TEST(MpiBindings, CanGetValuesWithExclusiveLock) -// { -// using window = gko::experimental::mpi::window; -// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); -// auto my_rank = comm.rank(); -// auto num_ranks = comm.size(); -// std::vector data; -// if (my_rank == 0) { -// data = std::vector{1, 2, 3, 4}; -// } else { -// data = std::vector{0, 0, 0, 0}; -// } -// auto win = window(this->ref, data.data(), 4, comm); - -// if (my_rank != 0) { -// win.lock(0, window::lock_type::exclusive); -// win.get(this->ref, data.data(), 4, 0, 0, 4); -// win.unlock(0); -// } - -// auto ref = std::vector{1, 2, 3, 4}; -// ASSERT_EQ(data, ref); -// } - - -// TYPED_TEST(MpiBindings, CanGetValuesWithSharedLock) -// { -// using window = gko::experimental::mpi::window; -// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); -// auto my_rank = comm.rank(); -// auto num_ranks = comm.size(); -// std::vector data; -// if (my_rank == 0) { -// data = std::vector{1, 2, 3, 4}; -// } else { -// data = std::vector{0, 0, 0, 0}; -// } -// auto win = window(this->ref, data.data(), 4, comm); - -// if (my_rank != 0) { -// win.lock(0); -// win.get(this->ref, data.data(), 4, 0, 0, 4); -// win.unlock(0); -// } - -// auto ref = std::vector{1, 2, 3, 4}; -// ASSERT_EQ(data, ref); -// } - - -// TYPED_TEST(MpiBindings, CanGetValuesWithFence) -// { -// using window = gko::experimental::mpi::window; -// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); -// auto my_rank = comm.rank(); -// auto num_ranks = comm.size(); -// std::vector data; -// if (my_rank == 0) { -// data = std::vector{1, 2, 3, 4}; -// } else { -// data = std::vector{0, 0, 0, 0}; -// } -// auto win = window(this->ref, data.data(), 4, comm); - -// 
win.fence(); -// if (my_rank != 0) { -// win.get(this->ref, data.data(), 4, 0, 0, 4); -// } -// win.fence(); - -// auto ref = std::vector{1, 2, 3, 4}; -// ASSERT_EQ(data, ref); -// } - - -// TYPED_TEST(MpiBindings, CanGetAccumulateValuesWithLockAll) -// { -// using window = gko::experimental::mpi::window; -// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); -// auto my_rank = comm.rank(); -// auto num_ranks = comm.size(); -// std::vector data; -// std::vector target; -// std::vector result(4, 0); -// if (my_rank == 0) { -// data = std::vector{1, 2, 3, 4}; -// target = std::vector{1, 2, 3, 4}; -// } else if (my_rank == 1) { -// data = std::vector{5, 6, 7, 8}; -// target = std::vector{5, 6, 7, 8}; -// } else if (my_rank == 2) { -// data = std::vector{9, 10, 11, 12}; -// target = std::vector{9, 10, 11, 12}; -// } else { -// data = std::vector{0, 0, 0, 0}; -// target = std::vector{0, 0, 0, 0}; -// } - -// { -// auto win = window(this->ref, target.data(), 4, comm); - -// if (my_rank == 2) { -// win.lock_all(); -// win.get_accumulate(this->ref, data.data(), 4, result.data(), 4, -// 0, -// 0, 4, -// gko::experimental::mpi::sum()); -// win.unlock_all(); -// } -// } - -// std::vector ref; -// std::vector ref2; -// if (my_rank == 0) { -// ref = std::vector{10, 12, 14, 16}; -// EXPECT_EQ(target, ref); -// } else if (my_rank == 2) { -// ref = std::vector{1, 2, 3, 4}; -// EXPECT_EQ(result, ref); -// } -// } - - -// TYPED_TEST(MpiBindings, CanNonBlockingGetAccumulateValuesWithLockAll) -// { -// using window = gko::experimental::mpi::window; -// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); -// auto my_rank = comm.rank(); -// auto num_ranks = comm.size(); -// std::vector data; -// std::vector target; -// std::vector result(4, 0); -// if (my_rank == 0) { -// data = std::vector{1, 2, 3, 4}; -// target = std::vector{1, 2, 3, 4}; -// } else if (my_rank == 1) { -// data = std::vector{5, 6, 7, 8}; -// target = std::vector{5, 6, 7, 8}; -// } else if (my_rank == 2) { -// data = std::vector{9, 10, 11, 12}; -// target = std::vector{9, 10, 11, 12}; -// } else { -// data = std::vector{0, 0, 0, 0}; -// target = std::vector{0, 0, 0, 0}; -// } -// gko::experimental::mpi::request req; - -// { -// auto win = window(this->ref, target.data(), 4, comm); - -// if (my_rank == 2) { -// win.lock_all(); -// req = win.r_get_accumulate(this->ref, data.data(), 4, -// result.data(), -// 4, 0, 0, 4, -// gko::experimental::mpi::sum()); -// win.unlock_all(); -// } -// } - -// req.wait(); -// std::vector ref; -// std::vector ref2; -// if (my_rank == 0) { -// ref = std::vector{10, 12, 14, 16}; -// ref2 = std::vector{1, 2, 3, 4}; -// EXPECT_EQ(target, ref); -// EXPECT_EQ(data, ref2); -// } else if (my_rank == 2) { -// ref = std::vector{1, 2, 3, 4}; -// ref2 = std::vector{9, 10, 11, 12}; -// EXPECT_EQ(result, ref); -// EXPECT_EQ(target, ref2); -// EXPECT_EQ(data, ref2); -// } -// } - - -// TYPED_TEST(MpiBindings, CanFetchAndOperate) -// { -// using window = gko::experimental::mpi::window; -// auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); -// auto my_rank = comm.rank(); -// auto num_ranks = comm.size(); -// std::vector data; -// std::vector target; -// std::vector result(4, 0); -// if (my_rank == 0) { -// data = std::vector{1, 2, 3, 4}; -// target = std::vector{1, 2, 3, 4}; -// } else if (my_rank == 1) { -// data = std::vector{5, 6, 7, 8}; -// target = std::vector{5, 6, 7, 8}; -// } else if (my_rank == 2) { -// data = std::vector{9, 10, 11, 12}; -// target = std::vector{9, 10, 11, 
12};
-//     } else {
-//         data = std::vector<TypeParam>{0, 0, 0, 0};
-//         target = std::vector<TypeParam>{0, 0, 0, 0};
-//     }
-//     {
-//         auto win = window(this->ref, target.data(), 4, comm);
-//         if (my_rank == 2) {
-//             win.lock_all();
-//             win.fetch_and_op(this->ref, data.data(), result.data(), 0, 1,
-//                              gko::experimental::mpi::sum<TypeParam>());
-//             win.unlock_all();
-//         }
-//     }
-//     std::vector<TypeParam> ref;
-//     std::vector<TypeParam> ref2;
-//     if (my_rank == 0) {
-//         ref = std::vector<TypeParam>{1, 11, 3, 4};
-//         EXPECT_EQ(target, ref);
-//     } else if (my_rank == 2) {
-//         ref = std::vector<TypeParam>{2, 0, 0, 0};
-//         EXPECT_EQ(result, ref);
-//     }
-// }
-// TYPED_TEST(MpiBindings, CanBroadcastValues)
-// {
-//     auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
-//     auto my_rank = comm.rank();
-//     auto num_ranks = comm.size();
-//     auto array = gko::array<TypeParam>{this->ref, 8};
-//     if (my_rank == 0) {
-//         array = gko::array<TypeParam>(this->ref, {2, 3, 1, 3, -1, 0, 3, 1});
-//     }
-//     comm.broadcast(this->ref, array.get_data(), 8, 0);
-//     auto ref = gko::array<TypeParam>(this->ref, {2, 3, 1, 3, -1, 0, 3, 1});
-//     GKO_ASSERT_ARRAY_EQ(ref, array);
-// }
-// TYPED_TEST(MpiBindings, CanNonBlockingBroadcastValues)
-// {
-//     auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
-//     auto my_rank = comm.rank();
-//     auto num_ranks = comm.size();
-//     auto array = gko::array<TypeParam>{this->ref, 8};
-//     if (my_rank == 0) {
-//         array = gko::array<TypeParam>(this->ref, {2, 3, 1, 3, -1, 0, 3, 1});
-//     }
-//     auto req = comm.i_broadcast(this->ref, array.get_data(), 8, 0);
-//     req.wait();
-//     auto ref = gko::array<TypeParam>(this->ref, {2, 3, 1, 3, -1, 0, 3, 1});
-//     GKO_ASSERT_ARRAY_EQ(ref, array);
-// }
-// TYPED_TEST(MpiBindings, CanReduceValues)
-// {
-//     auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
-//     auto my_rank = comm.rank();
-//     auto num_ranks = comm.size();
-//     TypeParam data, sum, max, min;
-//     if (my_rank == 0) {
-//         data = 3;
-//     } else if (my_rank == 1) {
-//         data = 5;
-//     } else if (my_rank == 2) {
-//         data = 2;
-//     } else if (my_rank == 3) {
-//         data = 6;
-//     }
-//     comm.reduce(this->ref, &data, &sum, 1,
-//     gko::experimental::mpi::sum<TypeParam>(), 0); comm.reduce(this->ref,
-//     &data, &max, 1, MPI_MAX, 0); comm.reduce(this->ref, &data, &min, 1,
-//     MPI_MIN, 0);
-//     if (my_rank == 0) {
-//         EXPECT_EQ(sum, TypeParam{16});
-//         EXPECT_EQ(max, TypeParam{6});
-//         EXPECT_EQ(min, TypeParam{2});
-//     }
-// }
-// TYPED_TEST(MpiBindings, CanNonBlockingReduceValues)
-// {
-//     auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
-//     auto my_rank = comm.rank();
-//     auto num_ranks = comm.size();
-//     TypeParam data, sum, max, min;
-//     if (my_rank == 0) {
-//         data = 3;
-//     } else if (my_rank == 1) {
-//         data = 5;
-//     } else if (my_rank == 2) {
-//         data = 2;
-//     } else if (my_rank == 3) {
-//         data = 6;
-//     }
-//     auto req1 = comm.i_reduce(this->ref, &data, &sum, 1,
-//     gko::experimental::mpi::sum<TypeParam>(), 0); auto req2 =
-//     comm.i_reduce(this->ref, &data, &max, 1, MPI_MAX, 0); auto req3 =
-//     comm.i_reduce(this->ref, &data, &min, 1, MPI_MIN, 0);
-//     req1.wait();
-//     req2.wait();
-//     req3.wait();
-//     if (my_rank == 0) {
-//         EXPECT_EQ(sum, TypeParam{16});
-//         EXPECT_EQ(max, TypeParam{6});
-//         EXPECT_EQ(min, TypeParam{2});
-//     }
-// }
-struct sum_op {
-    template <typename ValueType>
-    void operator()(void* input, void* output, int* len, MPI_Datatype* datatype)
+
+TYPED_TEST(MpiBindings, CanNonBlockingAccumulateValues)
+{
+    // one-sided accumulation only supports native types
+    SKIP_IF_HALF(TypeParam);
+    using window = gko::experimental::mpi::window<TypeParam>;
+    auto comm =
gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + std::vector data; + if (my_rank == 0) { + data = std::vector{1, 2, 3, 4}; + } else if (my_rank == 1) { + data = std::vector{5, 6, 7, 8}; + } else if (my_rank == 2) { + data = std::vector{9, 10, 11, 12}; + } else { + data = std::vector{0, 0, 0, 0}; + } + + gko::experimental::mpi::request req; { - ValueType* input_ptr = static_cast(input); - ValueType* output_ptr = static_cast(output); - for (int i = 0; i < *len; i++) { - output_ptr[i] += input_ptr[i]; + auto win = window(this->ref, data.data(), 4, comm); + if (my_rank == 0) { + win.lock_all(); + for (auto rank = 0; rank < num_ranks; ++rank) { + if (rank != my_rank) { + req = win.r_accumulate(this->ref, data.data(), 4, rank, 0, + 4, MPI_SUM); + } + } + win.unlock_all(); } } -}; + + req.wait(); + std::vector ref; + if (my_rank == 0) { + ref = std::vector{1, 2, 3, 4}; + ASSERT_EQ(data, ref); + } else if (my_rank == 1) { + ref = std::vector{6, 8, 10, 12}; + ASSERT_EQ(data, ref); + } else if (my_rank == 2) { + ref = std::vector{10, 12, 14, 16}; + ASSERT_EQ(data, ref); + } else { + ref = std::vector{1, 2, 3, 4}; + ASSERT_EQ(data, ref); + } +} + + +TYPED_TEST(MpiBindings, CanGetValuesWithLockAll) +{ + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + std::vector data; + if (my_rank == 0) { + data = std::vector{1, 2, 3, 4}; + } else { + data = std::vector{0, 0, 0, 0}; + } + auto win = window(this->ref, data.data(), 4, comm); + + if (my_rank != 0) { + win.lock_all(); + win.get(this->ref, data.data(), 4, 0, 0, 4); + win.unlock_all(); + } + + auto ref = std::vector{1, 2, 3, 4}; + ASSERT_EQ(data, ref); +} + + +TYPED_TEST(MpiBindings, CanNonBlockingGetValuesWithLockAll) +{ + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + std::vector data; + if (my_rank == 0) { + data = std::vector{1, 2, 3, 4}; + } else { + data = std::vector{0, 0, 0, 0}; + } + gko::experimental::mpi::request req; + auto win = window(this->ref, data.data(), 4, comm); + + if (my_rank != 0) { + win.lock_all(); + req = win.r_get(this->ref, data.data(), 4, 0, 0, 4); + win.unlock_all(); + } + + req.wait(); + auto ref = std::vector{1, 2, 3, 4}; + ASSERT_EQ(data, ref); +} + + +TYPED_TEST(MpiBindings, CanGetValuesWithExclusiveLock) +{ + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + std::vector data; + if (my_rank == 0) { + data = std::vector{1, 2, 3, 4}; + } else { + data = std::vector{0, 0, 0, 0}; + } + auto win = window(this->ref, data.data(), 4, comm); + + if (my_rank != 0) { + win.lock(0, window::lock_type::exclusive); + win.get(this->ref, data.data(), 4, 0, 0, 4); + win.unlock(0); + } + + auto ref = std::vector{1, 2, 3, 4}; + ASSERT_EQ(data, ref); +} + + +TYPED_TEST(MpiBindings, CanGetValuesWithSharedLock) +{ + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + std::vector data; + if (my_rank == 0) { + data = std::vector{1, 2, 3, 4}; + } else { + data = std::vector{0, 0, 0, 0}; + } + auto win = window(this->ref, data.data(), 4, comm); + + if (my_rank != 0) { + 
win.lock(0);
+        win.get(this->ref, data.data(), 4, 0, 0, 4);
+        win.unlock(0);
+    }
+
+    auto ref = std::vector<TypeParam>{1, 2, 3, 4};
+    ASSERT_EQ(data, ref);
+}
+
+
+TYPED_TEST(MpiBindings, CanGetValuesWithFence)
+{
+    using window = gko::experimental::mpi::window<TypeParam>;
+    auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    auto my_rank = comm.rank();
+    auto num_ranks = comm.size();
+    std::vector<TypeParam> data;
+    if (my_rank == 0) {
+        data = std::vector<TypeParam>{1, 2, 3, 4};
+    } else {
+        data = std::vector<TypeParam>{0, 0, 0, 0};
+    }
+    auto win = window(this->ref, data.data(), 4, comm);
+
+    win.fence();
+    if (my_rank != 0) {
+        win.get(this->ref, data.data(), 4, 0, 0, 4);
+    }
+    win.fence();
+
+    auto ref = std::vector<TypeParam>{1, 2, 3, 4};
+    ASSERT_EQ(data, ref);
+}
+
+
+TYPED_TEST(MpiBindings, CanGetAccumulateValuesWithLockAll)
+{
+    // one-sided accumulation only supports native types
+    SKIP_IF_HALF(TypeParam);
+    using window = gko::experimental::mpi::window<TypeParam>;
+    auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    auto my_rank = comm.rank();
+    auto num_ranks = comm.size();
+    std::vector<TypeParam> data;
+    std::vector<TypeParam> target;
+    std::vector<TypeParam> result(4, 0);
+    if (my_rank == 0) {
+        data = std::vector<TypeParam>{1, 2, 3, 4};
+        target = std::vector<TypeParam>{1, 2, 3, 4};
+    } else if (my_rank == 1) {
+        data = std::vector<TypeParam>{5, 6, 7, 8};
+        target = std::vector<TypeParam>{5, 6, 7, 8};
+    } else if (my_rank == 2) {
+        data = std::vector<TypeParam>{9, 10, 11, 12};
+        target = std::vector<TypeParam>{9, 10, 11, 12};
+    } else {
+        data = std::vector<TypeParam>{0, 0, 0, 0};
+        target = std::vector<TypeParam>{0, 0, 0, 0};
+    }
+
+    {
+        auto win = window(this->ref, target.data(), 4, comm);
+
+        if (my_rank == 2) {
+            win.lock_all();
+            win.get_accumulate(this->ref, data.data(), 4, result.data(), 4, 0,
+                               0, 4, MPI_SUM);
+            win.unlock_all();
+        }
+    }
+
+    std::vector<TypeParam> ref;
+    std::vector<TypeParam> ref2;
+    if (my_rank == 0) {
+        ref = std::vector<TypeParam>{10, 12, 14, 16};
+        EXPECT_EQ(target, ref);
+    } else if (my_rank == 2) {
+        ref = std::vector<TypeParam>{1, 2, 3, 4};
+        EXPECT_EQ(result, ref);
+    }
+}
+
+
+TYPED_TEST(MpiBindings, CanNonBlockingGetAccumulateValuesWithLockAll)
+{
+    // one-sided accumulation only supports native types
+    SKIP_IF_HALF(TypeParam);
+    using window = gko::experimental::mpi::window<TypeParam>;
+    auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    auto my_rank = comm.rank();
+    auto num_ranks = comm.size();
+    std::vector<TypeParam> data;
+    std::vector<TypeParam> target;
+    std::vector<TypeParam> result(4, 0);
+    if (my_rank == 0) {
+        data = std::vector<TypeParam>{1, 2, 3, 4};
+        target = std::vector<TypeParam>{1, 2, 3, 4};
+    } else if (my_rank == 1) {
+        data = std::vector<TypeParam>{5, 6, 7, 8};
+        target = std::vector<TypeParam>{5, 6, 7, 8};
+    } else if (my_rank == 2) {
+        data = std::vector<TypeParam>{9, 10, 11, 12};
+        target = std::vector<TypeParam>{9, 10, 11, 12};
+    } else {
+        data = std::vector<TypeParam>{0, 0, 0, 0};
+        target = std::vector<TypeParam>{0, 0, 0, 0};
+    }
+    gko::experimental::mpi::request req;
+
+    {
+        auto win = window(this->ref, target.data(), 4, comm);
+
+        if (my_rank == 2) {
+            win.lock_all();
+            req = win.r_get_accumulate(this->ref, data.data(), 4, result.data(),
+                                       4, 0, 0, 4, MPI_SUM);
+            win.unlock_all();
+        }
+    }
+
+    req.wait();
+    std::vector<TypeParam> ref;
+    std::vector<TypeParam> ref2;
+    if (my_rank == 0) {
+        ref = std::vector<TypeParam>{10, 12, 14, 16};
+        ref2 = std::vector<TypeParam>{1, 2, 3, 4};
+        EXPECT_EQ(target, ref);
+        EXPECT_EQ(data, ref2);
+    } else if (my_rank == 2) {
+        ref = std::vector<TypeParam>{1, 2, 3, 4};
+        ref2 = std::vector<TypeParam>{9, 10, 11, 12};
+        EXPECT_EQ(result, ref);
+        EXPECT_EQ(target, ref2);
+        EXPECT_EQ(data, ref2);
+    }
+}
+
+
+TYPED_TEST(MpiBindings, CanFetchAndOperate)
+{
+    // one-sided operation only supports native types
+    SKIP_IF_HALF(TypeParam);
+    using window =
gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + std::vector data; + std::vector target; + std::vector result(4, 0); + if (my_rank == 0) { + data = std::vector{1, 2, 3, 4}; + target = std::vector{1, 2, 3, 4}; + } else if (my_rank == 1) { + data = std::vector{5, 6, 7, 8}; + target = std::vector{5, 6, 7, 8}; + } else if (my_rank == 2) { + data = std::vector{9, 10, 11, 12}; + target = std::vector{9, 10, 11, 12}; + } else { + data = std::vector{0, 0, 0, 0}; + target = std::vector{0, 0, 0, 0}; + } + + { + auto win = window(this->ref, target.data(), 4, comm); + + if (my_rank == 2) { + win.lock_all(); + win.fetch_and_op(this->ref, data.data(), result.data(), 0, 1, + MPI_SUM); + win.unlock_all(); + } + } + + std::vector ref; + std::vector ref2; + if (my_rank == 0) { + ref = std::vector{1, 11, 3, 4}; + EXPECT_EQ(target, ref); + } else if (my_rank == 2) { + ref = std::vector{2, 0, 0, 0}; + EXPECT_EQ(result, ref); + } +} + + +TYPED_TEST(MpiBindings, CanBroadcastValues) +{ + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + auto array = gko::array{this->ref, 8}; + if (my_rank == 0) { + array = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); + } + + comm.broadcast(this->ref, array.get_data(), 8, 0); + + auto ref = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); + GKO_ASSERT_ARRAY_EQ(ref, array); +} + + +TYPED_TEST(MpiBindings, CanNonBlockingBroadcastValues) +{ + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + auto array = gko::array{this->ref, 8}; + if (my_rank == 0) { + array = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); + } + + auto req = comm.i_broadcast(this->ref, array.get_data(), 8, 0); + + req.wait(); + auto ref = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); + GKO_ASSERT_ARRAY_EQ(ref, array); +} + + +TYPED_TEST(MpiBindings, CanReduceValues) +{ + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + TypeParam data, sum, max, min; + if (my_rank == 0) { + data = 3; + } else if (my_rank == 1) { + data = 5; + } else if (my_rank == 2) { + data = 2; + } else if (my_rank == 3) { + data = 6; + } + + comm.reduce(this->ref, &data, &sum, 1, this->sum_op.get(), 0); + comm.reduce(this->ref, &data, &max, 1, this->max_op.get(), 0); + comm.reduce(this->ref, &data, &min, 1, this->min_op.get(), 0); + + if (my_rank == 0) { + EXPECT_EQ(sum, TypeParam{16}); + EXPECT_EQ(max, TypeParam{6}); + EXPECT_EQ(min, TypeParam{2}); + } +} + + +TYPED_TEST(MpiBindings, CanNonBlockingReduceValues) +{ + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + auto my_rank = comm.rank(); + auto num_ranks = comm.size(); + TypeParam data, sum, max, min; + if (my_rank == 0) { + data = 3; + } else if (my_rank == 1) { + data = 5; + } else if (my_rank == 2) { + data = 2; + } else if (my_rank == 3) { + data = 6; + } + + auto req1 = comm.i_reduce(this->ref, &data, &sum, 1, this->sum_op.get(), 0); + auto req2 = comm.i_reduce(this->ref, &data, &max, 1, this->max_op.get(), 0); + auto req3 = comm.i_reduce(this->ref, &data, &min, 1, this->min_op.get(), 0); + + req1.wait(); + req2.wait(); + req3.wait(); + if (my_rank == 0) { + EXPECT_EQ(sum, TypeParam{16}); + EXPECT_EQ(max, TypeParam{6}); + EXPECT_EQ(min, TypeParam{2}); + } +} + + TYPED_TEST(MpiBindings, CanAllReduceValues) { 
auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); @@ -787,20 +814,9 @@ TYPED_TEST(MpiBindings, CanAllReduceValues) } else if (my_rank == 3) { data = 6; } - MPI_Op operation; - MPI_Op_create(&half_sum, 1, &operation); - // if (std::is_same_v) { - // comm.all_reduce(this->ref, &data, &sum, 1, operation); - // } else { - // gko::experimental::mpi::op_type op(1, MPI_SUM, - // gko::experimental::mpi::detail::sum); - // gko::experimental::mpi::op_type op(1, MPI_SUM, sum_op()); - auto op = gko::experimental::mpi::sum(); - comm.all_reduce(this->ref, &data, &sum, 1, op.get()); - // } - // comm.all_reduce(this->ref, &data, &sum, 1, - // gko::experimental::mpi::sum()); - MPI_Op_free(&operation); + + comm.all_reduce(this->ref, &data, &sum, 1, this->sum_op.get()); + ASSERT_EQ(sum, TypeParam{16}); } @@ -821,7 +837,7 @@ TYPED_TEST(MpiBindings, CanAllReduceValuesInPlace) data = 6; } - comm.all_reduce(this->ref, &data, 1, MPI_SUM); + comm.all_reduce(this->ref, &data, 1, this->sum_op.get()); ASSERT_EQ(data, TypeParam{16}); } @@ -843,7 +859,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValues) data = 6; } - auto req = comm.i_all_reduce(this->ref, &data, &sum, 1, MPI_SUM); + auto req = comm.i_all_reduce(this->ref, &data, &sum, 1, this->sum_op.get()); req.wait(); ASSERT_EQ(sum, TypeParam{16}); @@ -866,7 +882,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValuesInPlace) data = 6; } - auto req = comm.i_all_reduce(this->ref, &data, 1, MPI_SUM); + auto req = comm.i_all_reduce(this->ref, &data, 1, this->sum_op.get()); req.wait(); ASSERT_EQ(data, TypeParam{16}); @@ -1482,9 +1498,9 @@ TYPED_TEST(MpiBindings, CanScanValues) data = 6; } - comm.scan(this->ref, &data, &sum, 1, MPI_SUM); - comm.scan(this->ref, &data, &max, 1, MPI_MAX); - comm.scan(this->ref, &data, &min, 1, MPI_MIN); + comm.scan(this->ref, &data, &sum, 1, this->sum_op.get()); + comm.scan(this->ref, &data, &max, 1, this->max_op.get()); + comm.scan(this->ref, &data, &min, 1, this->min_op.get()); if (my_rank == 0) { EXPECT_EQ(sum, TypeParam{3}); @@ -1522,9 +1538,9 @@ TYPED_TEST(MpiBindings, CanNonBlockingScanValues) data = 6; } - auto req1 = comm.i_scan(this->ref, &data, &sum, 1, MPI_SUM); - auto req2 = comm.i_scan(this->ref, &data, &max, 1, MPI_MAX); - auto req3 = comm.i_scan(this->ref, &data, &min, 1, MPI_MIN); + auto req1 = comm.i_scan(this->ref, &data, &sum, 1, this->sum_op.get()); + auto req2 = comm.i_scan(this->ref, &data, &max, 1, this->max_op.get()); + auto req3 = comm.i_scan(this->ref, &data, &min, 1, this->min_op.get()); req1.wait(); req2.wait(); diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 82c6319f6f9..5642abbd4d7 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -101,30 +101,35 @@ GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_DOUBLE_COMPLEX); namespace detail { -inline void half_sum(void* input, void* output, int* len, - MPI_Datatype* datatype) + +template +inline void sum(void* input, void* output, int* len, MPI_Datatype* datatype) { - gko::half* input_ptr = static_cast(input); - gko::half* output_ptr = static_cast(output); + ValueType* input_ptr = static_cast(input); + ValueType* output_ptr = static_cast(output); for (int i = 0; i < *len; i++) { output_ptr[i] += input_ptr[i]; } } template -inline void sum(void* input, void* output, int* len, MPI_Datatype* datatype) +inline void max(void* input, void* output, int* len, MPI_Datatype* datatype) { ValueType* input_ptr = static_cast(input); ValueType* output_ptr = static_cast(output); for (int i = 
0; i < *len; i++) { - output_ptr[i] += input_ptr[i]; + if (input_ptr[i] > output_ptr[i]) { + output_ptr[i] = input_ptr[i]; + } } } + } // namespace detail -using op_manager = std::unique_ptr>; +using op_manager = std::unique_ptr::element_type, + std::function>; template >* = nullptr> @@ -137,113 +142,36 @@ template >* = nullptr> inline op_manager sum() { - // MPI_Op is MPI_ABI_Op* return op_manager( []() { MPI_Op operation; MPI_Op_create(&detail::sum, 1, &operation); - // MPI_Op_create(&detail::half_sum, 1, operation); - std::cout << "custom operator" << std::endl; return operation; }(), [](MPI_Op op) { MPI_Op_free(&op); }); } -/** - * A move-only wrapper for a contiguous MPI_Datatype. - * - * The underlying MPI_Datatype is automatically created and committed when an - * object of this type is constructed, and freed when it is destructed. - */ -template -class op_type { -public: - template - struct mpi_native_type - : std::conditional_t, std::true_type, - std::false_type> {}; - - /** - * Constructs a wrapper for a contiguous MPI_Datatype. - * - * @param count the number of old_type elements the new datatype contains. - * @param old_type the MPI_Datatype that is contained. - */ - template - op_type(int commutativity, MPI_Op default_op, AltFunc&& alt_func) - : custom_(false), handle_(MPI_OP_NULL) - { - custom_ = mpi_native_type::value; - if constexpr (mpi_native_type::value) { - auto op = alt_func.template operator(); - GKO_ASSERT_NO_MPI_ERRORS( - MPI_Op_create(&op, commutativity, &handle_)); - } else { - handle_ = default_op; - } - } - - /** - * Constructs empty wrapper with MPI_OP_NULL. - */ - op_type() : handle_(MPI_OP_NULL) {} - - /** - * Disallow copying of wrapper type. - */ - op_type(const op_type&) = delete; - - /** - * Disallow copying of wrapper type. - */ - op_type& operator=(const op_type&) = delete; - - /** - * Move constructor, leaves other with MPI_OP_NULL. - * - * @param other to be moved from object. - */ - op_type(op_type&& other) noexcept : handle_(MPI_OP_NULL) - { - *this = std::move(other); - } - - /** - * Move assignment, leaves other with MPI_OP_NULL. - * - * @param other to be moved from object. - * - * @return this object. - */ - op_type& operator=(op_type&& other) noexcept - { - if (this != &other) { - this->handle_ = std::exchange(other.handle_, MPI_OP_NULL); - } - return *this; - } - /** - * Destructs object by freeing wrapped MPI_Datatype. - */ - ~op_type() - { - if (custom_ && handle_ != MPI_OP_NULL) { - MPI_Op_free(&handle_); - } - } +template >* = nullptr> +inline op_manager max() +{ + return op_manager([]() { return MPI_MAX; }(), [](MPI_Op op) {}); +} - /** - * Access the underlying MPI_Op. - * - * @return the underlying MPI_Op. - */ - MPI_Op get() const { return handle_; } +template >* = nullptr> +inline op_manager max() +{ + return op_manager( + []() { + MPI_Op operation; + MPI_Op_create(&detail::max, 1, &operation); + return operation; + }(), + [](MPI_Op op) { MPI_Op_free(&op); }); +} -private: - bool custom_; - MPI_Op handle_; -}; /** * A move-only wrapper for a contiguous MPI_Datatype. 
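
A minimal usage sketch of these factories (hypothetical code, not part of the
diffs above; it assumes a communicator comm and a reference executor exec, as
in the tests): for an MPI-native type the manager simply hands out MPI_SUM
with a no-op deleter, while for gko::half it owns an operation created through
MPI_Op_create and releases it with MPI_Op_free once the manager is destroyed.

    // gko::half is not MPI-native, so this selects the custom operation
    auto sum_op = gko::experimental::mpi::sum<gko::half>();
    gko::half data{1};
    // in-place all-reduce: every rank contributes data and receives the sum
    comm.all_reduce(exec, &data, 1, sum_op.get());
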
@@ -893,22 +821,9 @@ class communicator { ReduceType* recv_buffer, int count, MPI_Op operation) const { auto guard = exec->get_scoped_device_id_guard(); - if constexpr (std::is_same_v) { - if (operation == MPI_SUM) { - MPI_Op op; - MPI_Op_create(&detail::half_sum, 1, &op); - GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( - MPI_IN_PLACE, recv_buffer, count, - type_impl::get_type(), op, this->get())); - MPI_Op_free(&op); - } else { - GKO_NOT_IMPLEMENTED; - } - } else { - GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( - MPI_IN_PLACE, recv_buffer, count, - type_impl::get_type(), operation, this->get())); - } + GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( + MPI_IN_PLACE, recv_buffer, count, type_impl::get_type(), + operation, this->get())); } /** @@ -933,24 +848,9 @@ class communicator { { auto guard = exec->get_scoped_device_id_guard(); request req; - if constexpr (std::is_same_v) { - if (operation == MPI_SUM) { - MPI_Op op; - MPI_Op_create(&detail::half_sum, 1, &op); - GKO_ASSERT_NO_MPI_ERRORS( - MPI_Iallreduce(MPI_IN_PLACE, recv_buffer, count, - type_impl::get_type(), op, - this->get(), req.get())); - MPI_Op_free(&op); - } else { - GKO_NOT_IMPLEMENTED; - } - } else { - GKO_ASSERT_NO_MPI_ERRORS( - MPI_Iallreduce(MPI_IN_PLACE, recv_buffer, count, - type_impl::get_type(), operation, - this->get(), req.get())); - } + GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallreduce( + MPI_IN_PLACE, recv_buffer, count, type_impl::get_type(), + operation, this->get(), req.get())); return req; } @@ -974,22 +874,9 @@ class communicator { int count, MPI_Op operation) const { auto guard = exec->get_scoped_device_id_guard(); - // if constexpr (std::is_same_v) { - // if (operation == MPI_SUM) { - // MPI_Op op; - // MPI_Op_create(&detail::half_sum, 1, &op); - // GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( - // send_buffer, recv_buffer, count, - // type_impl::get_type(), op, this->get())); - // MPI_Op_free(&op); - // } else { - // GKO_NOT_IMPLEMENTED; - // } - // } else { GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( send_buffer, recv_buffer, count, type_impl::get_type(), operation, this->get())); - // } } /** @@ -1015,24 +902,9 @@ class communicator { { auto guard = exec->get_scoped_device_id_guard(); request req; - if constexpr (std::is_same_v) { - if (operation == MPI_SUM) { - MPI_Op op; - MPI_Op_create(&detail::half_sum, 1, &op); - GKO_ASSERT_NO_MPI_ERRORS( - MPI_Iallreduce(send_buffer, recv_buffer, count, - type_impl::get_type(), op, - this->get(), req.get())); - MPI_Op_free(&op); - } else { - GKO_NOT_IMPLEMENTED; - } - } else { - GKO_ASSERT_NO_MPI_ERRORS( - MPI_Iallreduce(send_buffer, recv_buffer, count, - type_impl::get_type(), operation, - this->get(), req.get())); - } + GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallreduce( + send_buffer, recv_buffer, count, type_impl::get_type(), + operation, this->get(), req.get())); return req; } From bed8415c8571df5c0bd14fc7f98513e52cfa5258 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Mon, 30 Dec 2024 16:56:17 +0100 Subject: [PATCH 4/5] enable vector half --- core/distributed/vector.cpp | 38 ++++++++++++++-------- core/test/utils.hpp | 3 ++ include/ginkgo/core/base/mpi.hpp | 21 ++++++++---- include/ginkgo/core/distributed/vector.hpp | 2 ++ test/mpi/vector.cpp | 19 +++++------ 5 files changed, 53 insertions(+), 30 deletions(-) diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 86eb450b888..732f8d5b3ef 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -64,7 +64,9 @@ Vector::Vector(std::shared_ptr exec, dim<2> local_size, size_type stride) : EnableLinOp{exec, global_size}, DistributedBase{comm}, - local_{exec, local_size, stride} + local_{exec, local_size, stride}, + sum_op_(mpi::sum()), + norm_sum_op_(mpi::sum>()) { GKO_ASSERT_EQUAL_COLS(global_size, local_size); } @@ -75,7 +77,9 @@ Vector::Vector(std::shared_ptr exec, std::unique_ptr local_vector) : EnableLinOp{exec, global_size}, DistributedBase{comm}, - local_{exec} + local_{exec}, + sum_op_(mpi::sum()), + norm_sum_op_(mpi::sum>()) { local_vector->move_to(&local_); } @@ -85,7 +89,11 @@ template Vector::Vector(std::shared_ptr exec, mpi::communicator comm, std::unique_ptr local_vector) - : EnableLinOp{exec, {}}, DistributedBase{comm}, local_{exec} + : EnableLinOp{exec, {}}, + DistributedBase{comm}, + local_{exec}, + sum_op_(mpi::sum()), + norm_sum_op_(mpi::sum>()) { this->set_size(compute_global_size(exec, comm, local_vector->get_size())); local_vector->move_to(&local_); @@ -467,11 +475,11 @@ void Vector::compute_dot(ptr_param b, host_reduction_buffer_->copy_from(dense_res.get()); comm.all_reduce(exec->get_master(), host_reduction_buffer_->get_values(), - static_cast(this->get_size()[1]), MPI_SUM); + static_cast(this->get_size()[1]), sum_op_.get()); dense_res->copy_from(host_reduction_buffer_.get()); } else { comm.all_reduce(exec, dense_res->get_values(), - static_cast(this->get_size()[1]), MPI_SUM); + static_cast(this->get_size()[1]), sum_op_.get()); } } @@ -503,11 +511,11 @@ void Vector::compute_conj_dot(ptr_param b, host_reduction_buffer_->copy_from(dense_res.get()); comm.all_reduce(exec->get_master(), host_reduction_buffer_->get_values(), - static_cast(this->get_size()[1]), MPI_SUM); + static_cast(this->get_size()[1]), sum_op_.get()); dense_res->copy_from(host_reduction_buffer_.get()); } else { comm.all_reduce(exec, dense_res->get_values(), - static_cast(this->get_size()[1]), MPI_SUM); + static_cast(this->get_size()[1]), sum_op_.get()); } } @@ -556,11 +564,13 @@ void Vector::compute_norm1(ptr_param result, host_norm_buffer_.init(exec->get_master(), dense_res->get_size()); host_norm_buffer_->copy_from(dense_res.get()); comm.all_reduce(exec->get_master(), host_norm_buffer_->get_values(), - static_cast(this->get_size()[1]), MPI_SUM); + static_cast(this->get_size()[1]), + norm_sum_op_.get()); dense_res->copy_from(host_norm_buffer_.get()); } else { comm.all_reduce(exec, dense_res->get_values(), - static_cast(this->get_size()[1]), MPI_SUM); + static_cast(this->get_size()[1]), + norm_sum_op_.get()); } } @@ -589,11 +599,13 @@ void Vector::compute_squared_norm2(ptr_param result, host_norm_buffer_.init(exec->get_master(), dense_res->get_size()); host_norm_buffer_->copy_from(dense_res.get()); comm.all_reduce(exec->get_master(), host_norm_buffer_->get_values(), - static_cast(this->get_size()[1]), MPI_SUM); + static_cast(this->get_size()[1]), + norm_sum_op_.get()); dense_res->copy_from(host_norm_buffer_.get()); } else { comm.all_reduce(exec, 
dense_res->get_values(), - static_cast(this->get_size()[1]), MPI_SUM); + static_cast(this->get_size()[1]), + norm_sum_op_.get()); } } @@ -632,10 +644,10 @@ void Vector::compute_mean(ptr_param result, host_reduction_buffer_->copy_from(dense_res.get()); comm.all_reduce(exec->get_master(), host_reduction_buffer_->get_values(), num_vecs, - MPI_SUM); + sum_op_.get()); dense_res->copy_from(host_reduction_buffer_.get()); } else { - comm.all_reduce(exec, dense_res->get_values(), num_vecs, MPI_SUM); + comm.all_reduce(exec, dense_res->get_values(), num_vecs, sum_op_.get()); } } diff --git a/core/test/utils.hpp b/core/test/utils.hpp index e9c4c5e0c99..2c9a570b7fe 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -396,6 +396,9 @@ using TwoValueIndexTypes = add_to_cartesian_type_product_t< using ValueLocalGlobalIndexTypesBase = add_to_cartesian_type_product_left_t; +using ValueLocalGlobalIndexTypes = + add_to_cartesian_type_product_left_t; + template struct reduction_factor { diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 5642abbd4d7..555ab7099b1 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -124,22 +124,31 @@ inline void max(void* input, void* output, int* len, MPI_Datatype* datatype) } } +template +struct is_mpi_native { + constexpr static bool value = + std::is_arithmetic_v || + std::is_same_v> || + std::is_same_v>; +}; + } // namespace detail -using op_manager = std::unique_ptr::element_type, - std::function>; +// using op_manager = std::unique_ptr::element_type, +// std::function>; +using op_manager = std::shared_ptr::element_type>; template >* = nullptr> + std::enable_if_t::value>* = nullptr> inline op_manager sum() { return op_manager([]() { return MPI_SUM; }(), [](MPI_Op op) {}); } template >* = nullptr> + std::enable_if_t::value>* = nullptr> inline op_manager sum() { return op_manager( @@ -153,14 +162,14 @@ inline op_manager sum() template >* = nullptr> + std::enable_if_t::value>* = nullptr> inline op_manager max() { return op_manager([]() { return MPI_MAX; }(), [](MPI_Op op) {}); } template >* = nullptr> + std::enable_if_t::value>* = nullptr> inline op_manager max() { return op_manager( diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index 20ccfb6435e..181a7de3460 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -662,6 +662,8 @@ class Vector local_vector_type local_; ::gko::detail::DenseCache host_reduction_buffer_; ::gko::detail::DenseCache> host_norm_buffer_; + mpi::op_manager sum_op_; + mpi::op_manager norm_sum_op_; }; diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp index a65bbc7fd36..01e58ddd517 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -95,7 +95,7 @@ class VectorCreation : public CommonMpiTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(VectorCreation, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(VectorCreation, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); @@ -361,7 +361,7 @@ class VectorCreationHelpers : public CommonMpiTestFixture { std::unique_ptr dst; }; -TYPED_TEST_SUITE(VectorCreationHelpers, gko::test::ValueTypesBase, +TYPED_TEST_SUITE(VectorCreationHelpers, gko::test::ValueTypes, TypenameNameGenerator); @@ -462,8 +462,7 @@ class VectorReductions : public CommonMpiTestFixture { global_index_type>( size[0], size[1], std::uniform_int_distribution(size[1], size[1]), - std::normal_distribution>(), 
- engine); + std::normal_distribution<>(), engine); dense_x->read(md_x); auto tmp_x = dist_vec_type::create(ref, comm); tmp_x->read_distributed(md_x, part); @@ -473,8 +472,7 @@ class VectorReductions : public CommonMpiTestFixture { global_index_type>( size[0], size[1], std::uniform_int_distribution(size[1], size[1]), - std::normal_distribution>(), - engine); + std::normal_distribution<>(), engine); dense_y->read(md_y); auto tmp_y = dist_vec_type::create(ref, comm); tmp_y->read_distributed(md_y, part); @@ -513,7 +511,7 @@ class VectorReductions : public CommonMpiTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(VectorReductions, gko::test::ValueTypesBase, +TYPED_TEST_SUITE(VectorReductions, gko::test::ValueTypes, TypenameNameGenerator); @@ -799,8 +797,7 @@ class VectorLocalOps : public CommonMpiTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(VectorLocalOps, gko::test::ValueTypesBase, - TypenameNameGenerator); +TYPED_TEST_SUITE(VectorLocalOps, gko::test::ValueTypes, TypenameNameGenerator); TYPED_TEST(VectorLocalOps, ApplyNotSupported) @@ -838,7 +835,7 @@ TYPED_TEST(VectorLocalOps, AdvancedApplyNotSupported) TYPED_TEST(VectorLocalOps, ConvertsToPrecision) { using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision_base; + using OtherT = typename gko::next_precision; using OtherVector = typename gko::experimental::distributed::Vector; auto local_tmp = OtherVector::local_vector_type::create(this->exec); auto tmp = OtherVector::create(this->exec, this->comm); @@ -854,7 +851,7 @@ TYPED_TEST(VectorLocalOps, ConvertsToPrecision) TYPED_TEST(VectorLocalOps, MovesToPrecision) { using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision_base; + using OtherT = typename gko::next_precision; using OtherVector = typename gko::experimental::distributed::Vector; auto local_tmp = OtherVector::local_vector_type::create(this->exec); auto tmp = OtherVector::create(this->exec, this->comm); From f50a7d127cf17ac9157c93ee4cfcb3d182001d47 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 2 Jan 2025 09:29:24 +0100 Subject: [PATCH 5/5] enable half in distributed matrix/vector/pgm/gmres --- core/distributed/vector_cache.cpp | 4 +- core/multigrid/pgm.cpp | 269 +++++++++--------- core/solver/gmres.cpp | 7 +- core/test/mpi/distributed/matrix.cpp | 4 +- .../distributed/preconditioner/schwarz.cpp | 4 +- .../ginkgo/core/base/precision_dispatch.hpp | 151 +++++----- .../test/distributed/assembly_kernels.cpp | 4 +- reference/test/distributed/matrix_kernels.cpp | 4 +- reference/test/distributed/vector_kernels.cpp | 4 +- test/distributed/assembly_kernels.cpp | 4 +- test/distributed/matrix_kernels.cpp | 4 +- test/distributed/vector_kernels.cpp | 4 +- test/mpi/assembly.cpp | 4 +- test/mpi/matrix.cpp | 10 +- test/mpi/multigrid/pgm.cpp | 4 +- test/mpi/preconditioner/schwarz.cpp | 5 +- 16 files changed, 229 insertions(+), 257 deletions(-) diff --git a/core/distributed/vector_cache.cpp b/core/distributed/vector_cache.cpp index 683d18dfd98..acc23ed0fef 100644 --- a/core/distributed/vector_cache.cpp +++ b/core/distributed/vector_cache.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -48,7 +48,7 @@ void VectorCache::init_from( #define GKO_DECLARE_VECTOR_CACHE(_type) class VectorCache<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(GKO_DECLARE_VECTOR_CACHE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_VECTOR_CACHE); } // namespace detail diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index 468a27e8ce4..a6fac943db9 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -389,147 +389,138 @@ void Pgm::generate() #if GINKGO_BUILD_MPI if (std::dynamic_pointer_cast< const experimental::distributed::DistributedBase>(system_matrix_)) { - if constexpr (std::is_same_v, half>) { - GKO_NOT_SUPPORTED(nullptr); - } else { - auto convert_fine_op = [&](auto matrix) { - using global_index_type = typename std::decay_t< - decltype(*matrix)>::result_type::global_index_type; - auto exec = as(matrix)->get_executor(); - auto comm = - as(matrix) - ->get_communicator(); - auto fine = share( - experimental::distributed:: - Matrix::create( - exec, comm, - matrix::Csr::create(exec), - matrix::Csr::create(exec))); - matrix->convert_to(fine); - this->set_fine_op(fine); - }; - auto setup_fine_op = [&](auto matrix) { - // Only support csr matrix currently. - auto local_csr = std::dynamic_pointer_cast( - matrix->get_local_matrix()); - auto non_local_csr = std::dynamic_pointer_cast( - matrix->get_non_local_matrix()); - // If system matrix is not csr or need sorting, generate the - // csr. - if (!parameters_.skip_sorting || !local_csr || !non_local_csr) { - using global_index_type = typename std::decay_t< - decltype(*matrix)>::global_index_type; - convert_fine_op( - as>>(matrix)); - } - }; - - using fst_mtx_type = - experimental::distributed::Matrix; - using snd_mtx_type = - experimental::distributed::Matrix; - // setup the fine op using Csr with current ValueType - // we do not use dispatcher run in the first place because we have - // the fallback option for that. 
- if (auto obj = std::dynamic_pointer_cast( - system_matrix_)) { - setup_fine_op(obj); - } else if (auto obj = std::dynamic_pointer_cast( - system_matrix_)) { - setup_fine_op(obj); - } else { - // handle other ValueTypes. - run(system_matrix_, - convert_fine_op); - } - - auto distributed_setup = [&](auto matrix) { - auto exec = gko::as(matrix)->get_executor(); - auto comm = - gko::as(matrix) - ->get_communicator(); - auto num_rank = comm.size(); - auto pgm_local_op = - gko::as(matrix->get_local_matrix()); - auto result = this->generate_local(pgm_local_op); - - auto non_local_csr = - as(matrix->get_non_local_matrix()); - auto non_local_size = non_local_csr->get_size()[1]; - array non_local_agg(exec, non_local_size); - // get agg information (prolong_row_gather row idx) - communicate(matrix, agg_, non_local_agg); - // generate non_local_col_map - non_local_agg.set_executor(exec->get_master()); - array non_local_col_map(exec->get_master(), - non_local_size); - // add additional entry in tail such that the offset easily - // handle it. - array renumber(exec->get_master(), - non_local_size + 1); - auto recv_offsets = matrix->recv_offsets_; - generate_non_local_map(recv_offsets, non_local_agg, - non_local_col_map, renumber); - - // get new recv_size and recv_offsets - std::vector - new_recv_size(num_rank); - std::vector - new_recv_offsets(num_rank + 1); - array new_recv_gather_idxs(exec->get_master()); - compute_communication(recv_offsets, non_local_agg, renumber, - new_recv_size, new_recv_offsets, - new_recv_gather_idxs); - - non_local_col_map.set_executor(exec); - IndexType non_local_num_agg = new_recv_gather_idxs.get_size(); - // build csr from row and col map - // unlike non-distributed version, generate_coarse uses - // different row and col maps. - auto result_non_local_csr = generate_coarse( - exec, non_local_csr.get(), - static_cast(std::get<1>(result)->get_size()[0]), - agg_, non_local_num_agg, non_local_col_map); - // use local and non-local to build coarse matrix - // also restriction and prolongation (Local-only-global matrix) - auto coarse_size = - static_cast(std::get<1>(result)->get_size()[0]); - comm.all_reduce(exec->get_master(), &coarse_size, 1, MPI_SUM); - new_recv_gather_idxs.set_executor(exec); - - // setup the generated linop. + auto convert_fine_op = [&](auto matrix) { + using global_index_type = typename std::decay_t< + decltype(*matrix)>::result_type::global_index_type; + auto exec = as(matrix)->get_executor(); + auto comm = as(matrix) + ->get_communicator(); + auto fine = share( + experimental::distributed:: + Matrix::create( + exec, comm, + matrix::Csr::create(exec), + matrix::Csr::create(exec))); + matrix->convert_to(fine); + this->set_fine_op(fine); + }; + auto setup_fine_op = [&](auto matrix) { + // Only support csr matrix currently. + auto local_csr = std::dynamic_pointer_cast( + matrix->get_local_matrix()); + auto non_local_csr = std::dynamic_pointer_cast( + matrix->get_non_local_matrix()); + // If system matrix is not csr or need sorting, generate the + // csr. 
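+            // Note: convert_fine_op (above) always rebuilds the fine
+            // operator with Csr local and non-local matrices, so after this
+            // fallback the remaining setup can rely on Csr storage.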
+ if (!parameters_.skip_sorting || !local_csr || !non_local_csr) { using global_index_type = typename std::decay_t::global_index_type; - auto coarse = share( - experimental::distributed:: - Matrix::create( - exec, comm, gko::dim<2>(coarse_size, coarse_size), - std::get<1>(result), result_non_local_csr, - new_recv_size, new_recv_offsets, - new_recv_gather_idxs)); - auto restrict_op = share( - experimental::distributed:: - Matrix::create( - exec, comm, - dim<2>(coarse_size, - gko::as(matrix)->get_size()[0]), - std::get<2>(result))); - auto prolong_op = share( - experimental::distributed:: - Matrix::create( - exec, comm, - dim<2>(gko::as(matrix)->get_size()[0], - coarse_size), - std::get<0>(result))); - this->set_multigrid_level(prolong_op, coarse, restrict_op); - }; - - // the fine op is using csr with the current ValueType - run(this->get_fine_op(), - distributed_setup); + convert_fine_op( + as>>(matrix)); + } + }; + + using fst_mtx_type = + experimental::distributed::Matrix; + using snd_mtx_type = + experimental::distributed::Matrix; + // setup the fine op using Csr with current ValueType + // we do not use dispatcher run in the first place because we have + // the fallback option for that. + if (auto obj = + std::dynamic_pointer_cast(system_matrix_)) { + setup_fine_op(obj); + } else if (auto obj = std::dynamic_pointer_cast( + system_matrix_)) { + setup_fine_op(obj); + } else { + // handle other ValueTypes. + run(system_matrix_, + convert_fine_op); } + + auto distributed_setup = [&](auto matrix) { + auto exec = gko::as(matrix)->get_executor(); + auto comm = + gko::as(matrix) + ->get_communicator(); + auto num_rank = comm.size(); + auto pgm_local_op = + gko::as(matrix->get_local_matrix()); + auto result = this->generate_local(pgm_local_op); + + auto non_local_csr = + as(matrix->get_non_local_matrix()); + auto non_local_size = non_local_csr->get_size()[1]; + array non_local_agg(exec, non_local_size); + // get agg information (prolong_row_gather row idx) + communicate(matrix, agg_, non_local_agg); + // generate non_local_col_map + non_local_agg.set_executor(exec->get_master()); + array non_local_col_map(exec->get_master(), + non_local_size); + // add additional entry in tail such that the offset easily + // handle it. + array renumber(exec->get_master(), non_local_size + 1); + auto recv_offsets = matrix->recv_offsets_; + generate_non_local_map(recv_offsets, non_local_agg, + non_local_col_map, renumber); + + // get new recv_size and recv_offsets + std::vector + new_recv_size(num_rank); + std::vector + new_recv_offsets(num_rank + 1); + array new_recv_gather_idxs(exec->get_master()); + compute_communication(recv_offsets, non_local_agg, renumber, + new_recv_size, new_recv_offsets, + new_recv_gather_idxs); + + non_local_col_map.set_executor(exec); + IndexType non_local_num_agg = new_recv_gather_idxs.get_size(); + // build csr from row and col map + // unlike non-distributed version, generate_coarse uses + // different row and col maps. + auto result_non_local_csr = generate_coarse( + exec, non_local_csr.get(), + static_cast(std::get<1>(result)->get_size()[0]), + agg_, non_local_num_agg, non_local_col_map); + // use local and non-local to build coarse matrix + // also restriction and prolongation (Local-only-global matrix) + auto coarse_size = + static_cast(std::get<1>(result)->get_size()[0]); + comm.all_reduce(exec->get_master(), &coarse_size, 1, MPI_SUM); + new_recv_gather_idxs.set_executor(exec); + + // setup the generated linop. 
+ using global_index_type = + typename std::decay_t::global_index_type; + auto coarse = share( + experimental::distributed:: + Matrix::create( + exec, comm, gko::dim<2>(coarse_size, coarse_size), + std::get<1>(result), result_non_local_csr, + new_recv_size, new_recv_offsets, new_recv_gather_idxs)); + auto restrict_op = share( + experimental::distributed:: + Matrix::create( + exec, comm, + dim<2>(coarse_size, + gko::as(matrix)->get_size()[0]), + std::get<2>(result))); + auto prolong_op = share( + experimental::distributed:: + Matrix::create( + exec, comm, + dim<2>(gko::as(matrix)->get_size()[0], + coarse_size), + std::get<0>(result))); + this->set_multigrid_level(prolong_op, coarse, restrict_op); + }; + + // the fine op is using csr with the current ValueType + run(this->get_fine_op(), distributed_setup); } else #endif // GINKGO_BUILD_MPI { diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index e066fc696a1..067d7d7aad2 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -196,17 +196,18 @@ void finish_reduce(matrix::Dense* hessenberg_iter, auto hessenberg_reduce = hessenberg_iter->create_submatrix( span{0, restart_iter + 1}, span{0, num_rhs}); int message_size = static_cast((restart_iter + 1) * num_rhs); + auto sum_op = gko::experimental::mpi::sum(); if (experimental::mpi::requires_host_buffer(exec, comm)) { ::gko::detail::DenseCache host_reduction_buffer; host_reduction_buffer.init(exec->get_master(), hessenberg_reduce->get_size()); host_reduction_buffer->copy_from(hessenberg_reduce); comm.all_reduce(exec->get_master(), host_reduction_buffer->get_values(), - message_size, MPI_SUM); + message_size, sum_op.get()); hessenberg_reduce->copy_from(host_reduction_buffer.get()); } else { comm.all_reduce(exec, hessenberg_reduce->get_values(), message_size, - MPI_SUM); + sum_op.get()); } } #endif diff --git a/core/test/mpi/distributed/matrix.cpp b/core/test/mpi/distributed/matrix.cpp index 26a551b5758..efc929a19c5 100644 --- a/core/test/mpi/distributed/matrix.cpp +++ b/core/test/mpi/distributed/matrix.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -177,7 +177,7 @@ class MatrixBuilder : public ::testing::Test { gko::experimental::mpi::communicator comm; }; -TYPED_TEST_SUITE(MatrixBuilder, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(MatrixBuilder, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/core/test/mpi/distributed/preconditioner/schwarz.cpp b/core/test/mpi/distributed/preconditioner/schwarz.cpp index b55ec6a80ce..1cf3f04d311 100644 --- a/core/test/mpi/distributed/preconditioner/schwarz.cpp +++ b/core/test/mpi/distributed/preconditioner/schwarz.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -65,7 +65,7 @@ class SchwarzFactory : public ::testing::Test { std::shared_ptr mtx; }; -TYPED_TEST_SUITE(SchwarzFactory, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(SchwarzFactory, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index 
29aa4bfcab1..ffbe0645484 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -340,7 +340,8 @@ gko::detail::temporary_conversion> make_temporary_conversion( { auto result = gko::detail::temporary_conversion>::template create< - Vector>>(matrix); + Vector>, + Vector>>>(matrix); if (!result) { GKO_NOT_SUPPORTED(matrix); } @@ -356,7 +357,9 @@ gko::detail::temporary_conversion> make_temporary_conversion(const LinOp* matrix) { auto result = gko::detail::temporary_conversion>:: - template create>>(matrix); + template create>, + Vector>>>( + matrix); if (!result) { GKO_NOT_SUPPORTED(matrix); } @@ -381,11 +384,7 @@ make_temporary_conversion(const LinOp* matrix) template void precision_dispatch(Function fn, Args*... linops) { - if constexpr (std::is_same_v, half>) { - GKO_NOT_SUPPORTED(nullptr); - } else { - fn(distributed::make_temporary_conversion(linops).get()...); - } + fn(distributed::make_temporary_conversion(linops).get()...); } @@ -401,29 +400,23 @@ void precision_dispatch(Function fn, Args*... linops) template void precision_dispatch_real_complex(Function fn, const LinOp* in, LinOp* out) { - if constexpr (std::is_same_v, half>) { - GKO_NOT_SUPPORTED(nullptr); + auto complex_to_real = !( + is_complex() || + dynamic_cast>*>( + in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>(in); + auto dense_out = + distributed::make_temporary_conversion>(out); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dynamic_cast(dense_in->create_real_view().get()), + dynamic_cast(dense_out->create_real_view().get())); } else { - auto complex_to_real = !( - is_complex() || - dynamic_cast< - const ConvertibleTo>*>(in)); - if (complex_to_real) { - auto dense_in = - distributed::make_temporary_conversion>( - in); - auto dense_out = - distributed::make_temporary_conversion>( - out); - using Vector = experimental::distributed::Vector; - // These dynamic_casts are only needed to make the code compile - // If ValueType is complex, this branch will never be taken - // If ValueType is real, the cast is a no-op - fn(dynamic_cast(dense_in->create_real_view().get()), - dynamic_cast(dense_out->create_real_view().get())); - } else { - distributed::precision_dispatch(fn, in, out); - } + distributed::precision_dispatch(fn, in, out); } } @@ -435,33 +428,27 @@ template void precision_dispatch_real_complex(Function fn, const LinOp* alpha, const LinOp* in, LinOp* out) { - if constexpr (std::is_same_v, half>) { - GKO_NOT_SUPPORTED(nullptr); + auto complex_to_real = !( + is_complex() || + dynamic_cast>*>( + in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>(in); + auto dense_out = + distributed::make_temporary_conversion>(out); + auto dense_alpha = gko::make_temporary_conversion(alpha); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dense_alpha.get(), + dynamic_cast(dense_in->create_real_view().get()), + dynamic_cast(dense_out->create_real_view().get())); } 
else { - auto complex_to_real = !( - is_complex() || - dynamic_cast< - const ConvertibleTo>*>(in)); - if (complex_to_real) { - auto dense_in = - distributed::make_temporary_conversion>( - in); - auto dense_out = - distributed::make_temporary_conversion>( - out); - auto dense_alpha = gko::make_temporary_conversion(alpha); - using Vector = experimental::distributed::Vector; - // These dynamic_casts are only needed to make the code compile - // If ValueType is complex, this branch will never be taken - // If ValueType is real, the cast is a no-op - fn(dense_alpha.get(), - dynamic_cast(dense_in->create_real_view().get()), - dynamic_cast(dense_out->create_real_view().get())); - } else { - fn(gko::make_temporary_conversion(alpha).get(), - distributed::make_temporary_conversion(in).get(), - distributed::make_temporary_conversion(out).get()); - } + fn(gko::make_temporary_conversion(alpha).get(), + distributed::make_temporary_conversion(in).get(), + distributed::make_temporary_conversion(out).get()); } } @@ -474,36 +461,30 @@ void precision_dispatch_real_complex(Function fn, const LinOp* alpha, const LinOp* in, const LinOp* beta, LinOp* out) { - if constexpr (std::is_same_v, half>) { - GKO_NOT_SUPPORTED(nullptr); + auto complex_to_real = !( + is_complex() || + dynamic_cast>*>( + in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>(in); + auto dense_out = + distributed::make_temporary_conversion>(out); + auto dense_alpha = gko::make_temporary_conversion(alpha); + auto dense_beta = gko::make_temporary_conversion(beta); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dense_alpha.get(), + dynamic_cast(dense_in->create_real_view().get()), + dense_beta.get(), + dynamic_cast(dense_out->create_real_view().get())); } else { - auto complex_to_real = !( - is_complex() || - dynamic_cast< - const ConvertibleTo>*>(in)); - if (complex_to_real) { - auto dense_in = - distributed::make_temporary_conversion>( - in); - auto dense_out = - distributed::make_temporary_conversion>( - out); - auto dense_alpha = gko::make_temporary_conversion(alpha); - auto dense_beta = gko::make_temporary_conversion(beta); - using Vector = experimental::distributed::Vector; - // These dynamic_casts are only needed to make the code compile - // If ValueType is complex, this branch will never be taken - // If ValueType is real, the cast is a no-op - fn(dense_alpha.get(), - dynamic_cast(dense_in->create_real_view().get()), - dense_beta.get(), - dynamic_cast(dense_out->create_real_view().get())); - } else { - fn(gko::make_temporary_conversion(alpha).get(), - distributed::make_temporary_conversion(in).get(), - gko::make_temporary_conversion(beta).get(), - distributed::make_temporary_conversion(out).get()); - } + fn(gko::make_temporary_conversion(alpha).get(), + distributed::make_temporary_conversion(in).get(), + gko::make_temporary_conversion(beta).get(), + distributed::make_temporary_conversion(out).get()); } } diff --git a/reference/test/distributed/assembly_kernels.cpp b/reference/test/distributed/assembly_kernels.cpp index 4823f465a31..57981d254e5 100644 --- a/reference/test/distributed/assembly_kernels.cpp +++ b/reference/test/distributed/assembly_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: 
BSD-3-Clause @@ -48,7 +48,7 @@ class AssemblyHelpers : public ::testing::Test { gko::array mapping; }; -TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp index b5e2e3f5fb9..40a9b15e0ec 100644 --- a/reference/test/distributed/matrix_kernels.cpp +++ b/reference/test/distributed/matrix_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -182,7 +182,7 @@ class Matrix : public ::testing::Test { gko::array non_local_values; }; -TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/reference/test/distributed/vector_kernels.cpp b/reference/test/distributed/vector_kernels.cpp index 43f11967488..4e03cc6995e 100644 --- a/reference/test/distributed/vector_kernels.cpp +++ b/reference/test/distributed/vector_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -69,7 +69,7 @@ class Vector : public ::testing::Test { std::shared_ptr ref; }; -TYPED_TEST_SUITE(Vector, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(Vector, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/test/distributed/assembly_kernels.cpp b/test/distributed/assembly_kernels.cpp index 4ab4c9173ac..f27682cc883 100644 --- a/test/distributed/assembly_kernels.cpp +++ b/test/distributed/assembly_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -100,7 +100,7 @@ class AssemblyHelpers : public CommonTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp index 9a5d4f2cf7b..afae8c130dc 100644 --- a/test/distributed/matrix_kernels.cpp +++ b/test/distributed/matrix_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -87,7 +87,7 @@ class Matrix : public CommonTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/test/distributed/vector_kernels.cpp b/test/distributed/vector_kernels.cpp index a212699a2ca..c195318dde2 100644 --- a/test/distributed/vector_kernels.cpp +++ b/test/distributed/vector_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -67,7 +67,7 @@ class Vector : public CommonTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(Vector, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(Vector, 
gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/test/mpi/assembly.cpp b/test/mpi/assembly.cpp index 3ad47565d44..819f4a47515 100644 --- a/test/mpi/assembly.cpp +++ b/test/mpi/assembly.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -83,7 +83,7 @@ class AssemblyHelpers : public CommonMpiTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(AssemblyHelpers, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index 8e4eeb3921c..80d7aaea364 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -101,7 +101,7 @@ class MatrixCreation : public CommonMpiTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(MatrixCreation, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(MatrixCreation, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); @@ -463,7 +463,7 @@ class Matrix : public CommonMpiTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(Matrix, gko::test::ValueTypesBase, TypenameNameGenerator); +TYPED_TEST_SUITE(Matrix, gko::test::ValueTypes, TypenameNameGenerator); TYPED_TEST(Matrix, CanApplyToSingleVector) @@ -729,7 +729,7 @@ TYPED_TEST(Matrix, CanConvertToNextPrecision) using csr = typename TestFixture::local_matrix_type; using local_index_type = typename TestFixture::local_index_type; using global_index_type = typename TestFixture::global_index_type; - using OtherT = typename gko::next_precision_base; + using OtherT = typename gko::next_precision; using OtherDist = typename gko::experimental::distributed::Matrix< OtherT, local_index_type, global_index_type>; auto tmp = OtherDist::create(this->ref, this->comm); @@ -755,7 +755,7 @@ TYPED_TEST(Matrix, CanMoveToNextPrecision) using csr = typename TestFixture::local_matrix_type; using local_index_type = typename TestFixture::local_index_type; using global_index_type = typename TestFixture::global_index_type; - using OtherT = typename gko::next_precision_base; + using OtherT = typename gko::next_precision; using OtherDist = typename gko::experimental::distributed::Matrix< OtherT, local_index_type, global_index_type>; auto tmp = OtherDist::create(this->ref, this->comm); diff --git a/test/mpi/multigrid/pgm.cpp b/test/mpi/multigrid/pgm.cpp index df198f235c3..3f3638ff7e1 100644 --- a/test/mpi/multigrid/pgm.cpp +++ b/test/mpi/multigrid/pgm.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -80,7 +80,7 @@ class Pgm : public CommonMpiTestFixture { std::shared_ptr dist_mat; }; -TYPED_TEST_SUITE(Pgm, gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(Pgm, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp index 113f8922aae..e1e740c787a 100644 --- a/test/mpi/preconditioner/schwarz.cpp +++ b/test/mpi/preconditioner/schwarz.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The 
Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -143,8 +143,7 @@ class SchwarzPreconditioner : public CommonMpiTestFixture { } }; -TYPED_TEST_SUITE(SchwarzPreconditioner, - gko::test::ValueLocalGlobalIndexTypesBase, +TYPED_TEST_SUITE(SchwarzPreconditioner, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); TYPED_TEST(SchwarzPreconditioner, GenerateFailsIfInvalidState)
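
Note on the custom reduction in the core/solver/gmres.cpp hunk above: the all_reduce calls switch from MPI_SUM to gko::experimental::mpi::sum<ValueType>() because MPI's predefined reduction operations are only defined for MPI's predefined datatypes, while a 16-bit value type such as gko::half is exchanged as a user-defined datatype. The plain-MPI sketch below illustrates the general mechanism such a wrapper presumably builds on; the fixed16 element type, fixed16_sum, and all other names are illustrative assumptions, not Ginkgo's implementation.

    // sketch: user-defined sum reduction for an element type MPI does not
    // know about (compile with mpic++, launch with mpirun)
    #include <mpi.h>
    #include <cstdint>
    #include <cstdio>

    // hypothetical 16-bit element type; value = raw / 256.0
    struct fixed16 {
        std::int16_t raw;
    };

    // element-wise sum with the exact signature MPI_Op_create requires
    void fixed16_sum(void* invec, void* inoutvec, int* len, MPI_Datatype*)
    {
        auto in = static_cast<fixed16*>(invec);
        auto inout = static_cast<fixed16*>(inoutvec);
        for (int i = 0; i < *len; ++i) {
            inout[i].raw = static_cast<std::int16_t>(inout[i].raw + in[i].raw);
        }
    }

    int main(int argc, char** argv)
    {
        MPI_Init(&argc, &argv);
        int rank;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        // describe the 16-bit payload to MPI as an opaque 2-byte datatype
        MPI_Datatype fixed16_type;
        MPI_Type_contiguous(sizeof(fixed16), MPI_BYTE, &fixed16_type);
        MPI_Type_commit(&fixed16_type);

        // MPI_SUM is not defined for this datatype, so register a custom
        // operation; the second argument declares it commutative
        MPI_Op sum_op;
        MPI_Op_create(&fixed16_sum, 1, &sum_op);

        fixed16 local{static_cast<std::int16_t>(256 * (rank + 1))};
        fixed16 total{};
        MPI_Allreduce(&local, &total, 1, fixed16_type, sum_op, MPI_COMM_WORLD);

        if (rank == 0) {
            // prints 1 + 2 + ... + number of ranks
            std::printf("sum = %g\n", total.raw / 256.0);
        }

        MPI_Op_free(&sum_op);
        MPI_Type_free(&fixed16_type);
        MPI_Finalize();
    }

Declaring the operation commutative mirrors what MPI_SUM guarantees for predefined types and lets the library reassociate partial results across ranks; the matching create/free pairing is exactly what a RAII wrapper in the style of mpi::sum() would manage automatically.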