From de5c2a733ec9cd1617e787820901e2a4772dea83 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 29 Oct 2024 16:58:44 +0100 Subject: [PATCH] dispatch with distributed needs to throw with half --- core/distributed/helpers.hpp | 14 +- core/multigrid/pgm.cpp | 266 +++++++++--------- core/solver/multigrid.cpp | 174 ++++++++---- .../ginkgo/core/base/precision_dispatch.hpp | 143 ++++++---- 4 files changed, 350 insertions(+), 247 deletions(-) diff --git a/core/distributed/helpers.hpp b/core/distributed/helpers.hpp index 5536dbe32f0..9ce7d3b6ab4 100644 --- a/core/distributed/helpers.hpp +++ b/core/distributed/helpers.hpp @@ -122,11 +122,15 @@ void vector_dispatch(T* linop, F&& f, Args&&... args) { #if GINKGO_BUILD_MPI if (is_distributed(linop)) { - using type = std::conditional_t< - std::is_const::value, - const experimental::distributed::Vector, - experimental::distributed::Vector>; - f(dynamic_cast(linop), std::forward(args)...); + if constexpr (std::is_same_v, half>) { + GKO_NOT_SUPPORTED(linop); + } else { + using type = std::conditional_t< + std::is_const::value, + const experimental::distributed::Vector, + experimental::distributed::Vector>; + f(dynamic_cast(linop), std::forward(args)...); + } } else #endif { diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index e531fb2b996..d4e4ffde4de 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -389,137 +389,147 @@ void Pgm::generate() #if GINKGO_BUILD_MPI if (std::dynamic_pointer_cast< const experimental::distributed::DistributedBase>(system_matrix_)) { - auto convert_fine_op = [&](auto matrix) { - using global_index_type = typename std::decay_t< - decltype(*matrix)>::result_type::global_index_type; - auto exec = as(matrix)->get_executor(); - auto comm = as(matrix) - ->get_communicator(); - auto fine = share( - experimental::distributed:: - Matrix::create( - exec, comm, - matrix::Csr::create(exec), - matrix::Csr::create(exec))); - matrix->convert_to(fine); - this->set_fine_op(fine); - }; - auto setup_fine_op = [&](auto matrix) { - // Only support csr matrix currently. - auto local_csr = std::dynamic_pointer_cast( - matrix->get_local_matrix()); - auto non_local_csr = std::dynamic_pointer_cast( - matrix->get_non_local_matrix()); - // If system matrix is not csr or need sorting, generate the csr. - if (!parameters_.skip_sorting || !local_csr || !non_local_csr) { + if constexpr (std::is_same_v, half>) { + GKO_NOT_SUPPORTED(nullptr); + } else { + auto convert_fine_op = [&](auto matrix) { + using global_index_type = typename std::decay_t< + decltype(*matrix)>::result_type::global_index_type; + auto exec = as(matrix)->get_executor(); + auto comm = + as(matrix) + ->get_communicator(); + auto fine = share( + experimental::distributed:: + Matrix::create( + exec, comm, + matrix::Csr::create(exec), + matrix::Csr::create(exec))); + matrix->convert_to(fine); + this->set_fine_op(fine); + }; + auto setup_fine_op = [&](auto matrix) { + // Only support csr matrix currently. + auto local_csr = std::dynamic_pointer_cast( + matrix->get_local_matrix()); + auto non_local_csr = std::dynamic_pointer_cast( + matrix->get_non_local_matrix()); + // If system matrix is not csr or need sorting, generate the + // csr. + if (!parameters_.skip_sorting || !local_csr || !non_local_csr) { + using global_index_type = typename std::decay_t< + decltype(*matrix)>::global_index_type; + convert_fine_op( + as>>(matrix)); + } + }; + + using fst_mtx_type = + experimental::distributed::Matrix; + using snd_mtx_type = + experimental::distributed::Matrix; + // setup the fine op using Csr with current ValueType + // we do not use dispatcher run in the first place because we have + // the fallback option for that. + if (auto obj = std::dynamic_pointer_cast( + system_matrix_)) { + setup_fine_op(obj); + } else if (auto obj = std::dynamic_pointer_cast( + system_matrix_)) { + setup_fine_op(obj); + } else { + // handle other ValueTypes. + run(obj, + convert_fine_op); + } + + auto distributed_setup = [&](auto matrix) { + auto exec = gko::as(matrix)->get_executor(); + auto comm = + gko::as(matrix) + ->get_communicator(); + auto num_rank = comm.size(); + auto pgm_local_op = + gko::as(matrix->get_local_matrix()); + auto result = this->generate_local(pgm_local_op); + + auto non_local_csr = + as(matrix->get_non_local_matrix()); + auto non_local_size = non_local_csr->get_size()[1]; + array non_local_agg(exec, non_local_size); + // get agg information (prolong_row_gather row idx) + communicate(matrix, agg_, non_local_agg); + // generate non_local_col_map + non_local_agg.set_executor(exec->get_master()); + array non_local_col_map(exec->get_master(), + non_local_size); + // add additional entry in tail such that the offset easily + // handle it. + array renumber(exec->get_master(), + non_local_size + 1); + auto recv_offsets = matrix->recv_offsets_; + generate_non_local_map(recv_offsets, non_local_agg, + non_local_col_map, renumber); + + // get new recv_size and recv_offsets + std::vector + new_recv_size(num_rank); + std::vector + new_recv_offsets(num_rank + 1); + array new_recv_gather_idxs(exec->get_master()); + compute_communication(recv_offsets, non_local_agg, renumber, + new_recv_size, new_recv_offsets, + new_recv_gather_idxs); + + non_local_col_map.set_executor(exec); + IndexType non_local_num_agg = new_recv_gather_idxs.get_size(); + // build csr from row and col map + // unlike non-distributed version, generate_coarse uses + // different row and col maps. + auto result_non_local_csr = generate_coarse( + exec, non_local_csr.get(), + static_cast(std::get<1>(result)->get_size()[0]), + agg_, non_local_num_agg, non_local_col_map); + // use local and non-local to build coarse matrix + // also restriction and prolongation (Local-only-global matrix) + auto coarse_size = + static_cast(std::get<1>(result)->get_size()[0]); + comm.all_reduce(exec->get_master(), &coarse_size, 1, MPI_SUM); + new_recv_gather_idxs.set_executor(exec); + + // setup the generated linop. using global_index_type = typename std::decay_t::global_index_type; - convert_fine_op( - as>>(matrix)); - } - }; - - using fst_mtx_type = - experimental::distributed::Matrix; - using snd_mtx_type = - experimental::distributed::Matrix; - // setup the fine op using Csr with current ValueType - // we do not use dispatcher run in the first place because we have the - // fallback option for that. - if (auto obj = - std::dynamic_pointer_cast(system_matrix_)) { - setup_fine_op(obj); - } else if (auto obj = std::dynamic_pointer_cast( - system_matrix_)) { - setup_fine_op(obj); - } else { - // handle other ValueTypes. - run(obj, - convert_fine_op); + auto coarse = share( + experimental::distributed:: + Matrix::create( + exec, comm, gko::dim<2>(coarse_size, coarse_size), + std::get<1>(result), result_non_local_csr, + new_recv_size, new_recv_offsets, + new_recv_gather_idxs)); + auto restrict_op = share( + experimental::distributed:: + Matrix::create( + exec, comm, + dim<2>(coarse_size, + gko::as(matrix)->get_size()[0]), + std::get<2>(result))); + auto prolong_op = share( + experimental::distributed:: + Matrix::create( + exec, comm, + dim<2>(gko::as(matrix)->get_size()[0], + coarse_size), + std::get<0>(result))); + this->set_multigrid_level(prolong_op, coarse, restrict_op); + }; + + // the fine op is using csr with the current ValueType + run(this->get_fine_op(), + distributed_setup); } - - auto distributed_setup = [&](auto matrix) { - auto exec = gko::as(matrix)->get_executor(); - auto comm = - gko::as(matrix) - ->get_communicator(); - auto num_rank = comm.size(); - auto pgm_local_op = - gko::as(matrix->get_local_matrix()); - auto result = this->generate_local(pgm_local_op); - - auto non_local_csr = - as(matrix->get_non_local_matrix()); - auto non_local_size = non_local_csr->get_size()[1]; - array non_local_agg(exec, non_local_size); - // get agg information (prolong_row_gather row idx) - communicate(matrix, agg_, non_local_agg); - // generate non_local_col_map - non_local_agg.set_executor(exec->get_master()); - array non_local_col_map(exec->get_master(), - non_local_size); - // add additional entry in tail such that the offset easily handle - // it. - array renumber(exec->get_master(), non_local_size + 1); - auto recv_offsets = matrix->recv_offsets_; - generate_non_local_map(recv_offsets, non_local_agg, - non_local_col_map, renumber); - - // get new recv_size and recv_offsets - std::vector - new_recv_size(num_rank); - std::vector - new_recv_offsets(num_rank + 1); - array new_recv_gather_idxs(exec->get_master()); - compute_communication(recv_offsets, non_local_agg, renumber, - new_recv_size, new_recv_offsets, - new_recv_gather_idxs); - - non_local_col_map.set_executor(exec); - IndexType non_local_num_agg = new_recv_gather_idxs.get_size(); - // build csr from row and col map - // unlike non-distributed version, generate_coarse uses different - // row and col maps. - auto result_non_local_csr = generate_coarse( - exec, non_local_csr.get(), - static_cast(std::get<1>(result)->get_size()[0]), - agg_, non_local_num_agg, non_local_col_map); - // use local and non-local to build coarse matrix - // also restriction and prolongation (Local-only-global matrix) - auto coarse_size = - static_cast(std::get<1>(result)->get_size()[0]); - comm.all_reduce(exec->get_master(), &coarse_size, 1, MPI_SUM); - new_recv_gather_idxs.set_executor(exec); - - // setup the generated linop. - using global_index_type = - typename std::decay_t::global_index_type; - auto coarse = share( - experimental::distributed:: - Matrix::create( - exec, comm, gko::dim<2>(coarse_size, coarse_size), - std::get<1>(result), result_non_local_csr, - new_recv_size, new_recv_offsets, new_recv_gather_idxs)); - auto restrict_op = share( - experimental::distributed:: - Matrix::create( - exec, comm, - dim<2>(coarse_size, - gko::as(matrix)->get_size()[0]), - std::get<2>(result))); - auto prolong_op = share( - experimental::distributed:: - Matrix::create( - exec, comm, - dim<2>(gko::as(matrix)->get_size()[0], - coarse_size), - std::get<0>(result))); - this->set_multigrid_level(prolong_op, coarse, restrict_op); - }; - - // the fine op is using csr with the current ValueType - run(this->get_fine_op(), distributed_setup); } else #endif // GINKGO_BUILD_MPI { diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index f67c5413ff7..e6b7fac625e 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -101,11 +101,16 @@ void handle_list( auto exec = matrix->get_executor(); #if GINKGO_BUILD_MPI if (gko::detail::is_distributed(matrix.get())) { - using experimental::distributed::Matrix; - return run, - Matrix, - Matrix>( - matrix, [exec, iteration, relaxation_factor](auto matrix) { + if constexpr (std::is_same_v, half>) { + GKO_NOT_SUPPORTED(matrix); + } else { + using experimental::distributed::Matrix; + return run, + Matrix, + Matrix>(matrix, [exec, iteration, + relaxation_factor]( + auto matrix) { using Mtx = typename decltype(matrix)::element_type; return share( build_smoother( @@ -119,6 +124,7 @@ void handle_list( iteration, casting(relaxation_factor)) ->generate(matrix)); }); + } } #endif return share(build_smoother(preconditioner::Jacobi::build() @@ -329,30 +335,37 @@ void MultigridState::generate(const LinOp* system_matrix_in, if (gko::detail::is_distributed(system_matrix_in)) { using value_type = typename std::decay_t::value_type; - using VectorType = - experimental::distributed::Vector; - auto fine = mg_level->get_fine_op().get(); - auto coarse = mg_level->get_coarse_op().get(); - auto distributed_fine = dynamic_cast< - const experimental::distributed::DistributedBase*>( - fine); - auto distributed_coarse = dynamic_cast< - const experimental::distributed::DistributedBase*>( - coarse); - auto current_comm = distributed_fine->get_communicator(); - auto next_comm = distributed_coarse->get_communicator(); - auto current_local_nrows = - ::gko::detail::run_matrix(fine, [](auto* fine_mat) { - return fine_mat->get_local_matrix()->get_size()[0]; - }); - auto next_local_nrows = - ::gko::detail::run_matrix(coarse, [](auto* coarse_mat) { - return coarse_mat->get_non_local_matrix() - ->get_size()[0]; - }); - this->allocate_memory( - i, cycle, current_comm, next_comm, current_nrows, - next_nrows, current_local_nrows, next_local_nrows); + if constexpr (std::is_same_v, + half>) { + GKO_NOT_SUPPORTED(system_matrix_in); + } else { + using VectorType = + experimental::distributed::Vector; + auto fine = mg_level->get_fine_op().get(); + auto coarse = mg_level->get_coarse_op().get(); + auto distributed_fine = dynamic_cast< + const experimental::distributed::DistributedBase*>( + fine); + auto distributed_coarse = dynamic_cast< + const experimental::distributed::DistributedBase*>( + coarse); + auto current_comm = + distributed_fine->get_communicator(); + auto next_comm = distributed_coarse->get_communicator(); + auto current_local_nrows = + ::gko::detail::run_matrix(fine, [](auto* fine_mat) { + return fine_mat->get_local_matrix() + ->get_size()[0]; + }); + auto next_local_nrows = ::gko::detail::run_matrix( + coarse, [](auto* coarse_mat) { + return coarse_mat->get_non_local_matrix() + ->get_size()[0]; + }); + this->allocate_memory( + i, cycle, current_comm, next_comm, current_nrows, + next_nrows, current_local_nrows, next_local_nrows); + } } else #endif { @@ -445,6 +458,32 @@ void MultigridState::allocate_memory( initialize({-one()}, exec)); } +#if GINKGO_ENABLE_HALF +template <> +void MultigridState::allocate_memory< + gko::experimental::distributed::Vector>( + int level, multigrid::cycle cycle, + const experimental::mpi::communicator& current_comm, + const experimental::mpi::communicator& next_comm, size_type current_nrows, + size_type next_nrows, size_type current_local_nrows, + size_type next_local_nrows) +{ + GKO_NOT_SUPPORTED(nullptr); +} + +template <> +void MultigridState::allocate_memory< + gko::experimental::distributed::Vector>>( + int level, multigrid::cycle cycle, + const experimental::mpi::communicator& current_comm, + const experimental::mpi::communicator& next_comm, size_type current_nrows, + size_type next_nrows, size_type current_local_nrows, + size_type next_local_nrows) +{ + GKO_NOT_SUPPORTED(nullptr); +} +#endif + #endif @@ -593,6 +632,27 @@ void MultigridState::run_cycle(multigrid::cycle cycle, size_type level, } } +template <> +void MultigridState::run_cycle< + gko::experimental::distributed::Vector>( + multigrid::cycle cycle, size_type level, + const std::shared_ptr& matrix, const LinOp* b, LinOp* x, + cycle_mode mode) +{ + GKO_NOT_SUPPORTED(nullptr); +} + +template <> +void MultigridState::run_cycle< + gko::experimental::distributed::Vector>>( + multigrid::cycle cycle, size_type level, + const std::shared_ptr& matrix, const LinOp* b, LinOp* x, + cycle_mode mode) +{ + GKO_NOT_SUPPORTED(nullptr); +} + + } // namespace detail } // namespace multigrid @@ -769,35 +829,41 @@ void Multigrid::generate() if (gko::detail::is_distributed(matrix.get())) { using absolute_value_type = remove_complex; using experimental::distributed::Matrix; - return run, - Matrix, - Matrix>(matrix, [exec](auto matrix) { - using Mtx = typename decltype(matrix)::element_type; - return solver::Gmres::build() - .with_criteria( - stop::Iteration::build().with_max_iters( - matrix->get_size()[0]), - stop::ResidualNorm::build() - .with_reduction_factor( - std::numeric_limits< - absolute_value_type>::epsilon() * - absolute_value_type{10})) - .with_krylov_dim( - std::min(size_type(100), matrix->get_size()[0])) - .with_preconditioner( - experimental::distributed::preconditioner:: - Schwarz:: - build() + if constexpr (std::is_same_v) { + GKO_NOT_SUPPORTED(matrix); + } else { + return run, + Matrix, + Matrix>(matrix, [exec](auto matrix) { + using Mtx = typename decltype(matrix)::element_type; + return solver::Gmres::build() + .with_criteria( + stop::Iteration::build().with_max_iters( + matrix->get_size()[0]), + stop::ResidualNorm::build() + .with_reduction_factor( + std::numeric_limits< + absolute_value_type>:: + epsilon() * + absolute_value_type{10})) + .with_krylov_dim(std::min( + size_type(100), matrix->get_size()[0])) + .with_preconditioner( + experimental::distributed::preconditioner:: + Schwarz::build() .with_local_solver( preconditioner::Jacobi< value_type>::build() .with_max_block_size(1u))) - .on(exec) - ->generate(matrix); - }); + .on(exec) + ->generate(matrix); + }); + } } #endif if (dynamic_cast(exec.get())) { diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index ad31a6b19e8..4adc02763f0 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -382,7 +382,11 @@ make_temporary_conversion(const LinOp* matrix) template void precision_dispatch(Function fn, Args*... linops) { - fn(distributed::make_temporary_conversion(linops).get()...); + if constexpr (std::is_same_v, half>) { + GKO_NOT_SUPPORTED(nullptr); + } else { + fn(distributed::make_temporary_conversion(linops).get()...); + } } @@ -398,23 +402,29 @@ void precision_dispatch(Function fn, Args*... linops) template void precision_dispatch_real_complex(Function fn, const LinOp* in, LinOp* out) { - auto complex_to_real = !( - is_complex() || - dynamic_cast>*>( - in)); - if (complex_to_real) { - auto dense_in = - distributed::make_temporary_conversion>(in); - auto dense_out = - distributed::make_temporary_conversion>(out); - using Vector = experimental::distributed::Vector; - // These dynamic_casts are only needed to make the code compile - // If ValueType is complex, this branch will never be taken - // If ValueType is real, the cast is a no-op - fn(dynamic_cast(dense_in->create_real_view().get()), - dynamic_cast(dense_out->create_real_view().get())); + if constexpr (std::is_same_v, half>) { + GKO_NOT_SUPPORTED(nullptr); } else { - distributed::precision_dispatch(fn, in, out); + auto complex_to_real = !( + is_complex() || + dynamic_cast< + const ConvertibleTo>*>(in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>( + in); + auto dense_out = + distributed::make_temporary_conversion>( + out); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dynamic_cast(dense_in->create_real_view().get()), + dynamic_cast(dense_out->create_real_view().get())); + } else { + distributed::precision_dispatch(fn, in, out); + } } } @@ -426,27 +436,33 @@ template void precision_dispatch_real_complex(Function fn, const LinOp* alpha, const LinOp* in, LinOp* out) { - auto complex_to_real = !( - is_complex() || - dynamic_cast>*>( - in)); - if (complex_to_real) { - auto dense_in = - distributed::make_temporary_conversion>(in); - auto dense_out = - distributed::make_temporary_conversion>(out); - auto dense_alpha = gko::make_temporary_conversion(alpha); - using Vector = experimental::distributed::Vector; - // These dynamic_casts are only needed to make the code compile - // If ValueType is complex, this branch will never be taken - // If ValueType is real, the cast is a no-op - fn(dense_alpha.get(), - dynamic_cast(dense_in->create_real_view().get()), - dynamic_cast(dense_out->create_real_view().get())); + if constexpr (std::is_same_v, half>) { + GKO_NOT_SUPPORTED(nullptr); } else { - fn(gko::make_temporary_conversion(alpha).get(), - distributed::make_temporary_conversion(in).get(), - distributed::make_temporary_conversion(out).get()); + auto complex_to_real = !( + is_complex() || + dynamic_cast< + const ConvertibleTo>*>(in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>( + in); + auto dense_out = + distributed::make_temporary_conversion>( + out); + auto dense_alpha = gko::make_temporary_conversion(alpha); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dense_alpha.get(), + dynamic_cast(dense_in->create_real_view().get()), + dynamic_cast(dense_out->create_real_view().get())); + } else { + fn(gko::make_temporary_conversion(alpha).get(), + distributed::make_temporary_conversion(in).get(), + distributed::make_temporary_conversion(out).get()); + } } } @@ -459,30 +475,36 @@ void precision_dispatch_real_complex(Function fn, const LinOp* alpha, const LinOp* in, const LinOp* beta, LinOp* out) { - auto complex_to_real = !( - is_complex() || - dynamic_cast>*>( - in)); - if (complex_to_real) { - auto dense_in = - distributed::make_temporary_conversion>(in); - auto dense_out = - distributed::make_temporary_conversion>(out); - auto dense_alpha = gko::make_temporary_conversion(alpha); - auto dense_beta = gko::make_temporary_conversion(beta); - using Vector = experimental::distributed::Vector; - // These dynamic_casts are only needed to make the code compile - // If ValueType is complex, this branch will never be taken - // If ValueType is real, the cast is a no-op - fn(dense_alpha.get(), - dynamic_cast(dense_in->create_real_view().get()), - dense_beta.get(), - dynamic_cast(dense_out->create_real_view().get())); + if constexpr (std::is_same_v, half>) { + GKO_NOT_SUPPORTED(nullptr); } else { - fn(gko::make_temporary_conversion(alpha).get(), - distributed::make_temporary_conversion(in).get(), - gko::make_temporary_conversion(beta).get(), - distributed::make_temporary_conversion(out).get()); + auto complex_to_real = !( + is_complex() || + dynamic_cast< + const ConvertibleTo>*>(in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>( + in); + auto dense_out = + distributed::make_temporary_conversion>( + out); + auto dense_alpha = gko::make_temporary_conversion(alpha); + auto dense_beta = gko::make_temporary_conversion(beta); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dense_alpha.get(), + dynamic_cast(dense_in->create_real_view().get()), + dense_beta.get(), + dynamic_cast(dense_out->create_real_view().get())); + } else { + fn(gko::make_temporary_conversion(alpha).get(), + distributed::make_temporary_conversion(in).get(), + gko::make_temporary_conversion(beta).get(), + distributed::make_temporary_conversion(out).get()); + } } } @@ -547,6 +569,7 @@ void precision_dispatch_real_complex_distributed(Function fn, if (dynamic_cast(in)) { experimental::distributed::precision_dispatch_real_complex( fn, alpha, in, beta, out); + } else { gko::precision_dispatch_real_complex(fn, alpha, in, beta, out);