From fe2890abb89e7e8995e360f8d54f8931a9d5c8ee Mon Sep 17 00:00:00 2001 From: Gerald Paul Bowen Collom Date: Tue, 27 Jun 2023 23:06:19 -0700 Subject: [PATCH] Use config flag for locality-aware mpi --- src/parcsr_ls/par_cycle.c | 11 + src/parcsr_mv/_hypre_parcsr_mv.h | 20 ++ src/parcsr_mv/new_commpkg.c | 58 ++++ src/parcsr_mv/par_csr_communication.c | 418 +++++++++++++++++++++++++- src/parcsr_mv/par_csr_communication.h | 16 + src/parcsr_mv/protos.h | 6 + src/test/ij.c | 16 + src/utilities/_hypre_utilities.h | 11 + src/utilities/general.c | 5 + src/utilities/handle.h | 13 + 10 files changed, 570 insertions(+), 4 deletions(-) diff --git a/src/parcsr_ls/par_cycle.c b/src/parcsr_ls/par_cycle.c index 8819de400d..2561428209 100644 --- a/src/parcsr_ls/par_cycle.c +++ b/src/parcsr_ls/par_cycle.c @@ -286,6 +286,17 @@ hypre_BoomerAMGCycle( void *amg_vdata, hypre_GpuProfilingPushRange(nvtx_name); while (Not_Finished) { +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (level >= hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle())) + { + //if (my_id == 0) { printf("LVL %d using node aware, th: %d\n", level, hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle())); } + hypre_HandleUsingNodeAwareMPI(hypre_handle()) = 1; + } else + { + //if (my_id == 0) { printf("LVL %d NOT using node aware\n", level); } + hypre_HandleUsingNodeAwareMPI(hypre_handle()) = 0; + } +#endif if (num_levels > 1) { local_size = hypre_VectorSize(hypre_ParVectorLocalVector(F_array[level])); diff --git a/src/parcsr_mv/_hypre_parcsr_mv.h b/src/parcsr_mv/_hypre_parcsr_mv.h index 56c61b3c75..a3d16c18d0 100644 --- a/src/parcsr_mv/_hypre_parcsr_mv.h +++ b/src/parcsr_mv/_hypre_parcsr_mv.h @@ -23,6 +23,10 @@ extern "C" { #ifndef HYPRE_PAR_CSR_COMMUNICATION_HEADER #define HYPRE_PAR_CSR_COMMUNICATION_HEADER +#ifdef HYPRE_USING_NODE_AWARE_MPI +#include "mpi_advance.h" +#endif + /*-------------------------------------------------------------------------- * hypre_ParCSRCommPkg: * Structure containing information for doing communications @@ -59,6 +63,9 @@ typedef struct void *recv_data_buffer; HYPRE_Int num_requests; hypre_MPI_Request *requests; +#ifdef HYPRE_USING_NODE_AWARE_MPI + MPIX_Request *Xrequest; +#endif } hypre_ParCSRCommHandle; typedef hypre_ParCSRCommHandle hypre_ParCSRPersistentCommHandle; @@ -66,6 +73,10 @@ typedef hypre_ParCSRCommHandle hypre_ParCSRPersistentCommHandle; typedef struct _hypre_ParCSRCommPkg { MPI_Comm comm; +#ifdef HYPRE_USING_NODE_AWARE_MPI + MPIX_Comm *neighbor_comm; + MPIX_Comm *neighborT_comm; +#endif HYPRE_Int num_components; HYPRE_Int num_sends; HYPRE_Int *send_procs; @@ -75,6 +86,11 @@ typedef struct _hypre_ParCSRCommPkg HYPRE_Int num_recvs; HYPRE_Int *recv_procs; HYPRE_Int *recv_vec_starts; + HYPRE_Int use_neighbor; +#ifdef HYPRE_USING_NODE_AWARE_MPI + long *global_send_indices; + long *global_recv_indices; +#endif /* remote communication information */ hypre_MPI_Datatype *send_mpi_types; hypre_MPI_Datatype *recv_mpi_types; @@ -787,6 +803,10 @@ HYPRE_Int hypre_RangeFillResponseIJDetermineRecvProcs ( void *p_recv_contact_buf HYPRE_Int hypre_FillResponseIJDetermineSendProcs ( void *p_recv_contact_buf, HYPRE_Int contact_size, HYPRE_Int contact_proc, void *ro, MPI_Comm comm, void **p_send_response_buf, HYPRE_Int *response_message_size ); +void hypre_ParCSRCreateCommGraph( HYPRE_BigInt first_col_diag, + HYPRE_BigInt *col_map_offd, + MPI_Comm comm, + hypre_ParCSRCommPkg *comm_pkg ); /* numbers.c */ hypre_NumbersNode *hypre_NumbersNewNode ( void ); diff --git a/src/parcsr_mv/new_commpkg.c b/src/parcsr_mv/new_commpkg.c index b12c2112ec..32a979c2f0 100644 --- a/src/parcsr_mv/new_commpkg.c +++ b/src/parcsr_mv/new_commpkg.c @@ -11,6 +11,9 @@ *-----------------------------------------------------*/ #include "_hypre_parcsr_mv.h" +#ifdef HYPRE_USING_NODE_AWARE_MPI +#include +#endif /* some debugging tools*/ #define mydebug 0 @@ -750,3 +753,58 @@ hypre_FillResponseIJDetermineSendProcs(void *p_recv_contact_buf, return hypre_error_flag; } + +/*-------------------------------------------------------------------- + * hypre_ParCSRCreateCommGraph + * + * Create communication topology graph for MPI neighborhood + * collectives + *--------------------------------------------------------------------*/ + +#ifdef HYPRE_USING_NODE_AWARE_MPI +void +hypre_ParCSRCreateCommGraph(HYPRE_BigInt first_col_diag, + HYPRE_BigInt *col_map_offd, + MPI_Comm comm, + hypre_ParCSRCommPkg *comm_pkg) { + HYPRE_Int num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg); + HYPRE_Int num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg); + HYPRE_Int *send_map_starts = hypre_ParCSRCommPkgSendMapStarts(comm_pkg); + HYPRE_Int *recv_vec_starts = hypre_ParCSRCommPkgRecvVecStarts(comm_pkg); + HYPRE_Int *send_map_elmts = hypre_ParCSRCommPkgSendMapElmts(comm_pkg); + + int *sendcounts = (int *)malloc(num_sends * sizeof(int)); + int *recvcounts = (int *)malloc(num_recvs * sizeof(int)); + for (int i = 0; i < num_sends; i++) { + sendcounts[i] = send_map_starts[i+1] - send_map_starts[i]; + } + for (int i = 0; i < num_recvs; i++) { + recvcounts[i] = recv_vec_starts[i+1] - recv_vec_starts[i]; + } + MPIX_Dist_graph_create_adjacent( comm, num_recvs, hypre_ParCSRCommPkgRecvProcs(comm_pkg), + recvcounts, + num_sends, hypre_ParCSRCommPkgSendProcs(comm_pkg), + sendcounts, + MPI_INFO_NULL, 0, &(comm_pkg->neighbor_comm)); + MPIX_Dist_graph_create_adjacent( comm, num_sends, hypre_ParCSRCommPkgSendProcs(comm_pkg), + sendcounts, + num_recvs, hypre_ParCSRCommPkgRecvProcs(comm_pkg), + recvcounts, + MPI_INFO_NULL, 0, &(comm_pkg->neighborT_comm)); + + HYPRE_Int num_send_elmts = send_map_starts[num_sends]; + comm_pkg->global_send_indices = hypre_CTAlloc(long, num_send_elmts, HYPRE_MEMORY_HOST); + for (int i = 0; i < num_sends; i++) { + for (int j = send_map_starts[i]; j < send_map_starts[i+1]; j++) { + comm_pkg->global_send_indices[j] = send_map_elmts[j] + first_col_diag; + } + } + HYPRE_Int num_recv_elmts = recv_vec_starts[num_recvs]; + comm_pkg->global_recv_indices = hypre_CTAlloc(long, num_recv_elmts, HYPRE_MEMORY_HOST); + for (int i = 0; i < num_recvs; i++) { + for (int j = recv_vec_starts[i]; j < recv_vec_starts[i+1]; j++) { + comm_pkg->global_recv_indices[j] = col_map_offd[j]; + } + } +} +#endif diff --git a/src/parcsr_mv/par_csr_communication.c b/src/parcsr_mv/par_csr_communication.c index 70a355f1ba..a8ff9153bf 100644 --- a/src/parcsr_mv/par_csr_communication.c +++ b/src/parcsr_mv/par_csr_communication.c @@ -7,6 +7,10 @@ #include "_hypre_parcsr_mv.h" +#ifdef HYPRE_USING_NODE_AWARE_MPI +#include "mpi_advance.h" +#endif + /*==========================================================================*/ #ifdef HYPRE_USING_PERSISTENT_COMM @@ -60,11 +64,46 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_Int num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg); MPI_Comm comm = hypre_ParCSRCommPkgComm(comm_pkg); + +#ifdef HYPRE_USING_NODE_AWARE_MPI + HYPRE_Int num_requests; + hypre_MPI_Request *requests; + + HYPRE_Int *send_sizes; + HYPRE_Int *recv_sizes; + HYPRE_BigInt num_send_elmts; + HYPRE_BigInt num_recv_elmts; + MPIX_Request *Xrequest; + + HYPRE_Int node_aware_on = hypre_HandleUsingNodeAwareMPI(hypre_handle()); + if (comm_pkg->use_neighbor && node_aware_on) { + if (comm_pkg->neighbor_comm == NULL) { + hypre_printf("Trying to communicate with a NULL communicator\n"); + hypre_assert(1 == 0); + } + + send_sizes = hypre_TAlloc(HYPRE_Int, num_sends, HYPRE_MEMORY_HOST); + recv_sizes = hypre_TAlloc(HYPRE_Int, num_recvs, HYPRE_MEMORY_HOST); + + num_send_elmts = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends); + num_recv_elmts = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs); + } else { + num_requests = num_sends + num_recvs; + requests = hypre_CTAlloc(hypre_MPI_Request, num_requests, HYPRE_MEMORY_HOST); + + hypre_ParCSRCommHandleNumRequests(comm_handle) = num_requests; + hypre_ParCSRCommHandleRequests(comm_handle) = requests; + } +#else HYPRE_Int num_requests = num_sends + num_recvs; hypre_MPI_Request *requests = hypre_CTAlloc(hypre_MPI_Request, num_requests, HYPRE_MEMORY_HOST); hypre_ParCSRCommHandleNumRequests(comm_handle) = num_requests; hypre_ParCSRCommHandleRequests(comm_handle) = requests; +#endif + + //int rank; + //hypre_MPI_Comm_rank(comm_pkg->comm, &rank); void *send_buff = NULL, *recv_buff = NULL; @@ -77,6 +116,48 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_MEMORY_HOST); recv_buff = hypre_TAlloc(HYPRE_Complex, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs), HYPRE_MEMORY_HOST); +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (!comm_pkg->use_neighbor || !node_aware_on) { + for (i = 0; i < num_recvs; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Recv_init( (HYPRE_Complex *)recv_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX, + ip, 0, comm, requests + i ); + } + for (i = 0; i < num_sends; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Send_init( (HYPRE_Complex *)send_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX, + ip, 0, comm, requests + num_recvs + i ); + } + //if (rank == 0) { hypre_printf("Standard init done\n"); } + } else { + for (i = 0; i < num_recvs; ++i) + { + recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i)); + } + for (i = 0; i < num_sends; ++i) + { + send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgSendMapStart(comm_pkg, i)); + } + MPIX_Neighbor_locality_alltoallv_init( (HYPRE_Complex *)send_buff, send_sizes, + hypre_ParCSRCommPkgSendMapStarts(comm_pkg), + comm_pkg->global_send_indices, + HYPRE_MPI_COMPLEX, + (HYPRE_Complex *)recv_buff, recv_sizes, + hypre_ParCSRCommPkgRecvVecStarts(comm_pkg), + comm_pkg->global_recv_indices, + HYPRE_MPI_COMPLEX, comm_pkg->neighbor_comm, + MPI_INFO_NULL, &Xrequest); + //if (rank == 0) { hypre_printf("Node-aware init done\n"); } + } +#else for (i = 0; i < num_recvs; ++i) { HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); @@ -93,6 +174,7 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm hypre_MPI_Send_init( (HYPRE_Complex *)send_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX, ip, 0, comm, requests + num_recvs + i ); } +#endif break; case HYPRE_COMM_PKG_JOB_COMPLEX_TRANSPOSE: @@ -102,6 +184,45 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_MEMORY_HOST); send_buff = hypre_TAlloc(HYPRE_Complex, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs), HYPRE_MEMORY_HOST); +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (!comm_pkg->use_neighbor || !node_aware_on) { + for (i = 0; i < num_sends; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Recv_init( (HYPRE_Complex *)recv_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX, + ip, 0, comm, requests + i ); + } + for (i = 0; i < num_recvs; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Send_init( (HYPRE_Complex *)send_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX, + ip, 0, comm, requests + num_sends + i ); + } + } else { + for (i = 0; i < num_recvs; ++i) + { + recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i)); + } + for (i = 0; i < num_sends; ++i) + { + send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgSendMapStart(comm_pkg, i)); + } + MPIX_Neighbor_part_locality_alltoallv_init( (HYPRE_Complex *)send_buff, recv_sizes, + hypre_ParCSRCommPkgRecvVecStarts(comm_pkg), + HYPRE_MPI_COMPLEX, + (HYPRE_Complex *)recv_buff, send_sizes, + hypre_ParCSRCommPkgSendMapStarts(comm_pkg), + HYPRE_MPI_COMPLEX, comm_pkg->neighborT_comm, + MPI_INFO_NULL, &Xrequest); + //if (rank == 0) { hypre_printf("Node-aware transpose init done\n"); } + } +#else for (i = 0; i < num_sends; ++i) { HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); @@ -118,6 +239,7 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm hypre_MPI_Send_init( (HYPRE_Complex *)send_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX, ip, 0, comm, requests + num_sends + i ); } +#endif break; case HYPRE_COMM_PKG_JOB_INT: @@ -127,6 +249,46 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_MEMORY_HOST); recv_buff = hypre_TAlloc(HYPRE_Int, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs), HYPRE_MEMORY_HOST); +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (!comm_pkg->use_neighbor || !node_aware_on) { + for (i = 0; i < num_recvs; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Recv_init( (HYPRE_Int *)recv_buff + vec_start, vec_len, HYPRE_MPI_INT, + ip, 0, comm, requests + i ); + } + for (i = 0; i < num_sends; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Send_init( (HYPRE_Int *)send_buff + vec_start, vec_len, HYPRE_MPI_INT, + ip, 0, comm, requests + num_recvs + i ); + } + } else { + for (i = 0; i < num_recvs; ++i) + { + recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i)); + } + for (i = 0; i < num_sends; ++i) + { + send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgSendMapStart(comm_pkg, i)); + } + MPIX_Neighbor_locality_alltoallv_init( (HYPRE_Int *)send_buff, send_sizes, + hypre_ParCSRCommPkgSendMapStarts(comm_pkg), + comm_pkg->global_send_indices, + HYPRE_MPI_INT, + (HYPRE_Int *)recv_buff, recv_sizes, + hypre_ParCSRCommPkgRecvVecStarts(comm_pkg), + comm_pkg->global_recv_indices, + HYPRE_MPI_INT, comm_pkg->neighbor_comm, + 0, &Xrequest); + } +#else for (i = 0; i < num_recvs; ++i) { HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); @@ -143,6 +305,7 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm hypre_MPI_Send_init( (HYPRE_Int *)send_buff + vec_start, vec_len, HYPRE_MPI_INT, ip, 0, comm, requests + num_recvs + i ); } +#endif break; case HYPRE_COMM_PKG_JOB_INT_TRANSPOSE: @@ -152,6 +315,44 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_MEMORY_HOST); send_buff = hypre_TAlloc(HYPRE_Int, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs), HYPRE_MEMORY_HOST); +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (!comm_pkg->use_neighbor || !node_aware_on) { + for (i = 0; i < num_sends; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Recv_init( (HYPRE_Int *)recv_buff + vec_start, vec_len, HYPRE_MPI_INT, + ip, 0, comm, requests + i ); + } + for (i = 0; i < num_recvs; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Send_init( (HYPRE_Int *)send_buff + vec_start, vec_len, HYPRE_MPI_INT, + ip, 0, comm, requests + num_sends + i ); + } + } else { + for (i = 0; i < num_recvs; ++i) + { + recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i)); + } + for (i = 0; i < num_sends; ++i) + { + send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgSendMapStart(comm_pkg, i)); + } + MPIX_Neighbor_part_locality_alltoallv_init( (HYPRE_Int *)send_buff, recv_sizes, + hypre_ParCSRCommPkgRecvVecStarts(comm_pkg), + HYPRE_MPI_INT, + (HYPRE_Int *)recv_buff, send_sizes, + hypre_ParCSRCommPkgSendMapStarts(comm_pkg), + HYPRE_MPI_INT, comm_pkg->neighborT_comm, + 0, &Xrequest); + } +#else for (i = 0; i < num_sends; ++i) { HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); @@ -168,6 +369,7 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm hypre_MPI_Send_init( (HYPRE_Int *)send_buff + vec_start, vec_len, HYPRE_MPI_INT, ip, 0, comm, requests + num_sends + i ); } +#endif break; case HYPRE_COMM_PKG_JOB_BIGINT: @@ -177,6 +379,48 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_MEMORY_HOST); recv_buff = hypre_TAlloc(HYPRE_BigInt, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs), HYPRE_MEMORY_HOST); +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (!comm_pkg->use_neighbor || !node_aware_on) { + for (i = 0; i < num_recvs; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Recv_init( (HYPRE_BigInt *)recv_buff + (HYPRE_BigInt)vec_start, vec_len, + HYPRE_MPI_BIG_INT, + ip, 0, comm, requests + i ); + } + for (i = 0; i < num_sends; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Send_init( (HYPRE_BigInt *)send_buff + (HYPRE_BigInt)vec_start, vec_len, + HYPRE_MPI_BIG_INT, + ip, 0, comm, requests + num_recvs + i); + } + } else { + for (i = 0; i < num_recvs; ++i) + { + recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i)); + } + for (i = 0; i < num_sends; ++i) + { + send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgSendMapStart(comm_pkg, i)); + } + MPIX_Neighbor_locality_alltoallv_init( (HYPRE_BigInt *)send_buff, send_sizes, + hypre_ParCSRCommPkgSendMapStarts(comm_pkg), + comm_pkg->global_send_indices, + HYPRE_MPI_BIG_INT, + (HYPRE_BigInt *)recv_buff, recv_sizes, + hypre_ParCSRCommPkgRecvVecStarts(comm_pkg), + comm_pkg->global_recv_indices, + HYPRE_MPI_BIG_INT, comm_pkg->neighbor_comm, + 0, &Xrequest); + } +#else for (i = 0; i < num_recvs; ++i) { HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); @@ -195,6 +439,7 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_MPI_BIG_INT, ip, 0, comm, requests + num_recvs + i); } +#endif break; case HYPRE_COMM_PKG_JOB_BIGINT_TRANSPOSE: @@ -204,6 +449,46 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_MEMORY_HOST); send_buff = hypre_TAlloc(HYPRE_BigInt, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs), HYPRE_MEMORY_HOST); +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (!comm_pkg->use_neighbor || !node_aware_on) { + for (i = 0; i < num_sends; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Recv_init( (HYPRE_BigInt *)recv_buff + (HYPRE_BigInt)vec_start, vec_len, + HYPRE_MPI_BIG_INT, + ip, 0, comm, requests + i ); + } + for (i = 0; i < num_recvs; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Send_init( (HYPRE_BigInt *)send_buff + (HYPRE_BigInt)vec_start, vec_len, + HYPRE_MPI_BIG_INT, + ip, 0, comm, requests + num_sends + i); + } + } else { + for (i = 0; i < num_recvs; ++i) + { + recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i)); + } + for (i = 0; i < num_sends; ++i) + { + send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgSendMapStart(comm_pkg, i)); + } + MPIX_Neighbor_part_locality_alltoallv_init( (HYPRE_BigInt *)send_buff, recv_sizes, + hypre_ParCSRCommPkgRecvVecStarts(comm_pkg), + HYPRE_MPI_BIG_INT, + (HYPRE_BigInt *)recv_buff, send_sizes, + hypre_ParCSRCommPkgSendMapStarts(comm_pkg), + HYPRE_MPI_BIG_INT, comm_pkg->neighborT_comm, + 0, &Xrequest); + } +#else for (i = 0; i < num_sends; ++i) { HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); @@ -218,22 +503,37 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i); HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start; - hypre_MPI_Send_init( (HYPRE_BigInt *)send_buff + (HYPRE_BigInt)vec_start, vec_len, HYPRE_MPI_BIG_INT, ip, 0, comm, requests + num_sends + i); } +#endif break; default: hypre_assert(1 == 0); break; } // switch (job_type) + hypre_ParCSRCommHandleCommPkg(comm_handle) = comm_pkg; hypre_ParCSRCommHandleRecvDataBuffer(comm_handle) = recv_buff; hypre_ParCSRCommHandleSendDataBuffer(comm_handle) = send_buff; hypre_ParCSRCommHandleNumSendBytes(comm_handle) = num_bytes_send; hypre_ParCSRCommHandleNumRecvBytes(comm_handle) = num_bytes_recv; + +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (comm_pkg->use_neighbor && node_aware_on) { + comm_handle->Xrequest = Xrequest; + + hypre_TFree(send_sizes, HYPRE_MEMORY_HOST); + hypre_TFree(recv_sizes, HYPRE_MEMORY_HOST); + } + + //int rank; + //hypre_MPI_Comm_rank(comm_handle->comm_pkg->comm, &rank); + //if (rank == 0) { hypre_printf("Comm Pkg Created\n"); } +#endif + return ( comm_handle ); } @@ -262,12 +562,21 @@ hypre_ParCSRCommPkgGetPersistentCommHandle( HYPRE_Int job, hypre_ParCSRCommPkg * void hypre_ParCSRPersistentCommHandleDestroy( hypre_ParCSRPersistentCommHandle *comm_handle ) { + //int rank; + //hypre_MPI_Comm_rank(comm_handle->comm_pkg->comm, &rank); + //if (rank == 0) { hypre_printf("Destroying\n"); } if (comm_handle) { hypre_TFree(hypre_ParCSRCommHandleSendDataBuffer(comm_handle), HYPRE_MEMORY_HOST); hypre_TFree(hypre_ParCSRCommHandleRecvDataBuffer(comm_handle), HYPRE_MEMORY_HOST); - hypre_TFree(comm_handle->requests, HYPRE_MEMORY_HOST); - + if (comm_handle->requests) { + hypre_TFree(comm_handle->requests, HYPRE_MEMORY_HOST); + } +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (comm_handle->Xrequest && hypre_HandleUsingNodeAwareMPI(hypre_handle())) { + MPIX_Request_free(comm_handle->Xrequest); + } +#endif hypre_TFree(comm_handle, HYPRE_MEMORY_HOST); } } @@ -284,6 +593,51 @@ hypre_ParCSRPersistentCommHandleStart( hypre_ParCSRPersistentCommHandle *comm_ha hypre_ParCSRCommHandleSendData(comm_handle) = send_data; hypre_ParCSRCommHandleSendMemoryLocation(comm_handle) = send_memory_location; +#ifdef HYPRE_USING_NODE_AWARE_MPI + //int rank; + //hypre_MPI_Comm_rank(comm_handle->comm_pkg->comm, &rank); + //if (rank == 0) { hypre_printf("%d\n", comm_handle->comm_pkg->use_neighbor); } + if (!comm_handle->comm_pkg->use_neighbor || !hypre_HandleUsingNodeAwareMPI(hypre_handle())) { + //if (rank == 0) { hypre_printf("Standard starting\n"); } + if (hypre_ParCSRCommHandleNumRequests(comm_handle) > 0) + { + hypre_TMemcpy( hypre_ParCSRCommHandleSendDataBuffer(comm_handle), + send_data, + char, + hypre_ParCSRCommHandleNumSendBytes(comm_handle), + HYPRE_MEMORY_HOST, + send_memory_location ); + HYPRE_Int ret = hypre_MPI_Startall(hypre_ParCSRCommHandleNumRequests(comm_handle), + hypre_ParCSRCommHandleRequests(comm_handle)); + //if (rank == 0) { hypre_printf("Standard started\n"); } + if (hypre_MPI_SUCCESS != ret) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "MPI error\n"); + /*hypre_printf("MPI error %d in %s (%s, line %u)\n", ret, __FUNCTION__, __FILE__, __LINE__);*/ + } + } + } else { + //int rank; + //hypre_MPI_Comm_rank(comm_handle->comm_pkg->comm, &rank); + //if (rank == 0) { hypre_printf("Node-aware starting, %p\n", comm_handle->Xrequest); } + if (comm_handle->Xrequest) + { + hypre_TMemcpy( hypre_ParCSRCommHandleSendDataBuffer(comm_handle), + send_data, + char, + hypre_ParCSRCommHandleNumSendBytes(comm_handle), + HYPRE_MEMORY_HOST, + send_memory_location ); + HYPRE_Int ret = (HYPRE_Int) MPIX_Start(comm_handle->Xrequest); + //if (rank == 0) { hypre_printf("Node-aware started\n"); } + if (hypre_MPI_SUCCESS != ret) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "MPI error\n"); + /*hypre_printf("MPI error %d in %s (%s, line %u)\n", ret, __FUNCTION__, __FILE__, __LINE__);*/ + } + } + } +#else if (hypre_ParCSRCommHandleNumRequests(comm_handle) > 0) { hypre_TMemcpy( hypre_ParCSRCommHandleSendDataBuffer(comm_handle), @@ -292,7 +646,6 @@ hypre_ParCSRPersistentCommHandleStart( hypre_ParCSRPersistentCommHandle *comm_ha hypre_ParCSRCommHandleNumSendBytes(comm_handle), HYPRE_MEMORY_HOST, send_memory_location ); - HYPRE_Int ret = hypre_MPI_Startall(hypre_ParCSRCommHandleNumRequests(comm_handle), hypre_ParCSRCommHandleRequests(comm_handle)); if (hypre_MPI_SUCCESS != ret) @@ -301,6 +654,7 @@ hypre_ParCSRPersistentCommHandleStart( hypre_ParCSRPersistentCommHandle *comm_ha /*hypre_printf("MPI error %d in %s (%s, line %u)\n", ret, __FUNCTION__, __FILE__, __LINE__);*/ } } +#endif } /*------------------------------------------------------------------ @@ -315,6 +669,53 @@ hypre_ParCSRPersistentCommHandleWait( hypre_ParCSRPersistentCommHandle *comm_han hypre_ParCSRCommHandleRecvData(comm_handle) = recv_data; hypre_ParCSRCommHandleRecvMemoryLocation(comm_handle) = recv_memory_location; +#ifdef HYPRE_USING_NODE_AWARE_MPI + //int rank; + //hypre_MPI_Comm_rank(comm_handle->comm_pkg->comm, &rank); + if (!comm_handle->comm_pkg->use_neighbor || !hypre_HandleUsingNodeAwareMPI(hypre_handle())) { + //if (rank == 0) { hypre_printf("Standard waiting\n"); } + if (hypre_ParCSRCommHandleNumRequests(comm_handle) > 0) + { + HYPRE_Int ret = hypre_MPI_Waitall(hypre_ParCSRCommHandleNumRequests(comm_handle), + hypre_ParCSRCommHandleRequests(comm_handle), + hypre_MPI_STATUSES_IGNORE); + if (hypre_MPI_SUCCESS != ret) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "MPI error\n"); + /*hypre_printf("MPI error %d in %s (%s, line %u)\n", ret, __FUNCTION__, __FILE__, __LINE__);*/ + } + + hypre_TMemcpy(recv_data, + hypre_ParCSRCommHandleRecvDataBuffer(comm_handle), + char, + hypre_ParCSRCommHandleNumRecvBytes(comm_handle), + recv_memory_location, + HYPRE_MEMORY_HOST); + } + //if (rank == 0) { hypre_printf("Standard waited\n"); } + } else { + //if (rank == 0) { hypre_printf("Node-aware waiting, %p\n", comm_handle->Xrequest); } + if (comm_handle->Xrequest) + { + HYPRE_Int ret = (HYPRE_Int) MPIX_Wait(comm_handle->Xrequest, + MPI_STATUS_IGNORE); + + if (hypre_MPI_SUCCESS != ret) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "MPI error\n"); + /*hypre_printf("MPI error %d in %s (%s, line %u)\n", ret, __FUNCTION__, __FILE__, __LINE__);*/ + } + + hypre_TMemcpy(recv_data, + hypre_ParCSRCommHandleRecvDataBuffer(comm_handle), + char, + hypre_ParCSRCommHandleNumRecvBytes(comm_handle), + recv_memory_location, + HYPRE_MEMORY_HOST); + //if (rank == 0) { hypre_printf("Node-aware waited\n"); } + } + } +#else if (hypre_ParCSRCommHandleNumRequests(comm_handle) > 0) { HYPRE_Int ret = hypre_MPI_Waitall(hypre_ParCSRCommHandleNumRequests(comm_handle), @@ -333,6 +734,7 @@ hypre_ParCSRPersistentCommHandleWait( hypre_ParCSRPersistentCommHandle *comm_han recv_memory_location, HYPRE_MEMORY_HOST); } +#endif } #endif // HYPRE_USING_PERSISTENT_COMM @@ -1030,6 +1432,10 @@ hypre_ParCSRCommPkgCreateAndFill( MPI_Comm comm, } #endif +#ifdef HYPRE_USING_NODE_AWARE_MPI + comm_pkg->use_neighbor = 0; +#endif + /* Set input info */ hypre_ParCSRCommPkgComm(comm_pkg) = comm; hypre_ParCSRCommPkgNumRecvs(comm_pkg) = num_recvs; @@ -1194,6 +1600,10 @@ hypre_MatvecCommPkgCreate ( hypre_ParCSRMatrix *A ) num_cols_offd, global_num_cols, apart, comm_pkg ); +#ifdef HYPRE_USING_NODE_AWARE_MPI + comm_pkg->use_neighbor = 1; + hypre_ParCSRCreateCommGraph( first_col_diag, col_map_offd, comm, comm_pkg ); +#endif HYPRE_ANNOTATE_FUNC_END; diff --git a/src/parcsr_mv/par_csr_communication.h b/src/parcsr_mv/par_csr_communication.h index 13f5ea0719..4fa13ac618 100644 --- a/src/parcsr_mv/par_csr_communication.h +++ b/src/parcsr_mv/par_csr_communication.h @@ -8,6 +8,10 @@ #ifndef HYPRE_PAR_CSR_COMMUNICATION_HEADER #define HYPRE_PAR_CSR_COMMUNICATION_HEADER +#ifdef HYPRE_USING_NODE_AWARE_MPI +#include "mpi_advance.h" +#endif + /*-------------------------------------------------------------------------- * hypre_ParCSRCommPkg: * Structure containing information for doing communications @@ -44,6 +48,9 @@ typedef struct void *recv_data_buffer; HYPRE_Int num_requests; hypre_MPI_Request *requests; +#ifdef HYPRE_USING_NODE_AWARE_MPI + MPIX_Request *Xrequest; +#endif } hypre_ParCSRCommHandle; typedef hypre_ParCSRCommHandle hypre_ParCSRPersistentCommHandle; @@ -51,6 +58,10 @@ typedef hypre_ParCSRCommHandle hypre_ParCSRPersistentCommHandle; typedef struct _hypre_ParCSRCommPkg { MPI_Comm comm; +#ifdef HYPRE_USING_NODE_AWARE_MPI + MPIX_Comm *neighbor_comm; + MPIX_Comm *neighborT_comm; +#endif HYPRE_Int num_components; HYPRE_Int num_sends; HYPRE_Int *send_procs; @@ -60,6 +71,11 @@ typedef struct _hypre_ParCSRCommPkg HYPRE_Int num_recvs; HYPRE_Int *recv_procs; HYPRE_Int *recv_vec_starts; + HYPRE_Int use_neighbor; +#ifdef HYPRE_USING_NODE_AWARE_MPI + long *global_send_indices; + long *global_recv_indices; +#endif /* remote communication information */ hypre_MPI_Datatype *send_mpi_types; hypre_MPI_Datatype *recv_mpi_types; diff --git a/src/parcsr_mv/protos.h b/src/parcsr_mv/protos.h index e8705e0804..1bd2f95423 100644 --- a/src/parcsr_mv/protos.h +++ b/src/parcsr_mv/protos.h @@ -134,6 +134,12 @@ HYPRE_Int hypre_RangeFillResponseIJDetermineRecvProcs ( void *p_recv_contact_buf HYPRE_Int hypre_FillResponseIJDetermineSendProcs ( void *p_recv_contact_buf, HYPRE_Int contact_size, HYPRE_Int contact_proc, void *ro, MPI_Comm comm, void **p_send_response_buf, HYPRE_Int *response_message_size ); +#ifdef HYPRE_USING_NODE_AWARE_MPI +void hypre_ParCSRCreateCommGraph( HYPRE_BigInt first_col_diag, + HYPRE_BigInt *col_map_offd, + MPI_Comm comm, + hypre_ParCSRCommPkg *comm_pkg ); +#endif /* numbers.c */ hypre_NumbersNode *hypre_NumbersNewNode ( void ); diff --git a/src/test/ij.c b/src/test/ij.c index b77e08e799..4480868af3 100644 --- a/src/test/ij.c +++ b/src/test/ij.c @@ -1561,6 +1561,19 @@ main( hypre_int argc, arg_index++; snprintf(mem_tracker_name, HYPRE_MAX_FILE_NAME_LEN, "%s", argv[arg_index++]); } +#endif +#if defined(HYPRE_USING_NODE_AWARE_MPI) + else if ( strcmp(argv[arg_index], "-node_aware_lvl_threshold") == 0 ) + { + arg_index++; + hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle()) = atoi(argv[arg_index++]); + if (hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle()) > 0) { + hypre_HandleUsingNodeAwareMPI(hypre_handle()) = 0; + } + //if (myid == 0) { + // printf("Set LVL Thresh: %d\n", hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle())); + //} + } #endif else { @@ -2588,6 +2601,9 @@ main( hypre_int argc, hypre_printf(" -umpire_pinned_pool_size : pinned memory pool size (GiB)\n"); hypre_printf(" -umpire_host_pool_size : host memory pool size (GiB)\n"); /* end umpire options */ +#endif +#if defined(HYPRE_USING_NODE_AWARE_MPI) + hypre_printf(" -node_aware_lvl_threshold : Min. level in AMG hierarchy to use node-aware MPI\n"); #endif } diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index a7cd43133e..2898d5bd82 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -1594,6 +1594,14 @@ typedef struct #if defined(HYPRE_USING_MAGMA) magma_queue_t magma_queue; #endif + +#if defined(HYPRE_USING_NODE_AWARE_MPI) + /* level at which to begin using node aware optimization */ + HYPRE_Int node_aware_switchover_threshold; + /* flag for using node aware optimization */ + HYPRE_Int using_node_aware_mpi; +#endif + } hypre_Handle; /* accessor macros to hypre_Handle */ @@ -1658,6 +1666,9 @@ typedef struct #define hypre_HandleMagmaQueue(hypre_handle) ((hypre_handle) -> magma_queue) +#define hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle) ((hypre_handle) -> node_aware_switchover_threshold) +#define hypre_HandleUsingNodeAwareMPI(hypre_handle) ((hypre_handle) -> using_node_aware_mpi) + #endif /****************************************************************************** * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other diff --git a/src/utilities/general.c b/src/utilities/general.c index c2a81c88fa..d7542b2d49 100644 --- a/src/utilities/general.c +++ b/src/utilities/general.c @@ -44,6 +44,11 @@ hypre_HandleCreate(void) hypre_HandleDeviceGSMethod(hypre_handle_) = 1; /* CPU: 0; Cusparse: 1 */ #endif +#if defined(HYPRE_USING_NODE_AWARE_MPI) + hypre_HandleUsingNodeAwareMPI(hypre_handle_) = 1; + hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle_) = 0; +#endif + return hypre_handle_; } diff --git a/src/utilities/handle.h b/src/utilities/handle.h index 6ff16bf228..30bc71c542 100644 --- a/src/utilities/handle.h +++ b/src/utilities/handle.h @@ -58,6 +58,14 @@ typedef struct #if defined(HYPRE_USING_MAGMA) magma_queue_t magma_queue; #endif + +#if defined(HYPRE_USING_NODE_AWARE_MPI) + /* level at which to begin using node aware optimization */ + HYPRE_Int node_aware_switchover_threshold; + /* flag for using node aware optimization */ + HYPRE_Int using_node_aware_mpi; +#endif + } hypre_Handle; /* accessor macros to hypre_Handle */ @@ -122,4 +130,9 @@ typedef struct #define hypre_HandleMagmaQueue(hypre_handle) ((hypre_handle) -> magma_queue) +#ifdef HYPRE_USING_NODE_AWARE_MPI +#define hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle) ((hypre_handle) -> node_aware_switchover_threshold) +#define hypre_HandleUsingNodeAwareMPI(hypre_handle) ((hypre_handle) -> using_node_aware_mpi) +#endif + #endif