From abd59b36d6895f711b58258de5cea95b4ac6bdb4 Mon Sep 17 00:00:00 2001
From: Ralph Castain
Date: Sat, 16 Mar 2024 06:16:54 -0600
Subject: [PATCH] Refactor the grpcomm support

Separate the fence and group operation backing collectives so we can
deal more easily with their unique needs. In particular, the group
construct operation is responsible for assigning context IDs,
collecting endpoint and group info, and generating event notifications
during release so that "add members" are released.

Signed-off-by: Ralph Castain
---
 examples/dynamic.c                            |   73 +-
 src/mca/errmgr/dvm/errmgr_dvm.c               |    9 +-
 src/mca/filem/raw/filem_raw_module.c          |    8 +-
 src/mca/grpcomm/base/Makefile.am              |    5 +-
 src/mca/grpcomm/base/base.h                   |   26 +-
 src/mca/grpcomm/base/grpcomm_base_frame.c     |  134 +-
 src/mca/grpcomm/base/grpcomm_base_select.c    |  108 +-
 src/mca/grpcomm/base/grpcomm_base_stubs.c     |  347 +---
 src/mca/grpcomm/direct/Makefile.am            |    5 +-
 src/mca/grpcomm/direct/grpcomm_direct.c       |  806 +---------
 src/mca/grpcomm/direct/grpcomm_direct.h       |  161 +-
 .../grpcomm/direct/grpcomm_direct_component.c |  175 ++-
 src/mca/grpcomm/direct/grpcomm_direct_fence.c |  670 ++++++++
 src/mca/grpcomm/direct/grpcomm_direct_group.c | 1399 +++++++++++++++++
 src/mca/grpcomm/direct/grpcomm_direct_xcast.c |  354 +++++
 src/mca/grpcomm/grpcomm.h                     |  187 +--
 src/mca/iof/hnp/iof_hnp_send.c                |    9 +-
 src/mca/plm/base/plm_base_launch_support.c    |   19 +-
 src/mca/plm/base/plm_base_prted_cmds.c        |   50 +-
 src/mca/plm/base/plm_base_receive.c           |   15 +-
 src/mca/state/dvm/state_dvm.c                 |   25 +-
 src/prted/pmix/pmix_server.c                  |   33 -
 src/prted/pmix/pmix_server_dyn.c              |  442 +----
 src/prted/pmix/pmix_server_fence.c            |   78 +-
 src/prted/pmix/pmix_server_gen.c              |   45 +-
 src/prted/pmix/pmix_server_group.c            |  421 ++---
 src/prted/pmix/pmix_server_internal.h         |    1 +
 src/prted/pmix/pmix_server_session.c          |    3 +-
 src/rml/rml_types.h                           |   65 +-
 .../data_type_support/prte_dt_copy_fns.c      |   11 -
 .../data_type_support/prte_dt_packing_fns.c   |   81 -
 .../data_type_support/prte_dt_print_fns.c     |   14 -
 .../data_type_support/prte_dt_unpacking_fns.c |  106 --
 src/runtime/prte_globals.h                    |    9 -
 34 files changed, 3119 insertions(+), 2775 deletions(-)
 create mode 100644 src/mca/grpcomm/direct/grpcomm_direct_fence.c
 create mode 100644 src/mca/grpcomm/direct/grpcomm_direct_group.c
 create mode 100644 src/mca/grpcomm/direct/grpcomm_direct_xcast.c

diff --git a/examples/dynamic.c b/examples/dynamic.c
index e92f5075fd..7fdc9d0617 100644
--- a/examples/dynamic.c
+++ b/examples/dynamic.c
@@ -17,7 +17,7 @@
  * Copyright (c) 2015 Mellanox Technologies, Inc. All rights reserved.
  * Copyright (c) 2016 Research Organization for Information Science
  * and Technology (RIST). All rights reserved.
- * Copyright (c) 2021 Nanook Consulting. All rights reserved.
+ * Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
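For orientation: after this refactor the grpcomm API reduces to three module entry points (xcast, fence, group) plus init/finalize, per the module table in grpcomm_direct.c below. A minimal sketch of the resulting call pattern, using the signatures declared in grpcomm_direct.h later in this patch; fence_done, example_caller, the "myjob" nspace, and the buffer argument are illustrative, not part of the patch:

/* hypothetical pmix_modex_cbfunc_t completion callback */
static void fence_done(pmix_status_t status, const char *data, size_t ndata,
                       void *cbdata, pmix_release_cbfunc_t relfn, void *relcbdata)
{
    PRTE_HIDE_UNUSED_PARAMS(status, data, ndata, cbdata);
    /* consume the collected data, then let the library release it */
    if (NULL != relfn) {
        relfn(relcbdata);
    }
}

static void example_caller(pmix_data_buffer_t *pbkt)
{
    pmix_proc_t procs[2];
    int rc;

    /* a fence across two procs, with no directives and no user payload */
    PMIX_LOAD_PROCID(&procs[0], "myjob", 0);
    PMIX_LOAD_PROCID(&procs[1], "myjob", 1);
    rc = prte_grpcomm.fence(procs, 2, NULL, 0, NULL, 0, fence_done, NULL);

    /* xcast no longer takes a signature argument - it always spans all
     * daemons (see the simplified errmgr and filem call sites below) */
    rc = prte_grpcomm.xcast(PRTE_RML_TAG_NOTIFICATION, pbkt);
    (void) rc;
}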
* $COPYRIGHT$ * * Additional copyrights may follow @@ -51,9 +51,9 @@ int main(int argc, char **argv) pmix_app_t *app; char hostname[1024], dir[1024]; pmix_proc_t *peers; - size_t npeers, ntmp = 0; + size_t npeers, ntmp = 0, n; char *nodelist; - char *cmd; + char *cmd, *tmp; if (0 > gethostname(hostname, sizeof(hostname))) { exit(1); @@ -96,6 +96,51 @@ int main(int argc, char **argv) PMIX_VALUE_RELEASE(val); fprintf(stderr, "Client %s:%d universe size %d\n", myproc.nspace, myproc.rank, nprocs); + + // put some values + if (0 > asprintf(&tmp, "%s-%d-remote1", myproc.nspace, myproc.rank)) { + exit(1); + } + value.type = PMIX_UINT64; + value.data.uint64 = 1234; + if (PMIX_SUCCESS != (rc = PMIx_Put(PMIX_REMOTE, tmp, &value))) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Put internal failed: %d\n", myproc.nspace, + myproc.rank, rc); + goto done; + } + free(tmp); + + if (0 > asprintf(&tmp, "%s-%d-remote2", myproc.nspace, myproc.rank)) { + exit(1); + } + value.type = PMIX_UINT64; + value.data.uint64 = 12345; + if (PMIX_SUCCESS != (rc = PMIx_Put(PMIX_REMOTE, tmp, &value))) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Put internal failed: %d\n", myproc.nspace, + myproc.rank, rc); + goto done; + } + free(tmp); + + if (0 > asprintf(&tmp, "%s-%d-remote3", myproc.nspace, myproc.rank)) { + exit(1); + } + value.type = PMIX_UINT64; + value.data.uint64 = 123456; + if (PMIX_SUCCESS != (rc = PMIx_Put(PMIX_REMOTE, tmp, &value))) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Put internal failed: %d\n", myproc.nspace, + myproc.rank, rc); + goto done; + } + free(tmp); + + /* push the data to our PMIx server */ + if (PMIX_SUCCESS != (rc = PMIx_Commit())) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Commit failed: %d\n", myproc.nspace, + myproc.rank, rc); + goto done; + } + /* call fence to sync */ (void) strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN); proc.rank = PMIX_RANK_WILDCARD; @@ -173,15 +218,10 @@ int main(int argc, char **argv) exitcode = rc; goto done; } - if ((nprocs + ntmp) != npeers) { - fprintf(stderr, - "Client ns %s rank %d: PMIx_Resolve_peers returned incorrect npeers: %d vs %d\n", - myproc.nspace, myproc.rank, (int) (nprocs + ntmp), (int) npeers); - exitcode = 1; - goto done; + fprintf(stderr, "Client ns %s rank %d PEERS:\n", myproc.nspace, myproc.rank); + for (n=0; n < npeers; n++) { + fprintf(stderr, "\t%s:%d\n", peers[n].nspace, peers[n].rank); } - fprintf(stderr, "Client ns %s rank %d: PMIx_Resolve_peers returned %d npeers\n", - myproc.nspace, myproc.rank, (int) npeers); if (PMIX_SUCCESS != (rc = PMIx_Resolve_nodes(nsp2, &nodelist))) { fprintf(stderr, "Client ns %s rank %d: PMIx_Resolve_nodes failed for nspace %s: %d\n", myproc.nspace, myproc.rank, nsp2, rc); @@ -197,15 +237,10 @@ int main(int argc, char **argv) exitcode = rc; goto done; } - if (nprocs != npeers) { - fprintf(stderr, - "Client ns %s rank %d: PMIx_Resolve_peers returned incorrect npeers: %d vs %d\n", - myproc.nspace, myproc.rank, nprocs, (int) npeers); - exitcode = rc; - goto done; + fprintf(stderr, "Client ns %s rank %d PEERS:\n", myproc.nspace, myproc.rank); + for (n=0; n < npeers; n++) { + fprintf(stderr, "\t%s:%d\n", peers[n].nspace, peers[n].rank); } - fprintf(stderr, "Client ns %s rank %d: PMIx_Resolve_peers returned %d npeers\n", - myproc.nspace, myproc.rank, (int) npeers); if (PMIX_SUCCESS != (rc = PMIx_Resolve_nodes(NULL, &nodelist))) { fprintf(stderr, "Client ns %s rank %d: PMIx_Resolve_nodes failed: %d\n", myproc.nspace, myproc.rank, rc); diff --git a/src/mca/errmgr/dvm/errmgr_dvm.c 
b/src/mca/errmgr/dvm/errmgr_dvm.c index 8c8cea4712..5b26532865 100644 --- a/src/mca/errmgr/dvm/errmgr_dvm.c +++ b/src/mca/errmgr/dvm/errmgr_dvm.c @@ -577,7 +577,6 @@ static void check_send_notification(prte_job_t *jdata, prte_proc_t *proc, pmix_status_t event) { - prte_grpcomm_signature_t sig; int rc; pmix_info_t *info; size_t ninfo; @@ -673,14 +672,8 @@ static void check_send_notification(prte_job_t *jdata, PMIX_INFO_FREE(info, ninfo); /* xcast it to everyone */ - PMIX_CONSTRUCT(&sig, prte_grpcomm_signature_t); - sig.signature = (pmix_proc_t *) malloc(sizeof(pmix_proc_t)); - PMIX_LOAD_PROCID(&sig.signature[0], PRTE_PROC_MY_NAME->nspace, PMIX_RANK_WILDCARD); - sig.sz = 1; - - if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(&sig, PRTE_RML_TAG_NOTIFICATION, &pbkt))) { + if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(PRTE_RML_TAG_NOTIFICATION, &pbkt))) { PRTE_ERROR_LOG(rc); } - PMIX_DESTRUCT(&sig); PMIX_DATA_BUFFER_DESTRUCT(&pbkt); } diff --git a/src/mca/filem/raw/filem_raw_module.c b/src/mca/filem/raw/filem_raw_module.c index f97b5553bd..2614417078 100644 --- a/src/mca/filem/raw/filem_raw_module.c +++ b/src/mca/filem/raw/filem_raw_module.c @@ -709,7 +709,6 @@ static void send_chunk(int xxx, short argc, void *cbdata) int32_t numbytes; int rc; pmix_data_buffer_t chunk; - prte_grpcomm_signature_t *sig; PRTE_HIDE_UNUSED_PARAMS(xxx, argc); PMIX_ACQUIRE_OBJECT(rev); @@ -786,18 +785,13 @@ static void send_chunk(int xxx, short argc, void *cbdata) } /* goes to all daemons */ - sig = PMIX_NEW(prte_grpcomm_signature_t); - sig->signature = (pmix_proc_t *) malloc(sizeof(pmix_proc_t)); - sig->sz = 1; - PMIX_LOAD_PROCID(&sig->signature[0], PRTE_PROC_MY_NAME->nspace, PMIX_RANK_WILDCARD); - if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(sig, PRTE_RML_TAG_FILEM_BASE, &chunk))) { + if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(PRTE_RML_TAG_FILEM_BASE, &chunk))) { PRTE_ERROR_LOG(rc); PMIX_DATA_BUFFER_DESTRUCT(&chunk); close(fd); return; } PMIX_DATA_BUFFER_DESTRUCT(&chunk); - PMIX_RELEASE(sig); rev->nchunk++; /* if num_bytes was zero, then we need to terminate the event diff --git a/src/mca/grpcomm/base/Makefile.am b/src/mca/grpcomm/base/Makefile.am index 97652758d0..c56b2b7cf9 100644 --- a/src/mca/grpcomm/base/Makefile.am +++ b/src/mca/grpcomm/base/Makefile.am @@ -12,7 +12,7 @@ # Copyright (c) 2011-2013 Los Alamos National Security, LLC. # All rights reserved. # Copyright (c) 2019 Intel, Inc. All rights reserved. -# Copyright (c) 2022 Nanook Consulting. All rights reserved. +# Copyright (c) 2022-2024 Nanook Consulting All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -25,5 +25,4 @@ headers += \ libprtemca_grpcomm_la_SOURCES += \ base/grpcomm_base_select.c \ - base/grpcomm_base_frame.c \ - base/grpcomm_base_stubs.c + base/grpcomm_base_frame.c diff --git a/src/mca/grpcomm/base/base.h b/src/mca/grpcomm/base/base.h index 3ce50aaa5f..eccb5c5f89 100644 --- a/src/mca/grpcomm/base/base.h +++ b/src/mca/grpcomm/base/base.h @@ -15,7 +15,7 @@ * Copyright (c) 2017-2020 Cisco Systems, Inc. All rights reserved * Copyright (c) 2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -61,34 +61,10 @@ PRTE_EXPORT int prte_grpcomm_base_select(void); * globals that might be needed */ typedef struct { - pmix_list_item_t super; - int pri; - prte_grpcomm_base_module_t *module; - pmix_mca_base_component_t *component; -} prte_grpcomm_base_active_t; -PMIX_CLASS_DECLARATION(prte_grpcomm_base_active_t); - -typedef struct { - pmix_list_t actives; - pmix_list_t ongoing; - pmix_hash_table_t sig_table; - char *transports; uint32_t context_id; } prte_grpcomm_base_t; PRTE_EXPORT extern prte_grpcomm_base_t prte_grpcomm_base; -/* Public API stubs */ -PRTE_EXPORT int prte_grpcomm_API_xcast(prte_grpcomm_signature_t *sig, prte_rml_tag_t tag, - pmix_data_buffer_t *buf); - -PRTE_EXPORT int prte_grpcomm_API_allgather(prte_pmix_mdx_caddy_t *cd); - -PRTE_EXPORT prte_grpcomm_coll_t *prte_grpcomm_base_get_tracker(prte_grpcomm_signature_t *sig, - bool create); - -PRTE_EXPORT int prte_pack_ctrl_options(pmix_byte_object_t *bo, - const pmix_info_t *info, size_t ninfo); - END_C_DECLS #endif diff --git a/src/mca/grpcomm/base/grpcomm_base_frame.c b/src/mca/grpcomm/base/grpcomm_base_frame.c index 4a1756ecae..19ffe3307f 100644 --- a/src/mca/grpcomm/base/grpcomm_base_frame.c +++ b/src/mca/grpcomm/base/grpcomm_base_frame.c @@ -47,56 +47,17 @@ * Global variables */ prte_grpcomm_base_t prte_grpcomm_base = { - .actives = PMIX_LIST_STATIC_INIT, - .ongoing = PMIX_LIST_STATIC_INIT, - .sig_table = PMIX_HASH_TABLE_STATIC_INIT, - .transports = NULL, - .context_id = 0 + .context_id = UINT32_MAX }; -prte_grpcomm_API_module_t prte_grpcomm = { - .xcast = prte_grpcomm_API_xcast, - .allgather = prte_grpcomm_API_allgather -}; - -static int base_register(pmix_mca_base_register_flag_t flags) -{ - PRTE_HIDE_UNUSED_PARAMS(flags); +prte_grpcomm_base_module_t prte_grpcomm = {0}; - prte_grpcomm_base.context_id = 1; - pmix_mca_base_var_register("prte", "grpcomm", "base", "starting_context_id", - "Starting value for assigning context id\'s", - PMIX_MCA_BASE_VAR_TYPE_INT, - &prte_grpcomm_base.context_id); - - return PRTE_SUCCESS; -} static int prte_grpcomm_base_close(void) { - prte_grpcomm_base_active_t *active; - void *key; - size_t size; - uint32_t *seq_number; - - PRTE_RML_CANCEL(PRTE_NAME_WILDCARD, PRTE_RML_TAG_XCAST); - - /* Close the active modules */ - PMIX_LIST_FOREACH(active, &prte_grpcomm_base.actives, prte_grpcomm_base_active_t) - { - if (NULL != active->module->finalize) { - active->module->finalize(); - } - } - PMIX_LIST_DESTRUCT(&prte_grpcomm_base.actives); - PMIX_LIST_DESTRUCT(&prte_grpcomm_base.ongoing); - for (void *_nptr = NULL; - PRTE_SUCCESS - == pmix_hash_table_get_next_key_ptr(&prte_grpcomm_base.sig_table, &key, &size, - (void **) &seq_number, _nptr, &_nptr);) { - free(seq_number); + if (NULL != prte_grpcomm.finalize) { + prte_grpcomm.finalize(); } - PMIX_DESTRUCT(&prte_grpcomm_base.sig_table); return pmix_mca_base_framework_components_close(&prte_grpcomm_base_framework, NULL); } @@ -107,82 +68,37 @@ static int prte_grpcomm_base_close(void) */ static int prte_grpcomm_base_open(pmix_mca_base_open_flag_t flags) { - PMIX_CONSTRUCT(&prte_grpcomm_base.actives, pmix_list_t); - PMIX_CONSTRUCT(&prte_grpcomm_base.ongoing, pmix_list_t); - PMIX_CONSTRUCT(&prte_grpcomm_base.sig_table, pmix_hash_table_t); - pmix_hash_table_init(&prte_grpcomm_base.sig_table, 128); - prte_grpcomm_base.context_id = UINT32_MAX; - return pmix_mca_base_framework_components_open(&prte_grpcomm_base_framework, flags); } -PMIX_MCA_BASE_FRAMEWORK_DECLARE(prte, grpcomm, "GRPCOMM", base_register, 
prte_grpcomm_base_open, +PMIX_MCA_BASE_FRAMEWORK_DECLARE(prte, grpcomm, "GRPCOMM", NULL, prte_grpcomm_base_open, prte_grpcomm_base_close, prte_grpcomm_base_static_components, PMIX_MCA_BASE_FRAMEWORK_FLAG_DEFAULT); -PMIX_CLASS_INSTANCE(prte_grpcomm_base_active_t, - pmix_list_item_t, - NULL, NULL); - -static void scon(prte_grpcomm_signature_t *p) -{ - p->groupID = NULL; - p->ctxid = 0; - p->ctxid_assigned = false; - p->signature = NULL; - p->sz = 0; - p->addmembers = NULL; - p->nmembers = 0; - p->bootstrap = 0; - p->finalmembership = NULL; - p->nfinal = 0; -} -static void sdes(prte_grpcomm_signature_t *p) -{ - if (NULL != p->groupID) { - free(p->groupID); - } - if (NULL != p->signature) { - free(p->signature); - } - if (NULL != p->addmembers) { - free(p->addmembers); - } - if (NULL != p->finalmembership) { - free(p->finalmembership); - } -} -PMIX_CLASS_INSTANCE(prte_grpcomm_signature_t, - pmix_object_t, - scon, sdes); - -static void ccon(prte_grpcomm_coll_t *p) +static void grpcon(prte_pmix_grp_caddy_t *p) { - p->sig = NULL; - p->status = PMIX_SUCCESS; - PMIX_DATA_BUFFER_CONSTRUCT(&p->bucket); - p->dmns = NULL; - p->ndmns = 0; - p->nexpected = 0; - p->nreported = 0; - p->assignID = false; - p->timeout = 0; - p->memsize = 0; - PMIX_CONSTRUCT(&p->addmembers, pmix_list_t); + PMIX_CONSTRUCT_LOCK(&p->lock); + p->op = PMIX_GROUP_NONE; + p->grpid = NULL; + p->procs = NULL; + p->nprocs = 0; + p->directives = NULL; + p->ndirs = 0; + p->info = NULL; + p->ninfo = 0; p->cbfunc = NULL; p->cbdata = NULL; - p->buffers = NULL; } -static void cdes(prte_grpcomm_coll_t *p) +static void grpdes(prte_pmix_grp_caddy_t *p) { - if (NULL != p->sig) { - PMIX_RELEASE(p->sig); + PMIX_DESTRUCT_LOCK(&p->lock); + if (NULL != p->grpid) { + free(p->grpid); + } + if (NULL != p->info) { + PMIX_INFO_FREE(p->info, p->ninfo); } - PMIX_DATA_BUFFER_DESTRUCT(&p->bucket); - PMIX_LIST_DESTRUCT(&p->addmembers); - free(p->dmns); - free(p->buffers); } -PMIX_CLASS_INSTANCE(prte_grpcomm_coll_t, - pmix_list_item_t, - ccon, cdes); +PMIX_CLASS_INSTANCE(prte_pmix_grp_caddy_t, + pmix_object_t, + grpcon, grpdes); diff --git a/src/mca/grpcomm/base/grpcomm_base_select.c b/src/mca/grpcomm/base/grpcomm_base_select.c index 44c0cd88b6..ca0fd5d6b9 100644 --- a/src/mca/grpcomm/base/grpcomm_base_select.c +++ b/src/mca/grpcomm/base/grpcomm_base_select.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -26,104 +26,36 @@ #include "src/mca/base/pmix_base.h" #include "src/mca/mca.h" -#include "src/runtime/prte_globals.h" -#include "src/util/name_fns.h" - #include "src/mca/grpcomm/base/base.h" -static bool selected = false; - /** * Function for selecting one component from all those that are * available. 
*/ int prte_grpcomm_base_select(void) { - pmix_mca_base_component_list_item_t *cli = NULL; - pmix_mca_base_component_t *component = NULL; - pmix_mca_base_module_t *module = NULL; - prte_grpcomm_base_module_t *nmodule; - prte_grpcomm_base_active_t *newmodule, *mod; - int rc, priority; - bool inserted; - - if (selected) { - /* ensure we don't do this twice */ - return PRTE_SUCCESS; + prte_grpcomm_base_component_t *best_component = NULL; + prte_grpcomm_base_module_t *best_module = NULL; + pmix_status_t rc; + + /* + * Select the best component + */ + rc = pmix_mca_base_select("grpcomm", prte_grpcomm_base_framework.framework_output, + &prte_grpcomm_base_framework.framework_components, + (pmix_mca_base_module_t **) &best_module, + (pmix_mca_base_component_t **) &best_component, NULL); + if (PMIX_SUCCESS != rc) { + /* This will only happen if no component was selected */ + return PRTE_ERR_NOT_FOUND; } - selected = true; - - /* Query all available components and ask if they have a module */ - PMIX_LIST_FOREACH(cli, &prte_grpcomm_base_framework.framework_components, - pmix_mca_base_component_list_item_t) - { - component = (pmix_mca_base_component_t *) cli->cli_component; - - pmix_output_verbose(5, prte_grpcomm_base_framework.framework_output, - "mca:grpcomm:select: checking available component %s", - component->pmix_mca_component_name); - - /* If there's no query function, skip it */ - if (NULL == component->pmix_mca_query_component) { - pmix_output_verbose(5, prte_grpcomm_base_framework.framework_output, - "mca:grpcomm:select: Skipping component [%s]. It does not " - "implement a query function", - component->pmix_mca_component_name); - continue; - } - /* Query the component */ - pmix_output_verbose(5, prte_grpcomm_base_framework.framework_output, - "mca:grpcomm:select: Querying component [%s]", - component->pmix_mca_component_name); - rc = component->pmix_mca_query_component(&module, &priority); - - /* If no module was returned, then skip component */ - if (PRTE_SUCCESS != rc || NULL == module) { - pmix_output_verbose( - 5, prte_grpcomm_base_framework.framework_output, - "mca:grpcomm:select: Skipping component [%s]. 
Query failed to return a module", - component->pmix_mca_component_name); - continue; - } - nmodule = (prte_grpcomm_base_module_t *) module; - - /* if the module fails to init, skip it */ - if (NULL == nmodule->init || PRTE_SUCCESS != nmodule->init()) { - continue; - } - - /* add to the list of selected modules */ - newmodule = PMIX_NEW(prte_grpcomm_base_active_t); - newmodule->pri = priority; - newmodule->module = nmodule; - newmodule->component = component; - - /* maintain priority order */ - inserted = false; - PMIX_LIST_FOREACH(mod, &prte_grpcomm_base.actives, prte_grpcomm_base_active_t) - { - if (priority > mod->pri) { - pmix_list_insert_pos(&prte_grpcomm_base.actives, (pmix_list_item_t *) mod, - &newmodule->super); - inserted = true; - break; - } - } - if (!inserted) { - /* must be lowest priority - add to end */ - pmix_list_append(&prte_grpcomm_base.actives, &newmodule->super); - } + /* Save the winner */ + prte_grpcomm = *best_module; + /* give it a chance to initialize */ + if (NULL != prte_grpcomm.init) { + prte_grpcomm.init(); } - if (4 < pmix_output_get_verbosity(prte_grpcomm_base_framework.framework_output)) { - pmix_output(0, "%s: Final grpcomm priorities", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); - /* show the prioritized list */ - PMIX_LIST_FOREACH(mod, &prte_grpcomm_base.actives, prte_grpcomm_base_active_t) - { - pmix_output(0, "\tComponent: %s Priority: %d", mod->component->pmix_mca_component_name, - mod->pri); - } - } return PRTE_SUCCESS; } diff --git a/src/mca/grpcomm/base/grpcomm_base_stubs.c b/src/mca/grpcomm/base/grpcomm_base_stubs.c index 8f6015d942..a6cc9e5852 100644 --- a/src/mca/grpcomm/base/grpcomm_base_stubs.c +++ b/src/mca/grpcomm/base/grpcomm_base_stubs.c @@ -103,9 +103,9 @@ int prte_grpcomm_API_xcast(prte_grpcomm_signature_t *sig, prte_rml_tag_t tag, return rc; } -static void allgather_stub(int fd, short args, void *cbdata) +static void grp_construct_stub(int fd, short args, void *cbdata) { - prte_pmix_mdx_caddy_t *cd = (prte_pmix_mdx_caddy_t *) cbdata; + prte_pmix_grp_caddy_t *cd = (prte_pmix_grp_caddy_t *) cbdata; int ret = PRTE_SUCCESS; prte_grpcomm_base_active_t *active; prte_grpcomm_coll_t *coll; @@ -115,19 +115,14 @@ static void allgather_stub(int fd, short args, void *cbdata) PMIX_ACQUIRE_OBJECT(cd); PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, - "%s grpcomm:base:allgather stub", + "%s grpcomm:base:grp_construct stub", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); /* retrieve an existing tracker, create it if not - * already found. The allgather module is responsible + * already found. 
The grp_construct module is responsible
 * for releasing it upon completion of the collective */
-    if (NULL != cd->sig->groupID) {
-        ret = pmix_hash_table_get_value_ptr(&prte_grpcomm_base.sig_table, (void *) cd->sig->groupID,
-                                            strlen(cd->sig->groupID), (void **) &seq_number);
-    } else {
-        ret = pmix_hash_table_get_value_ptr(&prte_grpcomm_base.sig_table, (void *) cd->sig->signature,
-                                            cd->sig->sz * sizeof(pmix_proc_t), (void **) &seq_number);
-    }
+    ret = pmix_hash_table_get_value_ptr(&prte_grpcomm_base.sig_table, (void *) cd->sig->groupID,
+                                        strlen(cd->sig->groupID), (void **) &seq_number);
     if (PMIX_ERR_NOT_FOUND == ret) {
         seq_number = (uint32_t *) malloc(sizeof(uint32_t));
         *seq_number = 0;
@@ -135,22 +130,17 @@
         *seq_number = *seq_number + 1;
     } else {
         PMIX_OUTPUT((prte_grpcomm_base_framework.framework_output,
-                     "%s rpcomm:base:allgather cannot get signature from hash table",
+                     "%s grpcomm:base:grp_construct cannot get signature from hash table",
                      PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)));
         PMIX_ERROR_LOG(ret);
         PMIX_RELEASE(cd);
         return;
     }
-    if (NULL != cd->sig->groupID) {
-        ret = pmix_hash_table_set_value_ptr(&prte_grpcomm_base.sig_table, (void *) cd->sig->groupID,
-                                            strlen(cd->sig->groupID), (void *) seq_number);
-    } else {
-        ret = pmix_hash_table_set_value_ptr(&prte_grpcomm_base.sig_table, (void *) cd->sig->signature,
-                                            cd->sig->sz * sizeof(pmix_proc_t), (void *) seq_number);
-    }
+    ret = pmix_hash_table_set_value_ptr(&prte_grpcomm_base.sig_table, (void *) cd->sig->groupID,
+                                        strlen(cd->sig->groupID), (void *) seq_number);
     if (PMIX_SUCCESS != ret) {
         PMIX_OUTPUT((prte_grpcomm_base_framework.framework_output,
-                     "%s rpcomm:base:allgather cannot add new signature to hash table",
+                     "%s grpcomm:base:grp_construct cannot add new signature to hash table",
                      PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)));
         PMIX_ERROR_LOG(ret);
         PMIX_RELEASE(cd);
@@ -170,333 +160,28 @@
     /* cycle thru the actives and see who can process it */
     PMIX_LIST_FOREACH(active, &prte_grpcomm_base.actives, prte_grpcomm_base_active_t)
     {
-        if (NULL != active->module->allgather) {
-            if (PRTE_SUCCESS == active->module->allgather(coll, cd)) {
+        if (NULL != active->module->grp_construct) {
+            if (PRTE_SUCCESS == active->module->grp_construct(coll, cd)) {
                 break;
             }
         }
     }
 }
 
-int prte_grpcomm_API_allgather(prte_pmix_mdx_caddy_t *cd)
+int prte_grpcomm_API_grp_construct(prte_pmix_grp_caddy_t *cd)
 {
     PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output,
-                         "%s grpcomm:base:allgather", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)));
+                         "%s grpcomm:base:grp_construct",
+                         PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)));
 
     /* must push this into the event library to ensure we can
      * access framework-global data safely */
-    prte_event_set(prte_event_base, &cd->ev, -1, PRTE_EV_WRITE, allgather_stub, cd);
+    prte_event_set(prte_event_base, &cd->ev, -1, PRTE_EV_WRITE, grp_construct_stub, cd);
     PMIX_POST_OBJECT(cd);
     prte_event_active(&cd->ev, PRTE_EV_WRITE, 1);
     return PRTE_SUCCESS;
 }
 
-prte_grpcomm_coll_t *prte_grpcomm_base_get_tracker(prte_grpcomm_signature_t *sig, bool create)
-{
-    prte_grpcomm_coll_t *coll;
-    int rc;
-    pmix_proc_t *p;
-    size_t n, nmb;
-    pmix_list_t plist;
-    prte_namelist_t *nm;
-    bool found;
-
-    /* search the existing tracker list to see if this already exists - we
-     * default to using the groupID if one is given, otherwise we fallback
-     * to the array of participating procs */
-    PMIX_LIST_FOREACH(coll, &prte_grpcomm_base.ongoing, prte_grpcomm_coll_t)
{ - if (NULL == sig->groupID && NULL == sig->signature) { - if (NULL == coll->sig->groupID && NULL == coll->sig->signature) { - /* only one collective can operate at a time - * across every process in the system */ - return coll; - } - /* if only one is NULL, then we can't possibly match */ - break; - } - if (NULL != sig->groupID) { - // must match groupID's - if (NULL != coll->sig->groupID && 0 == strcmp(sig->groupID, coll->sig->groupID)) { - PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, - "%s grpcomm:base:returning existing collective", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); - // if this is a bootstrap, adjust the membership - if (0 < sig->bootstrap) { - PMIX_CONSTRUCT(&plist, pmix_list_t); - for (n=0; n < sig->sz; n++) { - // see if we already have this proc - found = false; - for (nmb=0; nmb < coll->sig->sz; nmb++) { - if (PMIX_CHECK_PROCID(&sig->signature[n], &coll->sig->signature[nmb])) { - // yes, we do - found = true; - break; - } - } - if (!found) { - // cache the proc - nm = PMIX_NEW(prte_namelist_t); - memcpy(&nm->name, &sig->signature[n], sizeof(pmix_proc_t)); - pmix_list_append(&plist, &nm->super); - } - } - // add any missing procs to the signature - if (0 < pmix_list_get_size(&plist)) { - n = coll->sig->sz + pmix_list_get_size(&plist); - PMIX_PROC_CREATE(p, n); - memcpy(p, coll->sig->signature, coll->sig->sz * sizeof(pmix_proc_t)); - n = coll->sig->sz; - PMIX_LIST_FOREACH(nm, &plist, prte_namelist_t) { - memcpy(&p[n], &nm->name, sizeof(pmix_proc_t)); - ++n; - } - PMIX_LIST_DESTRUCT(&plist); - PMIX_PROC_FREE(coll->sig->signature, coll->sig->sz); - coll->sig->signature = p; - coll->sig->sz = n; - } - } - goto checkmembers; - } - } else if (sig->sz == coll->sig->sz) { - // must match proc signature - if (0 == memcmp(sig->signature, coll->sig->signature, sig->sz * sizeof(pmix_proc_t))) { - PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, - "%s grpcomm:base:returning existing collective", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); - goto checkmembers; - } - } - } - /* if we get here, then this is a new collective - so create - * the tracker for it */ - if (!create) { - PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, - "%s grpcomm:base: not creating new coll", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); - - return NULL; - } - coll = PMIX_NEW(prte_grpcomm_coll_t); - coll->sig = PMIX_NEW(prte_grpcomm_signature_t); - if (NULL != sig->groupID) { - coll->sig->groupID = strdup(sig->groupID); - } - // we have to know the participating procs - coll->sig->sz = sig->sz; - coll->sig->signature = (pmix_proc_t *) malloc(coll->sig->sz * sizeof(pmix_proc_t)); - memcpy(coll->sig->signature, sig->signature, coll->sig->sz * sizeof(pmix_proc_t)); - // need to know the bootstrap in case one is ongoing - coll->sig->bootstrap = sig->bootstrap; - pmix_list_append(&prte_grpcomm_base.ongoing, &coll->super); - - /* if this is a bootstrap operation, then there is no "rollup" - * collective - each daemon reports directly to the DVM controller */ - if (0 < coll->sig->bootstrap) { - coll->nexpected = coll->sig->bootstrap; - goto checkmembers; - } - - /* now get the daemons involved */ - if (PRTE_SUCCESS != (rc = create_dmns(sig, &coll->dmns, &coll->ndmns))) { - PRTE_ERROR_LOG(rc); - return NULL; - } - - /* count the number of contributions we should get */ - coll->nexpected = prte_rml_get_num_contributors(coll->dmns, coll->ndmns); - - /* see if I am in the array of participants - note that I may - * be in the rollup tree even though I'm not 
participating - * in the collective itself */ - for (n = 0; n < coll->ndmns; n++) { - if (coll->dmns[n] == PRTE_PROC_MY_NAME->rank) { - coll->nexpected++; - break; - } - } - -checkmembers: - // add any addmembers that were given - if (NULL != sig->addmembers) { - if (NULL == coll->sig->addmembers) { - PMIX_PROC_CREATE(coll->sig->addmembers, sig->nmembers); - memcpy(coll->sig->addmembers, sig->addmembers, sig->nmembers * sizeof(pmix_proc_t)); - coll->sig->nmembers = sig->nmembers; - } else { - // aggregate them - PMIX_CONSTRUCT(&plist, pmix_list_t); - for (n=0; n < sig->nmembers; n++) { - // see if we already have this proc - found = false; - for (nmb=0; nmb < coll->sig->nmembers; nmb++) { - if (PMIX_CHECK_PROCID(&sig->addmembers[n], &coll->sig->addmembers[nmb])) { - // yes, we do - found = true; - break; - } - } - if (!found) { - // cache the proc - nm = PMIX_NEW(prte_namelist_t); - memcpy(&nm->name, &sig->addmembers[n], sizeof(pmix_proc_t)); - pmix_list_append(&plist, &nm->super); - } - } - // add any missing procs to the members - if (0 < pmix_list_get_size(&plist)) { - n = coll->sig->nmembers + pmix_list_get_size(&plist); - PMIX_PROC_CREATE(p, n); - memcpy(p, coll->sig->addmembers, coll->sig->nmembers * sizeof(pmix_proc_t)); - n = coll->sig->nmembers; - PMIX_LIST_FOREACH(nm, &plist, prte_namelist_t) { - memcpy(&p[n], &nm->name, sizeof(pmix_proc_t)); - ++n; - } - PMIX_LIST_DESTRUCT(&plist); - PMIX_PROC_FREE(coll->sig->addmembers, coll->sig->nmembers); - coll->sig->addmembers = p; - coll->sig->nmembers = n; - } - } - } - - return coll; -} - -static int create_dmns(prte_grpcomm_signature_t *sig, pmix_rank_t **dmns, size_t *ndmns) -{ - size_t n; - prte_job_t *jdata; - prte_proc_t *proc; - prte_node_t *node; - int i; - pmix_list_t ds; - prte_namelist_t *nm; - pmix_rank_t vpid; - bool found; - size_t nds = 0; - pmix_rank_t *dns = NULL; - int rc = PRTE_SUCCESS; - - PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, - "%s grpcomm:base:create_dmns called with %s signature size %" PRIsize_t "", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - (NULL == sig->signature) ? "NULL" : "NON-NULL", sig->sz)); - - /* if NULL == procs, or the target jobid is our own, - * then all daemons are participating */ - if (NULL == sig->signature || - PMIX_CHECK_NSPACE(PRTE_PROC_MY_NAME->nspace, sig->signature[0].nspace)) { - *ndmns = prte_process_info.num_daemons; - *dmns = NULL; - return PRTE_SUCCESS; - } - - PMIX_CONSTRUCT(&ds, pmix_list_t); - for (n = 0; n < sig->sz; n++) { - if (NULL == (jdata = prte_get_job_data_object(sig->signature[n].nspace))) { - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - rc = PRTE_ERR_NOT_FOUND; - break; - } - if (NULL == jdata->map || 0 == jdata->map->num_nodes) { - /* we haven't generated a job map yet - if we are the HNP, - * then we should only involve ourselves. 
Otherwise, we have - * no choice but to abort to avoid hangs */ - if (PRTE_PROC_IS_MASTER) { - rc = PRTE_SUCCESS; - break; - } - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - rc = PRTE_ERR_NOT_FOUND; - break; - } - if (PMIX_RANK_WILDCARD == sig->signature[n].rank) { - PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, - "%s grpcomm:base:create_dmns called for all procs in job %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - PRTE_JOBID_PRINT(sig->signature[0].nspace))); - /* all daemons hosting this jobid are participating */ - for (i = 0; i < jdata->map->nodes->size; i++) { - if (NULL == (node = pmix_pointer_array_get_item(jdata->map->nodes, i))) { - continue; - } - if (NULL == node->daemon) { - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - rc = PRTE_ERR_NOT_FOUND; - goto done; - } - found = false; - PMIX_LIST_FOREACH(nm, &ds, prte_namelist_t) - { - if (nm->name.rank == node->daemon->name.rank) { - found = true; - break; - } - } - if (!found) { - PMIX_OUTPUT_VERBOSE((5, prte_grpcomm_base_framework.framework_output, - "%s grpcomm:base:create_dmns adding daemon %s to list", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - PRTE_NAME_PRINT(&node->daemon->name))); - nm = PMIX_NEW(prte_namelist_t); - PMIX_LOAD_PROCID(&nm->name, PRTE_PROC_MY_NAME->nspace, node->daemon->name.rank); - pmix_list_append(&ds, &nm->super); - } - } - } else { - /* lookup the daemon for this proc and add it to the list */ - PMIX_OUTPUT_VERBOSE((5, prte_grpcomm_base_framework.framework_output, - "%s sign: GETTING PROC OBJECT FOR %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - PRTE_NAME_PRINT(&sig->signature[n]))); - proc = (prte_proc_t *) pmix_pointer_array_get_item(jdata->procs, - sig->signature[n].rank); - if (NULL == proc) { - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - rc = PRTE_ERR_NOT_FOUND; - goto done; - } - if (NULL == proc->node || NULL == proc->node->daemon) { - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - rc = PRTE_ERR_NOT_FOUND; - goto done; - } - vpid = proc->node->daemon->name.rank; - found = false; - PMIX_LIST_FOREACH(nm, &ds, prte_namelist_t) - { - if (nm->name.rank == vpid) { - found = true; - break; - } - } - if (!found) { - nm = PMIX_NEW(prte_namelist_t); - PMIX_LOAD_PROCID(&nm->name, PRTE_PROC_MY_NAME->nspace, vpid); - pmix_list_append(&ds, &nm->super); - } - } - } - -done: - if (0 < pmix_list_get_size(&ds)) { - dns = (pmix_rank_t *) malloc(pmix_list_get_size(&ds) * sizeof(pmix_rank_t)); - nds = 0; - while (NULL != (nm = (prte_namelist_t *) pmix_list_remove_first(&ds))) { - PMIX_OUTPUT_VERBOSE((5, prte_grpcomm_base_framework.framework_output, - "%s grpcomm:base:create_dmns adding daemon %s to array", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&nm->name))); - dns[nds++] = nm->name.rank; - PMIX_RELEASE(nm); - } - } - PMIX_LIST_DESTRUCT(&ds); - *dmns = dns; - *ndmns = nds; - return rc; -} static int pack_xcast(prte_grpcomm_signature_t *sig, pmix_data_buffer_t *buffer, pmix_data_buffer_t *message, prte_rml_tag_t tag) diff --git a/src/mca/grpcomm/direct/Makefile.am b/src/mca/grpcomm/direct/Makefile.am index ec7146cffa..68060ee172 100644 --- a/src/mca/grpcomm/direct/Makefile.am +++ b/src/mca/grpcomm/direct/Makefile.am @@ -4,7 +4,7 @@ # reserved. # Copyright (c) 2014-2020 Intel, Inc. All rights reserved. # Copyright (c) 2017 IBM Corporation. All rights reserved. -# Copyright (c) 2022 Nanook Consulting. All rights reserved. +# Copyright (c) 2022-2024 Nanook Consulting All rights reserved. 
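The hunk below adds one source file per operation (xcast, fence, group). Each operation tracks its ongoing collectives on the component-level lists declared in grpcomm_direct.h at the end of this patch. A sketch of that lookup pattern, assuming the list and signature types from that header; the helper name is illustrative, and the real code lives in the new grpcomm_direct_fence.c, which this diff does not reproduce:

/* illustrative: find the tracker whose participant signature matches */
static prte_grpcomm_fence_t *find_fence_tracker(prte_grpcomm_direct_fence_signature_t *sig)
{
    prte_grpcomm_fence_t *fc;

    PMIX_LIST_FOREACH(fc, &prte_mca_grpcomm_direct_component.fence_ops,
                      prte_grpcomm_fence_t) {
        if (sig->sz == fc->sig->sz &&
            0 == memcmp(sig->signature, fc->sig->signature,
                        sig->sz * sizeof(pmix_proc_t))) {
            return fc;
        }
    }
    return NULL;  /* not found - caller creates and appends a new tracker */
}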
# $COPYRIGHT$ # # Additional copyrights may follow @@ -17,6 +17,9 @@ AM_CPPFLAGS = $(grpcomm_direct_CPPFLAGS) sources = \ grpcomm_direct.h \ grpcomm_direct.c \ + grpcomm_direct_xcast.c \ + grpcomm_direct_fence.c \ + grpcomm_direct_group.c \ grpcomm_direct_component.c # Make the output library in this directory, and name it either diff --git a/src/mca/grpcomm/direct/grpcomm_direct.c b/src/mca/grpcomm/direct/grpcomm_direct.c index ecca487229..71c137bdf9 100644 --- a/src/mca/grpcomm/direct/grpcomm_direct.c +++ b/src/mca/grpcomm/direct/grpcomm_direct.c @@ -40,44 +40,42 @@ /* Static API's */ static int init(void); static void finalize(void); -static int xcast(pmix_rank_t *vpids, size_t nprocs, pmix_data_buffer_t *buf); -static int allgather(prte_grpcomm_coll_t *coll, - prte_pmix_mdx_caddy_t *cd); /* Module def */ prte_grpcomm_base_module_t prte_grpcomm_direct_module = { .init = init, .finalize = finalize, - .xcast = xcast, - .allgather = allgather + .xcast = prte_grpcomm_direct_xcast, + .fence = prte_grpcomm_direct_fence, + .group = prte_grpcomm_direct_group }; -/* internal functions */ -static void xcast_recv(int status, pmix_proc_t *sender, pmix_data_buffer_t *buffer, - prte_rml_tag_t tag, void *cbdata); -static void allgather_recv(int status, pmix_proc_t *sender, pmix_data_buffer_t *buffer, - prte_rml_tag_t tag, void *cbdata); -static void barrier_release(int status, pmix_proc_t *sender, pmix_data_buffer_t *buffer, - prte_rml_tag_t tag, void *cbdata); - -/* internal variables */ -static pmix_list_t tracker; - /** * Initialize the module */ static int init(void) { - PMIX_CONSTRUCT(&tracker, pmix_list_t); + /* setup the trackers */ + PMIX_CONSTRUCT(&prte_mca_grpcomm_direct_component.fence_ops, pmix_list_t); + PMIX_CONSTRUCT(&prte_mca_grpcomm_direct_component.group_ops, pmix_list_t); - /* post the receives */ + /* xcast receive */ PRTE_RML_RECV(PRTE_NAME_WILDCARD, PRTE_RML_TAG_XCAST, - PRTE_RML_PERSISTENT, xcast_recv, NULL); - PRTE_RML_RECV(PRTE_NAME_WILDCARD, PRTE_RML_TAG_ALLGATHER_DIRECT, - PRTE_RML_PERSISTENT, allgather_recv, NULL); + PRTE_RML_PERSISTENT, prte_grpcomm_direct_xcast_recv, NULL); + + /* fence receives */ + PRTE_RML_RECV(PRTE_NAME_WILDCARD, PRTE_RML_TAG_FENCE, + PRTE_RML_PERSISTENT, prte_grpcomm_direct_fence_recv, NULL); /* setup recv for barrier release */ - PRTE_RML_RECV(PRTE_NAME_WILDCARD, PRTE_RML_TAG_COLL_RELEASE, - PRTE_RML_PERSISTENT, barrier_release, NULL); + PRTE_RML_RECV(PRTE_NAME_WILDCARD, PRTE_RML_TAG_FENCE_RELEASE, + PRTE_RML_PERSISTENT, prte_grpcomm_direct_fence_release, NULL); + + /* group receives */ + PRTE_RML_RECV(PRTE_NAME_WILDCARD, PRTE_RML_TAG_GROUP, + PRTE_RML_PERSISTENT, prte_grpcomm_direct_grp_recv, NULL); + + PRTE_RML_RECV(PRTE_NAME_WILDCARD, PRTE_RML_TAG_GROUP_RELEASE, + PRTE_RML_PERSISTENT, prte_grpcomm_direct_grp_release, NULL); return PRTE_SUCCESS; } @@ -87,760 +85,14 @@ static int init(void) */ static void finalize(void) { - PMIX_LIST_DESTRUCT(&tracker); - return; -} - -static int xcast(pmix_rank_t *vpids, size_t nprocs, pmix_data_buffer_t *buf) -{ - int rc; - PRTE_HIDE_UNUSED_PARAMS(vpids, nprocs); - - /* send it to the HNP (could be myself) for relay */ - PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP->rank, buf, PRTE_RML_TAG_XCAST); - if (PRTE_SUCCESS != rc) { - PRTE_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(buf); - return rc; - } - return PRTE_SUCCESS; -} - -static int allgather(prte_grpcomm_coll_t *coll, - prte_pmix_mdx_caddy_t *cd) -{ - int rc; - pmix_data_buffer_t *relay; - - PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, - "%s 
grpcomm:direct: allgather", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); - - /* the base functions pushed us into the event library - * before calling us, so we can safely access global data - * at this point */ - - PMIX_DATA_BUFFER_CREATE(relay); - /* pack the signature */ - rc = prte_grpcomm_sig_pack(relay, coll->sig); - if (PRTE_SUCCESS != rc) { - PRTE_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(relay); - return rc; - } - - /* pack the ctrls */ - rc = PMIx_Data_pack(NULL, relay, &cd->ctrls, 1, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(relay); - return prte_pmix_convert_status(rc); - } - - /* pass along the payload */ - rc = PMIx_Data_copy_payload(relay, cd->buf); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(relay); - return prte_pmix_convert_status(rc); - } - - /* if this is a bootstrap operation, send it to the HNP */ - if (0 < coll->sig->bootstrap) { - PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP->rank, relay, - PRTE_RML_TAG_ALLGATHER_DIRECT); - return rc; - } - - /* send this to ourselves for processing */ - PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, - "%s grpcomm:direct:allgather sending to ourself", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); - - /* send the info to ourselves for tracking */ - PRTE_RML_SEND(rc, PRTE_PROC_MY_NAME->rank, relay, - PRTE_RML_TAG_ALLGATHER_DIRECT); - return rc; -} - -static void allgather_recv(int status, pmix_proc_t *sender, - pmix_data_buffer_t *buffer, - prte_rml_tag_t tag, void *cbdata) -{ - int32_t cnt; - int rc, timeout; - size_t n, ninfo, m; - bool assignID = false; - bool found; - pmix_list_t nmlist; - prte_namelist_t *nm, *nm2; - pmix_data_array_t darray; - pmix_status_t st; - pmix_info_t *info, infostat; - prte_grpcomm_signature_t *sig = NULL; - pmix_byte_object_t ctrlsbo; - pmix_data_buffer_t ctrlbuf; - pmix_data_buffer_t *reply; - prte_grpcomm_coll_t *coll; - pmix_proc_t *addmembers; - PRTE_HIDE_UNUSED_PARAMS(status, tag, cbdata); - - PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, - "%s grpcomm:direct allgather recvd from %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(sender))); - - /* unpack the signature */ - rc = prte_grpcomm_sig_unpack(buffer, &sig); - if (PRTE_SUCCESS != rc) { - PRTE_ERROR_LOG(rc); - } - - /* check for the tracker and create it if not found */ - if (NULL == (coll = prte_grpcomm_base_get_tracker(sig, true))) { - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - PMIX_RELEASE(sig); - return; - } - - /* unpack the ctrls from this contributor */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buffer, &ctrlsbo, &cnt, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(sig); - return; - } - PMIX_DATA_BUFFER_CONSTRUCT(&ctrlbuf); - rc = PMIx_Data_load(&ctrlbuf, &ctrlsbo); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(sig); - PMIX_BYTE_OBJECT_DESTRUCT(&ctrlsbo); - return; - } - PMIX_BYTE_OBJECT_DESTRUCT(&ctrlsbo); - - /* unpack the number of info's in the ctrls */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, &ctrlbuf, &ninfo, &cnt, PMIX_SIZE); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(sig); - PMIX_DATA_BUFFER_DESTRUCT(&ctrlbuf); - return; - } - if (0 < ninfo) { - PMIX_INFO_CREATE(info, ninfo); - cnt = ninfo; - rc = PMIx_Data_unpack(NULL, &ctrlbuf, info, &cnt, PMIX_INFO); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(sig); - PMIX_DATA_BUFFER_DESTRUCT(&ctrlbuf); - return; - } - } - PMIX_DATA_BUFFER_DESTRUCT(&ctrlbuf); - - /* cycle thru the 
ctrls to look for keys we support */ - for (n=0; n < ninfo; n++) { - if (PMIX_CHECK_KEY(&info[n], PMIX_TIMEOUT)) { - PMIX_VALUE_GET_NUMBER(rc, &info[n].value, timeout, int); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(sig); - return; - } - if (coll->timeout < timeout) { - coll->timeout = timeout; - } - /* update the info with the collected value */ - info[n].value.type = PMIX_INT; - info[n].value.data.integer = coll->timeout; - } else if (PMIX_CHECK_KEY(&info[n], PMIX_LOCAL_COLLECTIVE_STATUS)) { - PMIX_VALUE_GET_NUMBER(rc, &info[n].value, st, pmix_status_t); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(sig); - return; - } - if (PMIX_SUCCESS != st && - PMIX_SUCCESS == coll->status) { - coll->status = st; - } - /* update the info with the collected value */ - info[n].value.type = PMIX_STATUS; - info[n].value.data.status = coll->status; - } else if (PMIX_CHECK_KEY(&info[n], PMIX_GROUP_ASSIGN_CONTEXT_ID)) { - assignID = PMIX_INFO_TRUE(&info[n]); - if (assignID) { - coll->assignID = true; - } - /* update the info with the collected value */ - info[n].value.type = PMIX_BOOL; - info[n].value.data.flag = coll->assignID; - } - } - - // check for any added members - if (NULL != sig->addmembers) { - // add them to the global collective - for (m=0; m < sig->nmembers; m++) { - // check to see if we already have this member - found = false; - PMIX_LIST_FOREACH(nm, &coll->addmembers, prte_namelist_t) { - if (PMIX_CHECK_PROCID(&nm->name, &sig->addmembers[m])) { - // already have it - found = true; - // if the new rank is wildcard, keep it - if (PMIX_RANK_WILDCARD == sig->addmembers[m].rank) { - nm->name.rank = PMIX_RANK_WILDCARD; - } - break; - } - } - if (!found) { - nm = PMIX_NEW(prte_namelist_t); - PMIX_XFER_PROCID(&nm->name, &sig->addmembers[m]); - pmix_list_append(&coll->addmembers, &nm->super); - } - } - } - - /* increment nprocs reported for collective */ - coll->nreported++; - /* capture any provided content */ - rc = PMIx_Data_copy_payload(&coll->bucket, buffer); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(sig); - return; - } - - PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, - "%s grpcomm:direct allgather recv nexpected %d nrep %d", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (int) coll->nexpected, - (int) coll->nreported)); - - /* see if everyone has reported */ - if (coll->nreported == coll->nexpected) { - if (PRTE_PROC_IS_MASTER) { - PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, - "%s grpcomm:direct allgather HNP reports complete", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); - /* the allgather is complete - send the xcast */ - PMIX_DATA_BUFFER_CREATE(reply); - - /* if we were asked to provide a context id, do so */ - if (assignID) { - coll->sig->ctxid = prte_grpcomm_base.context_id; - --prte_grpcomm_base.context_id; - coll->sig->ctxid_assigned = true; - } - - if (NULL != coll->sig->groupID) { - // construct the final membership - PMIX_CONSTRUCT(&nmlist, pmix_list_t); - // sadly, an exhaustive search - for (m=0; m < coll->sig->sz; m++) { - found = false; - PMIX_LIST_FOREACH(nm, &nmlist, prte_namelist_t) { - if (PMIX_CHECK_PROCID(&coll->sig->signature[m], &nm->name)) { - // if the new one is rank=WILDCARD, then ensure - // we keep it as wildcard - if (PMIX_RANK_WILDCARD == coll->sig->signature[m].rank) { - nm->name.rank = PMIX_RANK_WILDCARD; - } - found = true; - break; - } - } - if (!found) { - nm = PMIX_NEW(prte_namelist_t); - memcpy(&nm->name, &coll->sig->signature[m], sizeof(pmix_proc_t)); - 
pmix_list_append(&nmlist, &nm->super); - } - } - // now check any added members - PMIX_LIST_FOREACH(nm, &coll->addmembers, prte_namelist_t) { - found = false; - PMIX_LIST_FOREACH(nm2, &nmlist, prte_namelist_t) { - if (PMIX_CHECK_PROCID(&nm->name, &nm2->name)) { - // if the new one is rank=WILDCARD, then ensure - // we keep it as wildcard - if (PMIX_RANK_WILDCARD == nm->name.rank) { - nm2->name.rank = PMIX_RANK_WILDCARD; - } - found = true; - break; - } - } - if (!found) { - nm2 = PMIX_NEW(prte_namelist_t); - memcpy(&nm2->name, &nm->name, sizeof(pmix_proc_t)); - pmix_list_append(&nmlist, &nm2->super); - } - } - // create the array of members - coll->sig->nfinal = pmix_list_get_size(&nmlist); - PMIX_PROC_CREATE(coll->sig->finalmembership, coll->sig->nfinal); - m = 0; - PMIX_LIST_FOREACH(nm, &nmlist, prte_namelist_t) { - memcpy(&coll->sig->finalmembership[m], &nm->name, sizeof(pmix_proc_t)); - ++m; - } - PMIX_LIST_DESTRUCT(&nmlist); - - /* sort the procs so everyone gets the same order */ - qsort(coll->sig->finalmembership, coll->sig->nfinal, sizeof(pmix_proc_t), pmix_util_compare_proc); - } - - /* pack the signature */ - rc = prte_grpcomm_sig_pack(reply, coll->sig); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(reply); - PMIX_RELEASE(sig); - return; - } - /* pack the status */ - rc = PMIx_Data_pack(NULL, reply, &coll->status, 1, PMIX_INT32); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(reply); - PMIX_RELEASE(sig); - return; - } - /* add some values to the payload in the bucket */ - PMIX_DATA_BUFFER_CONSTRUCT(&ctrlbuf); - - /* if we assigned a context id, include it in the bucket as well */ - if (assignID) { - PMIX_INFO_LOAD(&infostat, PMIX_GROUP_CONTEXT_ID, &coll->sig->ctxid, PMIX_SIZE); - rc = PMIx_Data_pack(NULL, &ctrlbuf, &infostat, 1, PMIX_INFO); - PMIX_INFO_DESTRUCT(&infostat); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(reply); - PMIX_DATA_BUFFER_DESTRUCT(&ctrlbuf); - PMIX_RELEASE(sig); - return; - } - } - /* if this is a group operation, provide group info */ - if (NULL != coll->sig->groupID) { - // provide the group ID - PMIX_INFO_LOAD(&infostat, PMIX_GROUP_ID, sig->groupID, PMIX_STRING); - rc = PMIx_Data_pack(NULL, &ctrlbuf, &infostat, 1, PMIX_INFO); - PMIX_INFO_DESTRUCT(&infostat); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(reply); - PMIX_DATA_BUFFER_DESTRUCT(&ctrlbuf); - PMIX_RELEASE(sig); - return; - } - // provide the final membership in an attribute - darray.type = PMIX_PROC; - darray.array = coll->sig->finalmembership; - darray.size = coll->sig->nfinal; - PMIX_INFO_LOAD(&infostat, PMIX_GROUP_MEMBERSHIP, &darray, PMIX_DATA_ARRAY); // copies array - // do not destruct the array - rc = PMIx_Data_pack(NULL, &ctrlbuf, &infostat, 1, PMIX_INFO); - PMIX_INFO_DESTRUCT(&infostat); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(reply); - PMIX_DATA_BUFFER_DESTRUCT(&ctrlbuf); - PMIX_RELEASE(sig); - return; - } - // provide the add-members in an attribute - if (0 < pmix_list_get_size(&coll->addmembers)) { - m = pmix_list_get_size(&coll->addmembers); - PMIX_PROC_CREATE(addmembers, m); - // create the array of add-members - n = 0; - PMIX_LIST_FOREACH(nm, &coll->addmembers, prte_namelist_t) { - memcpy(&addmembers[n], &nm->name, sizeof(pmix_proc_t)); - ++n; - } - darray.type = PMIX_PROC; - darray.array = addmembers; - darray.size = m; - PMIX_INFO_LOAD(&infostat, PMIX_GROUP_ADD_MEMBERS, &darray, PMIX_DATA_ARRAY); // copies array - 
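Aside: the PMIX_GROUP_CONTEXT_ID, PMIX_GROUP_MEMBERSHIP, and PMIX_GROUP_ADD_MEMBERS values packed in this removed block are what a client ultimately receives back from a group construct. A minimal client-side sketch, assuming standard PMIx_Group_construct semantics; the group name, procs, and result handling are illustrative and not part of this patch:

static void group_example(void)
{
    pmix_proc_t members[2];
    pmix_info_t directive, *results = NULL;
    size_t nresults = 0, n;
    pmix_status_t rc;

    PMIX_LOAD_PROCID(&members[0], "myjob", 0);
    PMIX_LOAD_PROCID(&members[1], "myjob", 1);
    /* ask the runtime to assign a context ID during construction */
    PMIX_INFO_LOAD(&directive, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL);
    rc = PMIx_Group_construct("mygrp", members, 2, &directive, 1,
                              &results, &nresults);
    if (PMIX_SUCCESS != rc) {
        return;
    }
    for (n = 0; n < nresults; n++) {
        if (PMIX_CHECK_KEY(&results[n], PMIX_GROUP_CONTEXT_ID)) {
            /* the context ID assigned by the DVM controller, as packed above */
        }
    }
    PMIX_INFO_FREE(results, nresults);
}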
PMIX_DATA_ARRAY_DESTRUCT(&darray); - rc = PMIx_Data_pack(NULL, &ctrlbuf, &infostat, 1, PMIX_INFO); - PMIX_INFO_DESTRUCT(&infostat); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(reply); - PMIX_DATA_BUFFER_DESTRUCT(&ctrlbuf); - PMIX_RELEASE(sig); - return; - } - } - } - - // pack the ctrl object - PMIX_DATA_BUFFER_UNLOAD(&ctrlbuf, ctrlsbo.bytes, ctrlsbo.size); - PMIX_DATA_BUFFER_DESTRUCT(&ctrlbuf); - rc = PMIx_Data_pack(NULL, reply, &ctrlsbo, 1, PMIX_BYTE_OBJECT); - PMIX_BYTE_OBJECT_DESTRUCT(&ctrlsbo); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(reply); - PMIX_RELEASE(sig); - return; - } - - /* transfer the collected bucket */ - rc = PMIx_Data_copy_payload(reply, &coll->bucket); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(reply); - PMIX_RELEASE(sig); - return; - } - - /* send the release via xcast */ - (void) prte_grpcomm.xcast(sig, PRTE_RML_TAG_COLL_RELEASE, reply); - } else { - PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, - "%s grpcomm:direct allgather rollup complete - sending to %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - PRTE_NAME_PRINT(PRTE_PROC_MY_PARENT))); - PMIX_DATA_BUFFER_CREATE(reply); - /* pack the signature */ - rc = prte_grpcomm_sig_pack(reply, sig); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(reply); - PMIX_RELEASE(sig); - return; - } - PMIX_RELEASE(sig); - sig = NULL; - /* pass along the ctrls - we have updated the values - * we collected along the way */ - rc = prte_pack_ctrl_options(&ctrlsbo, info, ninfo); - if (PRTE_SUCCESS != rc) { - PMIX_DATA_BUFFER_RELEASE(reply); - return; - } - rc = PMIx_Data_pack(NULL, reply, &ctrlsbo, 1, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(reply); - PMIx_Byte_object_destruct(&ctrlsbo); - return; - } - PMIx_Byte_object_destruct(&ctrlsbo); - - /* transfer the collected bucket */ - rc = PMIx_Data_copy_payload(reply, &coll->bucket); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(reply); - return; - } - /* send the info to our parent */ - PRTE_RML_SEND(rc, PRTE_PROC_MY_PARENT->rank, reply, - PRTE_RML_TAG_ALLGATHER_DIRECT); - if (PRTE_SUCCESS != rc) { - PRTE_ERROR_LOG(rc); - PMIX_DATA_BUFFER_RELEASE(reply); - return; - } - } - } - if (NULL != sig) { - PMIX_RELEASE(sig); - } -} - -static void xcast_recv(int status, pmix_proc_t *sender, - pmix_data_buffer_t *buffer, - prte_rml_tag_t tg, void *cbdata) -{ - prte_routed_tree_t *nm; - int ret, cnt; - pmix_data_buffer_t *relay = NULL, *rly, *rlycopy; - pmix_data_buffer_t datbuf, *data; - bool compressed; - prte_job_t *daemons; - pmix_list_t coll; - prte_grpcomm_signature_t *sig = NULL; - prte_rml_tag_t tag; - pmix_byte_object_t bo, pbo; - pmix_value_t val; - pmix_proc_t dmn; - PRTE_HIDE_UNUSED_PARAMS(status, sender, tg, cbdata); - - PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, - "%s grpcomm:direct:xcast:recv: with %d bytes", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (int) buffer->bytes_used)); - - /* we need a passthru buffer to send to our children - we leave it - * as compressed data */ - PMIX_DATA_BUFFER_CREATE(rly); - ret = PMIx_Data_copy_payload(rly, buffer); - if (PMIX_SUCCESS != ret) { - PMIX_ERROR_LOG(ret); - PMIX_DATA_BUFFER_RELEASE(rly); - return; - } - PMIX_DATA_BUFFER_CONSTRUCT(&datbuf); - /* setup the relay list */ - PMIX_CONSTRUCT(&coll, pmix_list_t); - - /* unpack the flag to see if this payload is compressed */ - cnt = 1; - ret = 
PMIx_Data_unpack(NULL, buffer, &compressed, &cnt, PMIX_BOOL); - if (PMIX_SUCCESS != ret) { - PMIX_ERROR_LOG(ret); - PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); - PMIX_DATA_BUFFER_DESTRUCT(&datbuf); - PMIX_DESTRUCT(&coll); - PMIX_DATA_BUFFER_RELEASE(rly); - return; - } - /* unpack the data blob */ - cnt = 1; - ret = PMIx_Data_unpack(NULL, buffer, &pbo, &cnt, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != ret) { - PMIX_ERROR_LOG(ret); - PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); - PMIX_DESTRUCT(&coll); - PMIX_DATA_BUFFER_RELEASE(rly); - return; - } - if (compressed) { - /* decompress the data */ - if (PMIx_Data_decompress((uint8_t *) pbo.bytes, pbo.size, - (uint8_t **) &bo.bytes, &bo.size)) { - /* the data has been uncompressed */ - ret = PMIx_Data_load(&datbuf, &bo); - if (PMIX_SUCCESS != ret) { - PMIX_BYTE_OBJECT_DESTRUCT(&pbo); - PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); - PMIX_DATA_BUFFER_DESTRUCT(&datbuf); - PMIX_DESTRUCT(&coll); - PMIX_DATA_BUFFER_RELEASE(rly); - return; - } - } else { - pmix_show_help("help-prte-runtime.txt", "failed-to-uncompress", - true, prte_process_info.nodename); - PMIX_BYTE_OBJECT_DESTRUCT(&pbo); - PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); - PMIX_DATA_BUFFER_DESTRUCT(&datbuf); - PMIX_DESTRUCT(&coll); - PMIX_DATA_BUFFER_RELEASE(rly); - return; - } - } else { - ret = PMIx_Data_load(&datbuf, &pbo); - if (PMIX_SUCCESS != ret) { - PMIX_BYTE_OBJECT_DESTRUCT(&pbo); - PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); - PMIX_DATA_BUFFER_DESTRUCT(&datbuf); - PMIX_DESTRUCT(&coll); - PMIX_DATA_BUFFER_RELEASE(rly); - return; - } - } - PMIX_BYTE_OBJECT_DESTRUCT(&pbo); - data = &datbuf; - /* get the signature that we do not need */ - ret = prte_grpcomm_sig_unpack(data, &sig); - if (PMIX_SUCCESS != ret) { - PMIX_ERROR_LOG(ret); - PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); - PMIX_DATA_BUFFER_DESTRUCT(&datbuf); - PMIX_DESTRUCT(&coll); - PMIX_DATA_BUFFER_RELEASE(rly); - return; - } - PMIX_RELEASE(sig); + PMIX_LIST_DESTRUCT(&prte_mca_grpcomm_direct_component.fence_ops); + PMIX_LIST_DESTRUCT(&prte_mca_grpcomm_direct_component.group_ops); - /* get the target tag */ - cnt = 1; - ret = PMIx_Data_unpack(NULL, data, &tag, &cnt, PMIX_UINT32); - if (PMIX_SUCCESS != ret) { - PMIX_ERROR_LOG(ret); - PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); - PMIX_DATA_BUFFER_DESTRUCT(&datbuf); - PMIX_DESTRUCT(&coll); - PMIX_DATA_BUFFER_RELEASE(rly); - return; - } - - /* copy the msg for relay to ourselves */ - PMIX_DATA_BUFFER_CREATE(relay); - ret = PMIx_Data_copy_payload(relay, data); - if (PMIX_SUCCESS != ret) { - PMIX_ERROR_LOG(ret); - PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); - PMIX_DATA_BUFFER_DESTRUCT(&datbuf); - PMIX_DESTRUCT(&coll); - PMIX_DATA_BUFFER_RELEASE(rly); - PMIX_DATA_BUFFER_RELEASE(relay); - return; - } - - if (PRTE_RML_TAG_WIREUP == tag && !PRTE_PROC_IS_MASTER) { - if (PRTE_SUCCESS != (ret = prte_util_decode_nidmap(data))) { - PRTE_ERROR_LOG(ret); - PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); - PMIX_DATA_BUFFER_DESTRUCT(&datbuf); - PMIX_DESTRUCT(&coll); - PMIX_DATA_BUFFER_RELEASE(rly); - PMIX_DATA_BUFFER_RELEASE(relay); - return; - } - /* unpack the wireup info */ - cnt = 1; - while (PMIX_SUCCESS == (ret = PMIx_Data_unpack(NULL, data, &dmn, &cnt, PMIX_PROC))) { - PMIX_VALUE_CONSTRUCT(&val); - val.type = PMIX_STRING; - cnt = 1; - ret = PMIx_Data_unpack(NULL, data, &val.data.string, &cnt, PMIX_STRING); - if (PMIX_SUCCESS != ret) { - PMIX_ERROR_LOG(ret); - 
PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); - PMIX_DATA_BUFFER_DESTRUCT(&datbuf); - PMIX_DESTRUCT(&coll); - PMIX_DATA_BUFFER_RELEASE(rly); - PMIX_DATA_BUFFER_RELEASE(relay); - return; - } - - if (!PMIX_CHECK_PROCID(&dmn, PRTE_PROC_MY_HNP) && - !PMIX_CHECK_PROCID(&dmn, PRTE_PROC_MY_NAME) && - !PMIX_CHECK_PROCID(&dmn, PRTE_PROC_MY_PARENT)) { - /* store it locally */ - ret = PMIx_Store_internal(&dmn, PMIX_PROC_URI, &val); - PMIX_VALUE_DESTRUCT(&val); - if (PMIX_SUCCESS != ret) { - PMIX_ERROR_LOG(ret); - PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); - PMIX_DATA_BUFFER_DESTRUCT(&datbuf); - PMIX_DESTRUCT(&coll); - PMIX_DATA_BUFFER_RELEASE(rly); - PMIX_DATA_BUFFER_RELEASE(relay); - return; - } - } - } - if (PMIX_ERR_UNPACK_READ_PAST_END_OF_BUFFER != ret) { - PMIX_ERROR_LOG(ret); - } - } - - daemons = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace); - if (!prte_get_attribute(&daemons->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL)) { - /* send the message to each of our children */ - PMIX_LIST_FOREACH(nm, &prte_rml_base.children, prte_routed_tree_t) - { - PMIX_OUTPUT_VERBOSE((5, prte_grpcomm_base_framework.framework_output, - "%s grpcomm:direct:send_relay sending relay msg of %d bytes to %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (int) rly->bytes_used, - PRTE_VPID_PRINT(nm->rank))); - /* copy the buffer for send */ - PMIX_DATA_BUFFER_CREATE(rlycopy); - ret = PMIx_Data_copy_payload(rlycopy, rly); - if (PMIX_SUCCESS != ret) { - PRTE_ERROR_LOG(ret); - PMIX_DATA_BUFFER_RELEASE(rlycopy); - PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); - continue; - } - PRTE_RML_SEND(ret, nm->rank, rlycopy, PRTE_RML_TAG_XCAST); - if (PRTE_SUCCESS != ret) { - PRTE_ERROR_LOG(ret); - PMIX_DATA_BUFFER_RELEASE(rlycopy); - PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); - continue; - } - } - } - - /* cleanup */ - PMIX_LIST_DESTRUCT(&coll); - PMIX_DATA_BUFFER_RELEASE(rly); // retain accounting - - /* now pass the relay buffer to myself for processing IFF it - * wasn't just a wireup message - don't - * inject it into the RML system via send as that will compete - * with the relay messages down in the OOB. 
Instead, pass it - * directly to the RML message processor */ - if (PRTE_RML_TAG_WIREUP != tag) { - PRTE_RML_POST_MESSAGE(PRTE_PROC_MY_NAME, tag, 1, relay->base_ptr, relay->bytes_used); - relay->base_ptr = NULL; - relay->bytes_used = 0; - } - if (NULL != relay) { - PMIX_DATA_BUFFER_RELEASE(relay); - } - PMIX_DATA_BUFFER_DESTRUCT(&datbuf); -} - -static void barrier_release(int status, pmix_proc_t *sender, - pmix_data_buffer_t *buffer, - prte_rml_tag_t tag, void *cbdata) -{ - int32_t cnt; - int rc, ret; - prte_grpcomm_signature_t *sig = NULL; - prte_grpcomm_coll_t *coll; - pmix_byte_object_t bo; - PRTE_HIDE_UNUSED_PARAMS(status, sender, tag, cbdata); - - PMIX_OUTPUT_VERBOSE((5, prte_grpcomm_base_framework.framework_output, - "%s grpcomm:direct: barrier release called with %d bytes", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (int) buffer->bytes_used)); - - /* unpack the signature */ - rc = prte_grpcomm_sig_unpack(buffer, &sig); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - return; - } - - /* unpack the return status */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buffer, &ret, &cnt, PMIX_INT32); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(sig); - return; - } - - /* unpack the ctrls byte object */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buffer, &bo, &cnt, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(sig); - return; - } - PMIX_BYTE_OBJECT_DESTRUCT(&bo); // don't need it here - - /* check for the tracker - it is not an error if not - * found as that just means we are not involved - * in the collective */ - if (NULL == (coll = prte_grpcomm_base_get_tracker(sig, false))) { - PMIX_RELEASE(sig); - return; - } - - /* execute the callback */ - if (NULL != coll->cbfunc) { - coll->cbfunc(ret, buffer, coll->cbdata); - } - pmix_list_remove_item(&prte_grpcomm_base.ongoing, &coll->super); - PMIX_RELEASE(coll); - PMIX_RELEASE(sig); + PRTE_RML_CANCEL(PRTE_NAME_WILDCARD, PRTE_RML_TAG_XCAST); + PRTE_RML_CANCEL(PRTE_NAME_WILDCARD, PRTE_RML_TAG_FENCE); + PRTE_RML_CANCEL(PRTE_NAME_WILDCARD, PRTE_RML_TAG_FENCE_RELEASE); + PRTE_RML_CANCEL(PRTE_NAME_WILDCARD, PRTE_RML_TAG_GROUP); + PRTE_RML_CANCEL(PRTE_NAME_WILDCARD, PRTE_RML_TAG_GROUP_RELEASE); + return; } diff --git a/src/mca/grpcomm/direct/grpcomm_direct.h b/src/mca/grpcomm/direct/grpcomm_direct.h index 71c7de0de6..abf041533d 100644 --- a/src/mca/grpcomm/direct/grpcomm_direct.h +++ b/src/mca/grpcomm/direct/grpcomm_direct.h @@ -4,7 +4,7 @@ * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -25,9 +25,166 @@ BEGIN_C_DECLS * Grpcomm interfaces */ -PRTE_MODULE_EXPORT extern prte_grpcomm_base_component_t prte_mca_grpcomm_direct_component; +typedef struct { + prte_grpcomm_base_component_t super; + // track ongoing fence operations - list of prte_grpcomm_fence_t + pmix_list_t fence_ops; + // track ongoing group operations - list of prte_grpcomm_group_t + pmix_list_t group_ops; +} prte_grpcomm_direct_component_t; + +PRTE_MODULE_EXPORT extern prte_grpcomm_direct_component_t prte_mca_grpcomm_direct_component; extern prte_grpcomm_base_module_t prte_grpcomm_direct_module; + +/* Define collective signatures so we don't need to + * track global collective IDs. 
We provide a unique + * signature struct for each collective type so that + * they can be customized for that collective without + * interfering with other collectives */ +typedef struct { + pmix_object_t super; + pmix_proc_t *signature; + size_t sz; +} prte_grpcomm_direct_fence_signature_t; +PRTE_MODULE_EXPORT PMIX_CLASS_DECLARATION(prte_grpcomm_direct_fence_signature_t); + +typedef struct { + pmix_object_t super; + pmix_group_operation_t op; + char *groupID; + bool assignID; + size_t ctxid; + bool ctxid_assigned; + pmix_proc_t *members; // initially supplied procs + size_t nmembers; + size_t bootstrap; + pmix_proc_t *addmembers; // procs supplied as add-members + size_t naddmembers; +} prte_grpcomm_direct_group_signature_t; +PRTE_MODULE_EXPORT PMIX_CLASS_DECLARATION(prte_grpcomm_direct_group_signature_t); + + +/* Internal component object for tracking ongoing + * allgather operations */ +typedef struct { + pmix_list_item_t super; + /* collective's signature */ + prte_grpcomm_direct_fence_signature_t *sig; + pmix_status_t status; + /* collection bucket */ + pmix_data_buffer_t bucket; + /* participating daemons */ + pmix_rank_t *dmns; + /** number of participating daemons */ + size_t ndmns; + /** my index in the dmns array */ + unsigned long my_rank; + /* number of buckets expected */ + size_t nexpected; + /* number reported in */ + size_t nreported; + /* controls values */ + int timeout; + /* callback function */ + pmix_modex_cbfunc_t cbfunc; + /* user-provided callback data */ + void *cbdata; +} prte_grpcomm_fence_t; +PMIX_CLASS_DECLARATION(prte_grpcomm_fence_t); + +/* Internal component object for tracking ongoing + * group operations */ +typedef struct { + pmix_list_item_t super; + /* collective's signature */ + prte_grpcomm_direct_group_signature_t *sig; + pmix_status_t status; + /* participating daemons */ + pmix_rank_t *dmns; + /** number of participating daemons */ + size_t ndmns; + /** my index in the dmns array */ + unsigned long my_rank; + /* number of buckets expected */ + size_t nexpected; + /* number reported in */ + size_t nreported; + /* controls values */ + bool assignID; + int timeout; + size_t memsize; + void *grpinfo; // info list of group info + void *endpts; // info list of endpts + /* callback function */ + pmix_info_cbfunc_t cbfunc; + /* user-provided callback data */ + void *cbdata; +} prte_grpcomm_group_t; +PMIX_CLASS_DECLARATION(prte_grpcomm_group_t); + +typedef struct { + pmix_object_t super; + prte_event_t ev; + prte_grpcomm_direct_fence_signature_t *sig; + pmix_data_buffer_t *buf; + pmix_proc_t *procs; + size_t nprocs; + pmix_info_t *info; + size_t ninfo; + char *data; + size_t ndata; + pmix_modex_cbfunc_t cbfunc; + void *cbdata; +} prte_pmix_fence_caddy_t; +PMIX_CLASS_DECLARATION(prte_pmix_fence_caddy_t); + + +/* xcast functions */ +PRTE_MODULE_EXPORT extern +int prte_grpcomm_direct_xcast(prte_rml_tag_t tag, + pmix_data_buffer_t *msg); + +PRTE_MODULE_EXPORT extern +void prte_grpcomm_direct_xcast_recv(int status, pmix_proc_t *sender, + pmix_data_buffer_t *buffer, + prte_rml_tag_t tg, void *cbdata); + +/* fence functions */ +PRTE_MODULE_EXPORT extern +int prte_grpcomm_direct_fence(const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t info[], size_t ninfo, char *data, + size_t ndata, pmix_modex_cbfunc_t cbfunc, void *cbdata); + +PRTE_MODULE_EXPORT extern +void prte_grpcomm_direct_fence_recv(int status, pmix_proc_t *sender, + pmix_data_buffer_t *buffer, + prte_rml_tag_t tag, void *cbdata); + +PRTE_MODULE_EXPORT extern +void 
prte_grpcomm_direct_fence_release(int status, pmix_proc_t *sender, + pmix_data_buffer_t *buffer, + prte_rml_tag_t tag, void *cbdata); + + +/* group functions */ +PRTE_MODULE_EXPORT extern +int prte_grpcomm_direct_group(pmix_group_operation_t op, char *grpid, + const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t directives[], size_t ndirs, + pmix_info_cbfunc_t cbfunc, void *cbdata); + +PRTE_MODULE_EXPORT extern +void prte_grpcomm_direct_grp_recv(int status, pmix_proc_t *sender, + pmix_data_buffer_t *buffer, + prte_rml_tag_t tag, void *cbdata); + + +PRTE_MODULE_EXPORT extern +void prte_grpcomm_direct_grp_release(int status, pmix_proc_t *sender, + pmix_data_buffer_t *buffer, + prte_rml_tag_t tag, void *cbdata); + END_C_DECLS #endif diff --git a/src/mca/grpcomm/direct/grpcomm_direct_component.c b/src/mca/grpcomm/direct/grpcomm_direct_component.c index e874287fe4..98dcef65e5 100644 --- a/src/mca/grpcomm/direct/grpcomm_direct_component.c +++ b/src/mca/grpcomm/direct/grpcomm_direct_component.c @@ -6,7 +6,7 @@ * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -25,59 +25,160 @@ #include "grpcomm_direct.h" -static int my_priority = 5; /* must be below "bad" module */ -static int direct_open(void); -static int direct_close(void); static int direct_query(pmix_mca_base_module_t **module, int *priority); -static int direct_register(void); /* * Struct of function pointers that need to be initialized */ -prte_grpcomm_base_component_t prte_mca_grpcomm_direct_component = { - PRTE_GRPCOMM_BASE_VERSION_3_0_0, +prte_grpcomm_direct_component_t prte_mca_grpcomm_direct_component = { + .super = { + PRTE_GRPCOMM_BASE_VERSION_4_0_0, - .pmix_mca_component_name = "direct", - PMIX_MCA_BASE_MAKE_VERSION(component, - PRTE_MAJOR_VERSION, - PRTE_MINOR_VERSION, - PMIX_RELEASE_VERSION), - .pmix_mca_open_component = direct_open, - .pmix_mca_close_component = direct_close, - .pmix_mca_query_component = direct_query, - .pmix_mca_register_component_params = direct_register, + .pmix_mca_component_name = "direct", + PMIX_MCA_BASE_MAKE_VERSION(component, + PRTE_MAJOR_VERSION, + PRTE_MINOR_VERSION, + PMIX_RELEASE_VERSION), + .pmix_mca_query_component = direct_query, + }, + .fence_ops = PMIX_LIST_STATIC_INIT, + .group_ops = PMIX_LIST_STATIC_INIT }; -static int direct_register(void) +static int direct_query(pmix_mca_base_module_t **module, int *priority) { - pmix_mca_base_component_t *c = &prte_mca_grpcomm_direct_component; - - /* make the priority adjustable so users can select - * direct for use by apps without affecting daemons - */ - my_priority = 85; - (void) pmix_mca_base_component_var_register(c, "priority", - "Priority of the grpcomm direct component", - PMIX_MCA_BASE_VAR_TYPE_INT, - &my_priority); + /* we are always available */ + *priority = 5; + *module = (pmix_mca_base_module_t *) &prte_grpcomm_direct_module; return PRTE_SUCCESS; } -/* Open the component */ -static int direct_open(void) +static void scon(prte_grpcomm_direct_fence_signature_t *p) { - return PRTE_SUCCESS; + p->signature = NULL; + p->sz = 0; +} +static void sdes(prte_grpcomm_direct_fence_signature_t *p) +{ + if (NULL != p->signature) { + free(p->signature); + } } +PMIX_CLASS_INSTANCE(prte_grpcomm_direct_fence_signature_t, + pmix_object_t, + 
scon, sdes); -static int direct_close +static void sgcon(prte_grpcomm_direct_group_signature_t *p) { - return PRTE_SUCCESS; + p->op = PMIX_GROUP_NONE; + p->groupID = NULL; + p->assignID = false; + p->ctxid = 0; + p->ctxid_assigned = false; + p->members = NULL; + p->nmembers = 0; + p->bootstrap = 0; + p->addmembers = NULL; + p->naddmembers = 0; +} +static void sgdes(prte_grpcomm_direct_group_signature_t *p) +{ + if (NULL != p->groupID) { + free(p->groupID); + } + if (NULL != p->members) { + free(p->members); + } + if (NULL != p->addmembers) { + free(p->addmembers); + } } +PMIX_CLASS_INSTANCE(prte_grpcomm_direct_group_signature_t, + pmix_object_t, + sgcon, sgdes); -static int direct_query(pmix_mca_base_module_t **module, int *priority) +static void ccon(prte_grpcomm_fence_t *p) { - /* we are always available */ - *priority = my_priority; - *module = (pmix_mca_base_module_t *) &prte_grpcomm_direct_module; - return PRTE_SUCCESS; + p->sig = NULL; + p->status = PMIX_SUCCESS; + PMIX_DATA_BUFFER_CONSTRUCT(&p->bucket); + p->dmns = NULL; + p->ndmns = 0; + p->nexpected = 0; + p->nreported = 0; + p->timeout = 0; + p->cbfunc = NULL; + p->cbdata = NULL; +} +static void cdes(prte_grpcomm_fence_t *p) +{ + if (NULL != p->sig) { + PMIX_RELEASE(p->sig); + } + PMIX_DATA_BUFFER_DESTRUCT(&p->bucket); + if (NULL != p->dmns) { + free(p->dmns); + } +} +PMIX_CLASS_INSTANCE(prte_grpcomm_fence_t, + pmix_list_item_t, + ccon, cdes); + + +static void gccon(prte_grpcomm_group_t *p) +{ + p->sig = NULL; + p->status = PMIX_SUCCESS; + p->dmns = NULL; + p->ndmns = 0; + p->nexpected = 0; + p->nreported = 0; + p->assignID = false; + p->timeout = 0; + p->memsize = 0; + p->grpinfo = PMIx_Info_list_start(); + p->endpts = PMIx_Info_list_start(); + p->cbfunc = NULL; + p->cbdata = NULL; +} +static void gcdes(prte_grpcomm_group_t *p) +{ + if (NULL != p->sig) { + PMIX_RELEASE(p->sig); + } + PMIx_Info_list_release(p->grpinfo); + PMIx_Info_list_release(p->endpts); + if (NULL != p->dmns) { + free(p->dmns); + } +} +PMIX_CLASS_INSTANCE(prte_grpcomm_group_t, + pmix_list_item_t, + gccon, gcdes); + + +static void mdcon(prte_pmix_fence_caddy_t *p) +{ + p->sig = NULL; + p->buf = NULL; + p->procs = NULL; + p->nprocs = 0; + p->info = NULL; + p->ninfo = 0; + p->data = NULL; + p->ndata = 0; + p->cbfunc = NULL; + p->cbdata = NULL; +} +static void mddes(prte_pmix_fence_caddy_t *p) +{ + if (NULL != p->sig) { + PMIX_RELEASE(p->sig); + } + if (NULL != p->buf) { + PMIX_DATA_BUFFER_RELEASE(p->buf); + } } +PMIX_CLASS_INSTANCE(prte_pmix_fence_caddy_t, + pmix_object_t, + mdcon, mddes);
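
For orientation before the two new collective files: the class instances above define how the per-operation trackers are built and torn down. A minimal sketch of the intended lifecycle, assuming the PRRTE/PMIx class machinery behaves as the instances register (PMIX_NEW/PMIX_RETAIN/PMIX_RELEASE); the helper name demo_track_fence is hypothetical and not part of the patch:

#include "src/class/pmix_list.h"
#include "grpcomm_direct.h"

static void demo_track_fence(prte_grpcomm_direct_fence_signature_t *sig)
{
    prte_grpcomm_fence_t *coll;

    /* PMIX_NEW runs the registered constructor (ccon), which zeroes
     * the counters and constructs the collection bucket */
    coll = PMIX_NEW(prte_grpcomm_fence_t);
    PMIX_RETAIN(sig);
    coll->sig = sig;

    /* ongoing fences now live on the component, not the base framework */
    pmix_list_append(&prte_mca_grpcomm_direct_component.fence_ops, &coll->super);

    /* ... the collective runs; at release time the tracker is removed
     * and PMIX_RELEASE runs the destructor (cdes), which releases the
     * signature and frees the daemon array ... */
    pmix_list_remove_item(&prte_mca_grpcomm_direct_component.fence_ops, &coll->super);
    PMIX_RELEASE(coll);
}
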
diff --git a/src/mca/grpcomm/direct/grpcomm_direct_fence.c b/src/mca/grpcomm/direct/grpcomm_direct_fence.c new file mode 100644 index 0000000000..894bd163ce --- /dev/null +++ b/src/mca/grpcomm/direct/grpcomm_direct_fence.c @@ -0,0 +1,670 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2007 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved + * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All + * rights reserved. + * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "prte_config.h" +#include "constants.h" +#include "types.h" + +#include <string.h> + +#include "src/class/pmix_list.h" +#include "src/pmix/pmix-internal.h" + +#include "src/prted/pmix/pmix_server_internal.h" +#include "src/mca/errmgr/errmgr.h" +#include "src/rml/rml.h" +#include "src/mca/rmaps/rmaps_types.h" +#include "src/mca/state/state.h" +#include "src/util/name_fns.h" +#include "src/util/nidmap.h" +#include "src/util/proc_info.h" +#include "src/util/pmix_show_help.h" + +#include "grpcomm_direct.h" +#include "src/mca/grpcomm/base/base.h" + +/* internal functions */ +static void fence(int sd, short args, void *cbdata); +static prte_grpcomm_fence_t* get_tracker(prte_grpcomm_direct_fence_signature_t *sig, bool create); +static int create_dmns(prte_grpcomm_direct_fence_signature_t *sig, + pmix_rank_t **dmns, size_t *ndmns); +static int fence_sig_pack(pmix_data_buffer_t *bkt, + prte_grpcomm_direct_fence_signature_t *sig); +static int fence_sig_unpack(pmix_data_buffer_t *buffer, + prte_grpcomm_direct_fence_signature_t **sig); + +int prte_grpcomm_direct_fence(const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t info[], size_t ninfo, char *data, + size_t ndata, pmix_modex_cbfunc_t cbfunc, void *cbdata) +{ + prte_pmix_fence_caddy_t *cd; + + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:fence", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + + // bozo check + if (NULL == procs) { + return PRTE_ERR_NOT_SUPPORTED; + } + + cd = PMIX_NEW(prte_pmix_fence_caddy_t); + cd->procs = (pmix_proc_t*)procs; + cd->nprocs = nprocs; + cd->info = (pmix_info_t*)info; + cd->ninfo = ninfo; + cd->data = data; + cd->ndata = ndata; + cd->cbfunc = cbfunc; + cd->cbdata = cbdata; + + /* must push this into the event library to ensure we can + * access framework-global data safely */ + prte_event_set(prte_event_base, &cd->ev, -1, PRTE_EV_WRITE, fence, cd); + PMIX_POST_OBJECT(cd); + prte_event_active(&cd->ev, PRTE_EV_WRITE, 1); + return PRTE_SUCCESS; +}
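
The entry point above only parks the caller's arguments on a caddy and shifts into the event base; the actual work happens in the static fence() handler that follows. A caller-side sketch of driving this entry point, with hypothetical names (demo_fence_cb, demo_run_fence, the "demo-nspace" string); it is illustrative only:

static void demo_fence_cb(pmix_status_t status, const char *data, size_t ndata,
                          void *cbdata, pmix_release_cbfunc_t release,
                          void *relcbdata)
{
    PRTE_HIDE_UNUSED_PARAMS(status, data, ndata, cbdata);
    /* consume the aggregated payload here, then let the module free it */
    if (NULL != release) {
        release(relcbdata);
    }
}

static int demo_run_fence(void)
{
    pmix_proc_t procs[1];

    /* a wildcard rank fences the entire job - "demo-nspace" is made up */
    PMIX_LOAD_PROCID(&procs[0], "demo-nspace", PMIX_RANK_WILDCARD);
    return prte_grpcomm_direct_fence(procs, 1, NULL, 0, NULL, 0,
                                     demo_fence_cb, NULL);
}
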
+ +static void fence(int sd, short args, void *cbdata) +{ + prte_pmix_fence_caddy_t *cd = (prte_pmix_fence_caddy_t *) cbdata; + prte_grpcomm_direct_fence_signature_t sig; + prte_grpcomm_fence_t *coll; + int rc; + pmix_data_buffer_t *relay, bkt; + pmix_byte_object_t bo; + PRTE_HIDE_UNUSED_PARAMS(sd, args); + + PMIX_ACQUIRE_OBJECT(cd); + + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct: fence", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + + /* compute the signature of this collective */ + PMIX_CONSTRUCT(&sig, prte_grpcomm_direct_fence_signature_t); + sig.sz = cd->nprocs; + sig.signature = (pmix_proc_t *) malloc(sig.sz * sizeof(pmix_proc_t)); + memcpy(sig.signature, cd->procs, sig.sz * sizeof(pmix_proc_t)); + + /* retrieve an existing tracker, create it if not + * already found. The fence module is responsible + * for releasing it upon completion of the collective */ + coll = get_tracker(&sig, true); + if (NULL == coll) { + PMIX_DESTRUCT(&sig); + PMIX_RELEASE(cd); + return; + } + coll->cbfunc = cd->cbfunc; + coll->cbdata = cd->cbdata; + PMIX_DESTRUCT(&sig); + + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct: fence", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + + // execute the fence operation + PMIX_DATA_BUFFER_CREATE(relay); + /* pack the signature */ + rc = fence_sig_pack(relay, coll->sig); + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(relay); + PMIX_RELEASE(cd); + return; + } + + // pack the info structs + rc = PMIx_Data_pack(NULL, relay, &cd->ninfo, 1, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_DATA_BUFFER_RELEASE(relay); + PMIX_RELEASE(cd); + return; + } + if (0 < cd->ninfo) { + rc = PMIx_Data_pack(NULL, relay, cd->info, cd->ninfo, PMIX_INFO); + if (PMIX_SUCCESS != rc) { + PMIX_DATA_BUFFER_RELEASE(relay); + PMIX_RELEASE(cd); + return; + } + } + + /* pass along the payload */ + PMIX_DATA_BUFFER_CONSTRUCT(&bkt); + bo.bytes = cd->data; + bo.size = cd->ndata; + PMIx_Data_embed(&bkt, &bo); + rc = PMIx_Data_copy_payload(relay, &bkt); + PMIX_DATA_BUFFER_DESTRUCT(&bkt); + if (PMIX_SUCCESS != rc) { + PMIX_DATA_BUFFER_RELEASE(relay); + PMIX_RELEASE(cd); + return; + } + + /* send this to ourselves for processing */ + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:fence sending to ourself", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + + PRTE_RML_SEND(rc, PRTE_PROC_MY_NAME->rank, relay, + PRTE_RML_TAG_FENCE); + PMIX_RELEASE(cd); + return; +} + +void prte_grpcomm_direct_fence_recv(int status, pmix_proc_t *sender, + pmix_data_buffer_t *buffer, + prte_rml_tag_t tag, void *cbdata) +{ + int32_t cnt; + int rc, timeout; + size_t n, ninfo; + pmix_status_t st; + pmix_info_t *info = NULL; + prte_grpcomm_direct_fence_signature_t *sig = NULL; + pmix_data_buffer_t *reply; + prte_grpcomm_fence_t *coll; + PRTE_HIDE_UNUSED_PARAMS(status, tag, cbdata); + + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct fence recvd from %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + PRTE_NAME_PRINT(sender))); + + /* unpack the signature */ + rc = fence_sig_unpack(buffer, &sig); + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); + return; + } + + /* check for the tracker and create it if not found */ + if (NULL == (coll = get_tracker(sig, true))) { + PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); + PMIX_RELEASE(sig); + return; + } + PMIX_RELEASE(sig); + + // unpack the info structs + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &ninfo, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return; + } + if (0 < ninfo) { + PMIX_INFO_CREATE(info, ninfo); + cnt = ninfo; + rc = PMIx_Data_unpack(NULL, buffer, info, &cnt, PMIX_INFO); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_INFO_FREE(info, ninfo); + return; + } + } + + /* cycle thru the info to look for keys we support */ + for (n=0; n < ninfo; n++) { + if (PMIX_CHECK_KEY(&info[n], PMIX_TIMEOUT)) { + PMIX_VALUE_GET_NUMBER(rc, &info[n].value, timeout, int); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_INFO_FREE(info, ninfo); + return; + } + if (coll->timeout < timeout) { + coll->timeout = timeout; + } + /* update the info with the collected value */ + info[n].value.type = PMIX_INT; + info[n].value.data.integer = coll->timeout; + + } else if (PMIX_CHECK_KEY(&info[n], PMIX_LOCAL_COLLECTIVE_STATUS)) { + 
PMIX_VALUE_GET_NUMBER(rc, &info[n].value, st, pmix_status_t); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_INFO_FREE(info, ninfo); + return; + } + if (PMIX_SUCCESS != st && + PMIX_SUCCESS == coll->status) { + coll->status = st; + } + /* update the info with the collected value */ + info[n].value.type = PMIX_STATUS; + info[n].value.data.status = coll->status; + } + } + + /* increment nprocs reported for collective */ + coll->nreported++; + + // transfer any data + rc = PMIx_Data_copy_payload(&coll->bucket, buffer); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_INFO_FREE(info, ninfo); + return; + } + + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct fence recv nexpected %d nrep %d", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (int) coll->nexpected, + (int) coll->nreported)); + + /* see if everyone has reported */ + if (coll->nreported == coll->nexpected) { + if (PRTE_PROC_IS_MASTER) { + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct fence HNP reports complete", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + /* the allgather is complete - send the xcast */ + PMIX_DATA_BUFFER_CREATE(reply); + + /* pack the signature */ + rc = fence_sig_pack(reply, coll->sig); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(reply); + PMIX_INFO_FREE(info, ninfo); + return; + } + /* pack the status */ + rc = PMIx_Data_pack(NULL, reply, &coll->status, 1, PMIX_INT32); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(reply); + PMIX_INFO_FREE(info, ninfo); + return; + } + + /* transfer the collected bucket */ + rc = PMIx_Data_copy_payload(reply, &coll->bucket); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(reply); + PMIX_INFO_FREE(info, ninfo); + return; + } + + /* send the release via xcast */ + (void) prte_grpcomm.xcast(PRTE_RML_TAG_FENCE_RELEASE, reply); + } else { + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct fence rollup complete - sending to %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + PRTE_NAME_PRINT(PRTE_PROC_MY_PARENT))); + PMIX_DATA_BUFFER_CREATE(reply); + /* pack the signature */ + rc = fence_sig_pack(reply, coll->sig); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(reply); + PMIX_INFO_FREE(info, ninfo); + return; + } + + // pack the info structs + rc = PMIx_Data_pack(NULL, reply, &ninfo, 1, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_DATA_BUFFER_RELEASE(reply); + PMIX_INFO_FREE(info, ninfo); + return; + } + if (0 < ninfo) { + rc = PMIx_Data_pack(NULL, reply, info, ninfo, PMIX_INFO); + if (PMIX_SUCCESS != rc) { + PMIX_DATA_BUFFER_RELEASE(reply); + PMIX_INFO_FREE(info, ninfo); + return; + } + } + PMIX_INFO_FREE(info, ninfo); + + /* transfer the collected bucket */ + rc = PMIx_Data_copy_payload(reply, &coll->bucket); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(reply); + return; + } + /* send the info to our parent */ + PRTE_RML_SEND(rc, PRTE_PROC_MY_PARENT->rank, reply, + PRTE_RML_TAG_FENCE); + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(reply); + return; + } + } + } +} + +static void relcb(void *cbdata) +{ + uint8_t *data = (uint8_t *) cbdata; + + if (NULL != data) { + free(data); + } +} + +void prte_grpcomm_direct_fence_release(int status, pmix_proc_t *sender, + pmix_data_buffer_t *buffer, + prte_rml_tag_t tag, void *cbdata) +{ + int32_t cnt; + int rc, ret; + 
prte_grpcomm_direct_fence_signature_t *sig = NULL; + prte_grpcomm_fence_t *coll; + pmix_byte_object_t bo; + PRTE_HIDE_UNUSED_PARAMS(status, sender, tag, cbdata); + + PMIX_OUTPUT_VERBOSE((5, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct: fence release called with %d bytes", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (int) buffer->bytes_used)); + + /* unpack the signature */ + rc = fence_sig_unpack(buffer, &sig); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return; + } + + /* unpack the return status */ + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &ret, &cnt, PMIX_INT32); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(sig); + return; + } + + /* check for the tracker - it is not an error if not + * found as that just means we are not involved + * in the collective */ + if (NULL == (coll = get_tracker(sig, false))) { + PMIX_RELEASE(sig); + return; + } + + /* unload the buffer */ + PMIX_BYTE_OBJECT_CONSTRUCT(&bo); + rc = PMIx_Data_unload(buffer, &bo); + if (PMIX_SUCCESS != rc) { + ret = rc; + } + + /* execute the callback */ + if (NULL != coll->cbfunc) { + coll->cbfunc(ret, bo.bytes, bo.size, coll->cbdata, relcb, bo.bytes); + } + pmix_list_remove_item(&prte_mca_grpcomm_direct_component.fence_ops, &coll->super); + PMIX_RELEASE(coll); + PMIX_RELEASE(sig); +} + +static prte_grpcomm_fence_t* get_tracker(prte_grpcomm_direct_fence_signature_t *sig, bool create) +{ + prte_grpcomm_fence_t *coll; + int rc; + size_t n; + + /* search the existing tracker list to see if this already exists */ + PMIX_LIST_FOREACH(coll, &prte_mca_grpcomm_direct_component.fence_ops, prte_grpcomm_fence_t) { + if (sig->sz == coll->sig->sz) { + // must match proc signature + if (0 == memcmp(sig->signature, coll->sig->signature, sig->sz * sizeof(pmix_proc_t))) { + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:base:returning existing collective", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + return coll; + } + } + } + /* if we get here, then this is a new collective - so create + * the tracker for it */ + if (!create) { + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:base: not creating new coll", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + + return NULL; + } + coll = PMIX_NEW(prte_grpcomm_fence_t); + // we have to know the participating procs + coll->sig = PMIX_NEW(prte_grpcomm_direct_fence_signature_t); + coll->sig->sz = sig->sz; + coll->sig->signature = (pmix_proc_t *) malloc(coll->sig->sz * sizeof(pmix_proc_t)); + memcpy(coll->sig->signature, sig->signature, coll->sig->sz * sizeof(pmix_proc_t)); + pmix_list_append(&prte_mca_grpcomm_direct_component.fence_ops, &coll->super); + + /* now get the daemons involved */ + if (PRTE_SUCCESS != (rc = create_dmns(sig, &coll->dmns, &coll->ndmns))) { + PRTE_ERROR_LOG(rc); + pmix_list_remove_item(&prte_mca_grpcomm_direct_component.fence_ops, &coll->super); + PMIX_RELEASE(coll); + return NULL; + } + + /* count the number of contributions we should get */ + coll->nexpected = prte_rml_get_num_contributors(coll->dmns, coll->ndmns); + + /* see if I am in the array of participants - note that I may + * be in the rollup tree even though I'm not participating + * in the collective itself */ + for (n = 0; n < coll->ndmns; n++) { + if (coll->dmns[n] == PRTE_PROC_MY_NAME->rank) { + coll->nexpected++; + break; + } + } + + return coll; +} + +static int create_dmns(prte_grpcomm_direct_fence_signature_t *sig, + pmix_rank_t **dmns, size_t *ndmns) +{ + size_t n; + prte_job_t *jdata; + prte_proc_t *proc; + prte_node_t *node; + prte_job_map_t *map; + int i; + pmix_list_t ds; + prte_namelist_t *nm; + 
pmix_rank_t vpid; + bool found; + size_t nds = 0; + pmix_rank_t *dns = NULL; + int rc = PRTE_SUCCESS; + + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:fence:create_dmns called with %s signature size %" PRIsize_t "", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + (NULL == sig->signature) ? "NULL" : "NON-NULL", sig->sz)); + + /* if the target jobid is our own, + * then all daemons are participating */ + if (PMIX_CHECK_NSPACE(PRTE_PROC_MY_NAME->nspace, sig->signature[0].nspace)) { + *ndmns = prte_process_info.num_daemons; + *dmns = NULL; + return PRTE_SUCCESS; + } + + PMIX_CONSTRUCT(&ds, pmix_list_t); + for (n = 0; n < sig->sz; n++) { + if (NULL == (jdata = prte_get_job_data_object(sig->signature[n].nspace))) { + PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); + rc = PRTE_ERR_NOT_FOUND; + break; + } + map = (prte_job_map_t*)jdata->map; + if (NULL == map || 0 == map->num_nodes) { + /* we haven't generated a job map yet - if we are the HNP, + * then we should only involve ourselves. Otherwise, we have + * no choice but to abort to avoid hangs */ + if (PRTE_PROC_IS_MASTER) { + rc = PRTE_SUCCESS; + break; + } + PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); + rc = PRTE_ERR_NOT_FOUND; + break; + } + if (PMIX_RANK_WILDCARD == sig->signature[n].rank) { + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:fence::create_dmns called for all procs in job %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + PRTE_JOBID_PRINT(sig->signature[0].nspace))); + /* all daemons hosting this jobid are participating */ + for (i = 0; i < map->nodes->size; i++) { + if (NULL == (node = pmix_pointer_array_get_item(map->nodes, i))) { + continue; + } + if (NULL == node->daemon) { + PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); + rc = PRTE_ERR_NOT_FOUND; + goto done; + } + found = false; + PMIX_LIST_FOREACH(nm, &ds, prte_namelist_t) + { + if (nm->name.rank == node->daemon->name.rank) { + found = true; + break; + } + } + if (!found) { + PMIX_OUTPUT_VERBOSE((5, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:fence::create_dmns adding daemon %s to list", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + PRTE_NAME_PRINT(&node->daemon->name))); + nm = PMIX_NEW(prte_namelist_t); + PMIX_LOAD_PROCID(&nm->name, PRTE_PROC_MY_NAME->nspace, node->daemon->name.rank); + pmix_list_append(&ds, &nm->super); + } + } + } else { + /* lookup the daemon for this proc and add it to the list */ + PMIX_OUTPUT_VERBOSE((5, prte_grpcomm_base_framework.framework_output, + "%s sign: GETTING PROC OBJECT FOR %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + PRTE_NAME_PRINT(&sig->signature[n]))); + proc = (prte_proc_t *) pmix_pointer_array_get_item(jdata->procs, + sig->signature[n].rank); + if (NULL == proc) { + PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); + rc = PRTE_ERR_NOT_FOUND; + goto done; + } + if (NULL == proc->node || NULL == proc->node->daemon) { + PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); + rc = PRTE_ERR_NOT_FOUND; + goto done; + } + vpid = proc->node->daemon->name.rank; + found = false; + PMIX_LIST_FOREACH(nm, &ds, prte_namelist_t) + { + if (nm->name.rank == vpid) { + found = true; + break; + } + } + if (!found) { + nm = PMIX_NEW(prte_namelist_t); + PMIX_LOAD_PROCID(&nm->name, PRTE_PROC_MY_NAME->nspace, vpid); + pmix_list_append(&ds, &nm->super); + } + } + } + +done: + if (0 < pmix_list_get_size(&ds)) { + dns = (pmix_rank_t *) malloc(pmix_list_get_size(&ds) * sizeof(pmix_rank_t)); + nds = 0; + while (NULL != (nm = (prte_namelist_t *) pmix_list_remove_first(&ds))) { + PMIX_OUTPUT_VERBOSE((5, 
prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:fence::create_dmns adding daemon %s to array", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&nm->name))); + dns[nds++] = nm->name.rank; + PMIX_RELEASE(nm); + } + } + PMIX_LIST_DESTRUCT(&ds); + *dmns = dns; + *ndmns = nds; + return rc; +} + +static int fence_sig_pack(pmix_data_buffer_t *bkt, + prte_grpcomm_direct_fence_signature_t *sig) +{ + pmix_status_t rc; + + // always send the participating procs + rc = PMIx_Data_pack(NULL, bkt, &sig->sz, 1, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return prte_pmix_convert_status(rc); + } + if (0 < sig->sz) { + rc = PMIx_Data_pack(NULL, bkt, sig->signature, sig->sz, PMIX_PROC); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return prte_pmix_convert_status(rc); + } + } + + return PRTE_SUCCESS; +} + +static int fence_sig_unpack(pmix_data_buffer_t *buffer, + prte_grpcomm_direct_fence_signature_t **sig) +{ + pmix_status_t rc; + int32_t cnt; + prte_grpcomm_direct_fence_signature_t *s; + + s = PMIX_NEW(prte_grpcomm_direct_fence_signature_t); + + // unpack the participating procs + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &s->sz, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(s); + return prte_pmix_convert_status(rc); + } + if (0 < s->sz) { + PMIX_PROC_CREATE(s->signature, s->sz); + cnt = s->sz; + rc = PMIx_Data_unpack(NULL, buffer, s->signature, &cnt, PMIX_PROC); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(s); + return prte_pmix_convert_status(rc); + } + } + + *sig = s; + return PRTE_SUCCESS; +}
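
The two static helpers above give the fence signature a stable wire format. A round-trip sketch, under the assumption that it lives in the same translation unit (the helpers are file-static); demo_sig_roundtrip is a hypothetical name and not part of the patch:

static int demo_sig_roundtrip(void)
{
    prte_grpcomm_direct_fence_signature_t in, *out = NULL;
    pmix_data_buffer_t buf;
    int rc;

    PMIX_CONSTRUCT(&in, prte_grpcomm_direct_fence_signature_t);
    in.sz = 1;
    PMIX_PROC_CREATE(in.signature, in.sz);
    PMIX_LOAD_PROCID(&in.signature[0], "demo-nspace", PMIX_RANK_WILDCARD);

    PMIX_DATA_BUFFER_CONSTRUCT(&buf);
    rc = fence_sig_pack(&buf, &in);
    if (PRTE_SUCCESS == rc) {
        /* unpack allocates a fresh signature object */
        rc = fence_sig_unpack(&buf, &out);
    }
    PMIX_DATA_BUFFER_DESTRUCT(&buf);
    PMIX_DESTRUCT(&in);
    if (NULL != out) {
        PMIX_RELEASE(out);
    }
    return rc;
}
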
diff --git a/src/mca/grpcomm/direct/grpcomm_direct_group.c b/src/mca/grpcomm/direct/grpcomm_direct_group.c new file mode 100644 index 0000000000..960f69939c --- /dev/null +++ b/src/mca/grpcomm/direct/grpcomm_direct_group.c @@ -0,0 +1,1399 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2007 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved + * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All + * rights reserved. + * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "prte_config.h" +#include "constants.h" +#include "types.h" + +#include <string.h> + +#include "src/class/pmix_list.h" +#include "src/pmix/pmix-internal.h" + +#include "src/prted/pmix/pmix_server_internal.h" +#include "src/mca/errmgr/errmgr.h" +#include "src/mca/rmaps/rmaps_types.h" +#include "src/rml/rml.h" +#include "src/mca/state/state.h" +#include "src/util/name_fns.h" +#include "src/util/nidmap.h" +#include "src/util/proc_info.h" +#include "src/util/pmix_show_help.h" + +#include "grpcomm_direct.h" +#include "src/mca/grpcomm/base/base.h" + +static void group(int sd, short args, void *cbdata); + +static prte_grpcomm_group_t *get_tracker(prte_grpcomm_direct_group_signature_t *sig, bool create); + +static int create_dmns(prte_grpcomm_direct_group_signature_t *sig, + pmix_rank_t **dmns, size_t *ndmns); + +static int pack_signature(pmix_data_buffer_t *buf, + prte_grpcomm_direct_group_signature_t *sig); + +static int unpack_signature(pmix_data_buffer_t *buf, + prte_grpcomm_direct_group_signature_t **sig); + +int prte_grpcomm_direct_group(pmix_group_operation_t op, char *grpid, + const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t directives[], size_t ndirs, + pmix_info_cbfunc_t cbfunc, void *cbdata) +{ + prte_pmix_grp_caddy_t *cd; + + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:group with %lu procs", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), nprocs)); + + cd = PMIX_NEW(prte_pmix_grp_caddy_t); + cd->op = op; + cd->grpid = strdup(grpid); + cd->procs = procs; + cd->nprocs = nprocs; + cd->directives = directives; + cd->ndirs = ndirs; + cd->cbfunc = cbfunc; + cd->cbdata = cbdata; + + /* must push this into the event library to ensure we can + * access framework-global data safely */ + prte_event_set(prte_event_base, &cd->ev, -1, PRTE_EV_WRITE, group, cd); + PMIX_POST_OBJECT(cd); + prte_event_active(&cd->ev, PRTE_EV_WRITE, 1); + return PRTE_SUCCESS; +}
+ +static void group(int sd, short args, void *cbdata) +{ + prte_pmix_grp_caddy_t *cd = (prte_pmix_grp_caddy_t*)cbdata; + prte_grpcomm_direct_group_signature_t sig; + prte_grpcomm_group_t *coll; + size_t i; + pmix_data_buffer_t *relay; + pmix_status_t rc, st = PMIX_SUCCESS; + int timeout = 0; + void *endpts, *grpinfo; + pmix_data_array_t darray; + pmix_info_t *info; + size_t ninfo; + PRTE_HIDE_UNUSED_PARAMS(sd, args); + + /* compute the signature of this collective */ + PMIX_CONSTRUCT(&sig, prte_grpcomm_direct_group_signature_t); + sig.groupID = strdup(cd->grpid); + if (NULL != cd->procs) { + sig.nmembers = cd->nprocs; + PMIX_PROC_CREATE(sig.members, sig.nmembers); + memcpy(sig.members, cd->procs, cd->nprocs * sizeof(pmix_proc_t)); + } + sig.op = cd->op; + + /* create a tracker for this operation */ + if (NULL == (coll = get_tracker(&sig, true))) { + PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); + PMIX_RELEASE(cd); + PMIX_DESTRUCT(&sig); + return; + } + coll->cbfunc = cd->cbfunc; + coll->cbdata = cd->cbdata; + + // setup to track endpts and grpinfo + endpts = PMIx_Info_list_start(); + grpinfo = PMIx_Info_list_start(); + + /* check the directives */ + for (i = 0; i < cd->ndirs; i++) { + /* see if they want a context id assigned */ + if (PMIX_CHECK_KEY(&cd->directives[i], PMIX_GROUP_ASSIGN_CONTEXT_ID)) { + sig.assignID = PMIX_INFO_TRUE(&cd->directives[i]); + +#ifdef PMIX_GROUP_BOOTSTRAP + } else if (PMIX_CHECK_KEY(&cd->directives[i], PMIX_GROUP_BOOTSTRAP)) { + PMIX_VALUE_GET_NUMBER(rc, &cd->directives[i].value, sig.bootstrap, size_t); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DESTRUCT(&sig); + goto error; + } +#endif + + } else if (PMIX_CHECK_KEY(&cd->directives[i], PMIX_LOCAL_COLLECTIVE_STATUS)) { + PMIX_VALUE_GET_NUMBER(rc, &cd->directives[i].value, st, pmix_status_t); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DESTRUCT(&sig); + goto error; + } + + } else if (PMIX_CHECK_KEY(&cd->directives[i], PMIX_TIMEOUT)) { + PMIX_VALUE_GET_NUMBER(rc, &cd->directives[i].value, timeout, int); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DESTRUCT(&sig); + goto error; + } + + } else if (PMIX_CHECK_KEY(&cd->directives[i], PMIX_GROUP_ADD_MEMBERS)) { + // there is only one of these as it is aggregated by the + // PMIx server library + sig.addmembers = (pmix_proc_t*)cd->directives[i].value.data.darray->array; + sig.naddmembers = cd->directives[i].value.data.darray->size; + + } else if (PMIX_CHECK_KEY(&cd->directives[i], PMIX_GROUP_INFO)) { + rc = PMIx_Info_list_xfer(grpinfo, &cd->directives[i]); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + } + + } else if (PMIX_CHECK_KEY(&cd->directives[i], PMIX_PROC_DATA)) { + rc = PMIx_Info_list_xfer(endpts, &cd->directives[i]); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + } + } + } + + // create the relay buffer + PMIX_DATA_BUFFER_CREATE(relay); + + /* pack the signature */ + rc = pack_signature(relay, &sig); + // protect the inbound directives + sig.addmembers = NULL; + sig.naddmembers = 0; + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(relay); + PMIX_DESTRUCT(&sig); + goto error; + } + + // pack the local collective status + rc = PMIx_Data_pack(NULL, relay, &st, 1, PMIX_STATUS); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(relay); + PMIX_DESTRUCT(&sig); + goto error; + } + + // pack any timeout directive + rc = PMIx_Data_pack(NULL, relay, &timeout, 1, PMIX_INT); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(relay); + PMIX_DESTRUCT(&sig); + goto error; + } + + if (PMIX_GROUP_CONSTRUCT == sig.op) { + // pack any group info + PMIx_Info_list_convert(grpinfo, &darray); + info = (pmix_info_t*)darray.array; + ninfo = darray.size; + PMIx_Data_pack(NULL, relay, &ninfo, 1, PMIX_SIZE); + if (0 < ninfo) { + PMIx_Data_pack(NULL, relay, info, ninfo, PMIX_INFO); + } + PMIX_DATA_ARRAY_DESTRUCT(&darray); + + // pack any endpts + PMIx_Info_list_convert(endpts, &darray); + info = (pmix_info_t*)darray.array; + ninfo = darray.size; + PMIx_Data_pack(NULL, relay, &ninfo, 1, PMIX_SIZE); + if (0 < ninfo) { + PMIx_Data_pack(NULL, relay, info, ninfo, PMIX_INFO); + } + PMIX_DATA_ARRAY_DESTRUCT(&darray); + } + PMIx_Info_list_release(grpinfo); + PMIx_Info_list_release(endpts); + + /* if this is a bootstrap operation, send it directly to the HNP */ + if (0 < sig.bootstrap || NULL == sig.members) { + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:grp bootstrap sending to HNP", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + + PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP->rank, relay, + PRTE_RML_TAG_GROUP); + if (PRTE_SUCCESS != rc) { + PMIX_RELEASE(relay); + rc = prte_pmix_convert_rc(rc); + PMIX_DESTRUCT(&sig); + goto error; + } + PMIX_DESTRUCT(&sig); + return; + } + PMIX_DESTRUCT(&sig); + + /* send this to ourselves for processing */ + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:grp_construct sending to ourself", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + + PRTE_RML_SEND(rc, PRTE_PROC_MY_NAME->rank, relay, + PRTE_RML_TAG_GROUP); + if (PRTE_SUCCESS != rc) { + PMIX_RELEASE(relay); + rc = prte_pmix_convert_rc(rc); + goto error; + } + return; + +error: + if (NULL != cd->cbfunc) { + cd->cbfunc(rc, NULL, 0, cd->cbdata, NULL, NULL); + } + PMIX_RELEASE(cd); +}
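
With group() complete, the receive side follows. For reference, a caller-side sketch of driving a construct through the declared entry point; the callback and the "demo-grp"/"demo-nspace" values are hypothetical, illustrative only:

static void demo_grp_complete(pmix_status_t status, pmix_info_t *info,
                              size_t ninfo, void *cbdata,
                              pmix_release_cbfunc_t release, void *relcbdata)
{
    PRTE_HIDE_UNUSED_PARAMS(status, info, ninfo, cbdata);
    /* inspect the returned membership/context ID here, then release */
    if (NULL != release) {
        release(relcbdata);
    }
}

static int demo_group_construct(void)
{
    pmix_proc_t procs[2];
    pmix_info_t directives[1];

    PMIX_LOAD_PROCID(&procs[0], "demo-nspace", 0);
    PMIX_LOAD_PROCID(&procs[1], "demo-nspace", 1);
    /* ask the DVM controller to assign a context ID during release */
    PMIX_INFO_LOAD(&directives[0], PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL);

    return prte_grpcomm_direct_group(PMIX_GROUP_CONSTRUCT, "demo-grp",
                                     procs, 2, directives, 1,
                                     demo_grp_complete, NULL);
}
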
+ +void prte_grpcomm_direct_grp_recv(int status, pmix_proc_t *sender, + pmix_data_buffer_t *buffer, + prte_rml_tag_t tag, void *cbdata) +{ + int32_t cnt; + int rc, timeout; + size_t m, n, ninfo, nfinal = 0, nendpts, ngrpinfo; + pmix_proc_t *finalmembership = NULL; + bool found; + pmix_list_t nmlist; + prte_namelist_t *nm; + pmix_data_array_t darray; + pmix_status_t st; + pmix_info_t *info = NULL, *endpts = NULL, *grpinfo = NULL; + prte_grpcomm_direct_group_signature_t *sig = NULL; + pmix_data_buffer_t *reply; + prte_grpcomm_group_t *coll; + PRTE_HIDE_UNUSED_PARAMS(status, tag, cbdata); + + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct grp_construct recvd from %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(sender))); + + /* unpack the signature */ + rc = unpack_signature(buffer, &sig); + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); + return; + } + + /* check for the tracker and create it if not found */ + if (NULL == (coll = get_tracker(sig, true))) { + PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); + PMIX_RELEASE(sig); + return; + } + + // unpack the local collective status + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &st, &cnt, PMIX_STATUS); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(sig); + return; + } + if (PMIX_SUCCESS != st && + PMIX_SUCCESS == coll->status) { + coll->status = st; + } + + // unpack any timeout directive + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &timeout, &cnt, PMIX_INT); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(sig); + return; + } + if (coll->timeout < timeout) { + coll->timeout = timeout; + } + + + if (PMIX_GROUP_CONSTRUCT == sig->op) { + /* unpack the number of group infos */ + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &ngrpinfo, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(sig); + return; + } + if (0 < ngrpinfo) { + PMIX_INFO_CREATE(grpinfo, ngrpinfo); + cnt = ngrpinfo; + rc = PMIx_Data_unpack(NULL, buffer, grpinfo, &cnt, PMIX_INFO); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_INFO_FREE(grpinfo, ngrpinfo); + PMIX_RELEASE(sig); + return; + } + for (n=0; n < ngrpinfo; n++) { + rc = PMIx_Info_list_xfer(coll->grpinfo, &grpinfo[n]); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + } + } + PMIX_INFO_FREE(grpinfo, ngrpinfo); + } + + /* unpack the number of endpoints */ + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &nendpts, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_INFO_FREE(grpinfo, ngrpinfo); + PMIX_RELEASE(sig); + return; + } + if (0 < nendpts) { + PMIX_INFO_CREATE(endpts, nendpts); + cnt = nendpts; + rc = PMIx_Data_unpack(NULL, buffer, endpts, &cnt, PMIX_INFO); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_INFO_FREE(grpinfo, ngrpinfo); + PMIX_INFO_FREE(endpts, nendpts); + PMIX_RELEASE(sig); + return; + } + for (n=0; n < nendpts; n++) { + rc = PMIx_Info_list_xfer(coll->endpts, &endpts[n]); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + } + } + PMIX_INFO_FREE(endpts, nendpts); + } + } + + /* increment nprocs reported for collective */ + coll->nreported++; + + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct group recv nexpected %d nrep %d", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (int) coll->nexpected, + (int) coll->nreported)); + + /* see 
if everyone has reported */ + if (coll->nreported == coll->nexpected) { + + if (PRTE_PROC_IS_MASTER) { + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct group HNP reports complete", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + + /* the allgather is complete - send the xcast */ + if (PMIX_GROUP_CONSTRUCT == sig->op) { + /* if we were asked to provide a context id, do so */ + if (coll->sig->assignID) { + coll->sig->ctxid = prte_grpcomm_base.context_id; + --prte_grpcomm_base.context_id; + coll->sig->ctxid_assigned = true; + } + + // construct the final membership + PMIX_CONSTRUCT(&nmlist, pmix_list_t); + // sadly, an exhaustive search + for (m=0; m < coll->sig->nmembers; m++) { + found = false; + PMIX_LIST_FOREACH(nm, &nmlist, prte_namelist_t) { + if (PMIX_CHECK_PROCID(&coll->sig->members[m], &nm->name)) { + // if the new one is rank=WILDCARD, then ensure + // we keep it as wildcard + if (PMIX_RANK_WILDCARD == coll->sig->members[m].rank) { + nm->name.rank = PMIX_RANK_WILDCARD; + } + found = true; + break; + } + } + if (!found) { + nm = PMIX_NEW(prte_namelist_t); + memcpy(&nm->name, &coll->sig->members[m], sizeof(pmix_proc_t)); + pmix_list_append(&nmlist, &nm->super); + } + } + // now check any added members + for (m=0; m < coll->sig->naddmembers; m++) { + found = false; + PMIX_LIST_FOREACH(nm, &nmlist, prte_namelist_t) { + if (PMIX_CHECK_PROCID(&coll->sig->addmembers[m], &nm->name)) { + // if the new one is rank=WILDCARD, then ensure + // we keep it as wildcard + if (PMIX_RANK_WILDCARD == coll->sig->addmembers[m].rank) { + nm->name.rank = PMIX_RANK_WILDCARD; + } + found = true; + break; + } + } + if (!found) { + nm = PMIX_NEW(prte_namelist_t); + memcpy(&nm->name, &coll->sig->addmembers[m], sizeof(pmix_proc_t)); + pmix_list_append(&nmlist, &nm->super); + } + } + // create the full membership array + nfinal = pmix_list_get_size(&nmlist); + PMIX_PROC_CREATE(finalmembership, nfinal); + m = 0; + PMIX_LIST_FOREACH(nm, &nmlist, prte_namelist_t) { + memcpy(&finalmembership[m], &nm->name, sizeof(pmix_proc_t)); + ++m; + } + PMIX_LIST_DESTRUCT(&nmlist); + + /* sort the procs so everyone gets the same order */ + qsort(finalmembership, nfinal, sizeof(pmix_proc_t), pmix_util_compare_proc); + } + + // CONSTRUCT THE RELEASE MESSAGE + PMIX_DATA_BUFFER_CREATE(reply); + + /* pack the signature */ + rc = pack_signature(reply, coll->sig); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(reply); + PMIX_RELEASE(sig); + PMIX_PROC_FREE(finalmembership, nfinal); + return; + } + /* pack the status */ + rc = PMIx_Data_pack(NULL, reply, &coll->status, 1, PMIX_STATUS); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(reply); + PMIX_RELEASE(sig); + PMIX_PROC_FREE(finalmembership, nfinal); + return; + } + + if (PMIX_GROUP_CONSTRUCT == sig->op) { + // pack the final membership + rc = PMIx_Data_pack(NULL, reply, &nfinal, 1, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(reply); + PMIX_RELEASE(sig); + PMIX_PROC_FREE(finalmembership, nfinal); + return; + } + if (0 < nfinal) { + rc = PMIx_Data_pack(NULL, reply, finalmembership, nfinal, PMIX_PROC); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(reply); + PMIX_RELEASE(sig); + return; + } + PMIX_PROC_FREE(finalmembership, nfinal); + } + + // pack any group info + PMIx_Info_list_convert(coll->grpinfo, &darray); + info = (pmix_info_t*)darray.array; + ninfo = darray.size; + PMIx_Data_pack(NULL, reply, &ninfo, 1, 
PMIX_SIZE); + if (0 < ninfo) { + PMIx_Data_pack(NULL, reply, info, ninfo, PMIX_INFO); + } + PMIX_DATA_ARRAY_DESTRUCT(&darray); + + // pack any endpts + PMIx_Info_list_convert(coll->endpts, &darray); + info = (pmix_info_t*)darray.array; + ninfo = darray.size; + PMIx_Data_pack(NULL, reply, &ninfo, 1, PMIX_SIZE); + if (0 < ninfo) { + PMIx_Data_pack(NULL, reply, info, ninfo, PMIX_INFO); + } + PMIX_DATA_ARRAY_DESTRUCT(&darray); + } + + /* send the release via xcast */ + (void) prte_grpcomm.xcast(PRTE_RML_TAG_GROUP_RELEASE, reply); + + } else { + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct allgather rollup complete - sending to %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + PRTE_NAME_PRINT(PRTE_PROC_MY_PARENT))); + + // setup to relay our rollup results + PMIX_DATA_BUFFER_CREATE(reply); + + /* pack the signature */ + rc = pack_signature(reply, coll->sig); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(reply); + PMIX_RELEASE(sig); + return; + } + + // pack the local collective status + rc = PMIx_Data_pack(NULL, reply, &coll->status, 1, PMIX_STATUS); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(reply); + PMIX_RELEASE(sig); + return; + } + + // pack any timeout directive + rc = PMIx_Data_pack(NULL, reply, &coll->timeout, 1, PMIX_INT); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(reply); + PMIX_RELEASE(sig); + return; + } + + if (PMIX_GROUP_CONSTRUCT == sig->op) { + // pack any group info + PMIx_Info_list_convert(coll->grpinfo, &darray); + info = (pmix_info_t*)darray.array; + ninfo = darray.size; + PMIx_Data_pack(NULL, reply, &ninfo, 1, PMIX_SIZE); + if (0 < ninfo) { + PMIx_Data_pack(NULL, reply, info, ninfo, PMIX_INFO); + } + PMIX_DATA_ARRAY_DESTRUCT(&darray); + + // pack any endpts + PMIx_Info_list_convert(coll->endpts, &darray); + info = (pmix_info_t*)darray.array; + ninfo = darray.size; + PMIx_Data_pack(NULL, reply, &ninfo, 1, PMIX_SIZE); + if (0 < ninfo) { + PMIx_Data_pack(NULL, reply, info, ninfo, PMIX_INFO); + } + PMIX_DATA_ARRAY_DESTRUCT(&darray); + } + + /* send the info to our parent */ + PRTE_RML_SEND(rc, PRTE_PROC_MY_PARENT->rank, reply, + PRTE_RML_TAG_GROUP); + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(reply); + PMIX_RELEASE(sig); + return; + } + } + } + PMIX_RELEASE(sig); +} + +static void relcb(void *cbdata) +{ + prte_pmix_grp_caddy_t *cd = (prte_pmix_grp_caddy_t*)cbdata; + PMIX_RELEASE(cd); +} + +static void opcbfunc(int status, void *cbdata) +{ + prte_pmix_grp_caddy_t *cd = (prte_pmix_grp_caddy_t*)cbdata; + PRTE_HIDE_UNUSED_PARAMS(status); + + PMIX_RELEASE(cd); +} + +static void lkopcbfunc(int status, void *cbdata) +{ + prte_pmix_grp_caddy_t *cd = (prte_pmix_grp_caddy_t*)cbdata; + PRTE_HIDE_UNUSED_PARAMS(status); + + cd->lock.status = status; + PMIX_WAKEUP_THREAD(&cd->lock); +} + +static void find_delete_tracker(prte_grpcomm_direct_group_signature_t *sig) +{ + prte_grpcomm_group_t *coll; + + PMIX_LIST_FOREACH(coll, &prte_mca_grpcomm_direct_component.group_ops, prte_grpcomm_group_t) { + // must match groupID's + if (0 == strcmp(sig->groupID, coll->sig->groupID)) { + pmix_list_remove_item(&prte_mca_grpcomm_direct_component.group_ops, &coll->super); + PMIX_RELEASE(coll); + return; + } + } +} + +void prte_grpcomm_direct_grp_release(int status, pmix_proc_t *sender, + pmix_data_buffer_t *buffer, + prte_rml_tag_t tag, void *cbdata) +{ + prte_grpcomm_group_t *coll; + prte_grpcomm_direct_group_signature_t *sig = NULL; + 
prte_pmix_grp_caddy_t cd2, *cd; + int32_t cnt; + pmix_status_t rc = PMIX_SUCCESS, st; + pmix_proc_t *finalmembership = NULL; + size_t nfinal = 0; + size_t nendpts = 0; + size_t ngrpinfo = 0; + size_t n; + pmix_data_array_t darray; + pmix_info_t *grpinfo = NULL; + pmix_info_t *endpts = NULL; + pmix_server_pset_t *pset; + void *ilist; + PRTE_HIDE_UNUSED_PARAMS(status, sender, tag, cbdata); + + PMIX_ACQUIRE_OBJECT(cd); + + pmix_output_verbose(2, prte_pmix_server_globals.output, + "%s group request complete", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); + + // unpack the signature + rc = unpack_signature(buffer, &sig); + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); + return; + } + + /* check for the tracker - okay if not found, it just + * means that we had no local participants */ + coll = get_tracker(sig, false); + + // unpack the status + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &st, &cnt, PMIX_STATUS); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + st = rc; + goto notify; + } + + /* if this was a destruct operation, then there is nothing + * further to unpack */ + if (PMIX_GROUP_DESTRUCT == sig->op) { + /* find this group ID on our list of groups */ + PMIX_LIST_FOREACH(pset, &prte_pmix_server_globals.groups, pmix_server_pset_t) + { + if (0 == strcmp(pset->name, sig->groupID)) { + pmix_list_remove_item(&prte_pmix_server_globals.groups, &pset->super); + PMIX_RELEASE(pset); + break; + } + } + if (NULL != coll && NULL != coll->cbfunc) { + /* return to the local procs in the collective */ + coll->cbfunc(rc, NULL, 0, coll->cbdata, NULL, NULL); + } + // remove the tracker, if found + find_delete_tracker(sig); + PMIX_RELEASE(sig); + return; + } + + // must be a construct operation - continue unpacking + ilist = PMIx_Info_list_start(); + + if (sig->ctxid_assigned) { + PMIX_INFO_LIST_ADD(rc, ilist, PMIX_GROUP_CONTEXT_ID, &sig->ctxid, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + st = rc; + goto notify; + } + } + + // unpack the final membership + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &nfinal, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + st = rc; + goto notify; + } + if (0 < nfinal) { + PMIX_PROC_CREATE(finalmembership, nfinal); + cnt = nfinal; + rc = PMIx_Data_unpack(NULL, buffer, finalmembership, &cnt, PMIX_PROC); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + st = rc; + goto notify; + } + } + + // unpack group info + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &ngrpinfo, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + st = rc; + goto notify; + } + if (0 < ngrpinfo) { + PMIX_INFO_CREATE(grpinfo, ngrpinfo); + cnt = ngrpinfo; + rc = PMIx_Data_unpack(NULL, buffer, grpinfo, &cnt, PMIX_INFO); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + st = rc; + goto notify; + } + // transfer them to our list + for (n=0; n < ngrpinfo; n++) { + rc = PMIx_Info_list_add_value(ilist, PMIX_GROUP_INFO, &grpinfo[n].value); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + } + } + PMIX_INFO_FREE(grpinfo, ngrpinfo); + } + + + // unpack endpts + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &nendpts, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + st = rc; + goto notify; + } + if (0 < nendpts) { + PMIX_INFO_CREATE(endpts, nendpts); + cnt = nendpts; + rc = PMIx_Data_unpack(NULL, buffer, endpts, &cnt, PMIX_INFO); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + st = rc; + goto notify; + } + // transfer them to our list + for (n=0; n < nendpts; n++) { + rc = PMIx_Info_list_add_value(ilist, 
PMIX_GROUP_ENDPT_DATA, &endpts[n].value); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + } + } + PMIX_INFO_FREE(endpts, nendpts); + } + + // PRRTE automatically ensures that all daemons register all jobs + // with their local PMIx server, regardless of whether or not + // that daemon hosts any local clients of that job. So we + // do not need to collect/pass job data for participants + // in the group construct + + // pass the information down to the PMIx server + PMIX_INFO_LIST_CONVERT(rc, ilist, &darray); + PMIX_CONSTRUCT(&cd2, prte_pmix_grp_caddy_t); + cd2.info = (pmix_info_t*)darray.array; + cd2.ninfo = darray.size; + PMIX_INFO_LIST_RELEASE(ilist); + + rc = PMIx_server_register_resources(cd2.info, cd2.ninfo, lkopcbfunc, &cd2); + if (PMIX_SUCCESS == rc) { + PMIX_WAIT_THREAD(&cd2.lock); + rc = cd2.lock.status; + } + PMIX_DESTRUCT(&cd2); + + if (PMIX_SUCCESS == st) { + /* add it to our list of known groups */ + pset = PMIX_NEW(pmix_server_pset_t); + pset->name = strdup(sig->groupID); + if (NULL != finalmembership) { + pset->num_members = nfinal; + PMIX_PROC_CREATE(pset->members, pset->num_members); + memcpy(pset->members, finalmembership, nfinal * sizeof(pmix_proc_t)); + } + pmix_list_append(&prte_pmix_server_globals.groups, &pset->super); + } + +notify: + // regardless of prior error, we MUST notify any pending clients + // so they don't hang + + if (NULL == coll || NULL != sig->addmembers) { + // still need to generate invite event for procs + // that might be on nodes that were not involved + // in the original collective + + PMIX_INFO_LIST_START(ilist); + + // provide the group ID since the invitee won't have it + PMIX_INFO_LIST_ADD(rc, ilist, PMIX_GROUP_ID, sig->groupID, PMIX_STRING); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + } + + // set the range to be only procs that were added + darray.type = PMIX_PROC; + darray.array = sig->addmembers; + darray.size = sig->naddmembers; + // load the array - note: this copies the array! + PMIX_INFO_LIST_ADD(rc, ilist, PMIX_EVENT_CUSTOM_RANGE, &darray, PMIX_DATA_ARRAY); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + } + + // mark that this event stays local and does not go up to the host + PMIX_INFO_LIST_ADD(rc, ilist, PMIX_EVENT_STAYS_LOCAL, NULL, PMIX_BOOL); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + } + + if (NULL != finalmembership) { + // pass back the final group membership + darray.type = PMIX_PROC; + darray.array = finalmembership; + darray.size = nfinal; + // load the array - note: this copies the array! 
+ PMIX_INFO_LIST_ADD(rc, ilist, PMIX_GROUP_MEMBERSHIP, &darray, PMIX_DATA_ARRAY); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + } + } + + // pass any assigned context ID + if (sig->ctxid_assigned) { + PMIX_INFO_LIST_ADD(rc, ilist, PMIX_GROUP_CONTEXT_ID, &sig->ctxid, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + } + } + + // convert for passing in event + PMIX_INFO_LIST_CONVERT(rc, ilist, &darray); + if (PMIX_SUCCESS != rc && PMIX_ERR_EMPTY != rc) { + PMIX_ERROR_LOG(rc); + } + cd = PMIX_NEW(prte_pmix_grp_caddy_t); + cd->info = (pmix_info_t*)darray.array; + cd->ninfo = darray.size; + PMIX_INFO_LIST_RELEASE(ilist); + + // notify local procs + PMIx_Notify_event(PMIX_GROUP_INVITED, &prte_process_info.myproc, PMIX_RANGE_CUSTOM, + cd->info, cd->ninfo, opcbfunc, (void*)cd); + } + + if (NULL != coll && NULL != coll->cbfunc) { + // service the procs that are part of the collective + + PMIX_INFO_LIST_START(ilist); + if (NULL != finalmembership) { + // pass back the final group membership + darray.type = PMIX_PROC; + darray.array = finalmembership; + darray.size = nfinal; + // load the array - note: this copies the array! + PMIX_INFO_LIST_ADD(rc, ilist, PMIX_GROUP_MEMBERSHIP, &darray, PMIX_DATA_ARRAY); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + } + } + + if (sig->ctxid_assigned) { + PMIX_INFO_LIST_ADD(rc, ilist, PMIX_GROUP_CONTEXT_ID, &sig->ctxid, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + } + } + + // convert for returning to PMIx server library + PMIX_INFO_LIST_CONVERT(rc, ilist, &darray); + if (PMIX_SUCCESS != rc && PMIX_ERR_EMPTY != rc) { + PMIX_ERROR_LOG(rc); + } + cd = PMIX_NEW(prte_pmix_grp_caddy_t); + cd->info = (pmix_info_t*)darray.array; + cd->ninfo = darray.size; + PMIX_INFO_LIST_RELEASE(ilist); + + /* return to the PMIx server library for relay to + * local procs in the operation */ + coll->cbfunc(rc, cd->info, cd->ninfo, coll->cbdata, relcb, (void*)cd); + } + + if (NULL != finalmembership) { + PMIX_PROC_FREE(finalmembership, nfinal); + } + if (0 < nendpts) { + PMIX_INFO_FREE(endpts, nendpts); + } + if (0 < ngrpinfo) { + PMIX_INFO_FREE(grpinfo, ngrpinfo); + } + // remove this collective from our tracker + find_delete_tracker(sig); + PMIX_RELEASE(sig); +} + + +static prte_grpcomm_group_t *get_tracker(prte_grpcomm_direct_group_signature_t *sig, bool create) +{ + prte_grpcomm_group_t *coll; + int rc; + pmix_proc_t *p; + size_t n, nmb; + pmix_list_t plist; + prte_namelist_t *nm; + bool found; + + if (NULL == sig->groupID) { + return NULL; + } + + /* search the existing tracker list to see if this already exists - we + * default to using the groupID if one is given, otherwise we fallback + * to the array of participating procs */ + PMIX_LIST_FOREACH(coll, &prte_mca_grpcomm_direct_component.group_ops, prte_grpcomm_group_t) { + // must match groupID's and ops + if (0 == strcmp(sig->groupID, coll->sig->groupID) && + sig->op == coll->sig->op) { + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:group:returning existing collective %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + sig->groupID)); + // if we are adding members, aggregate them + if (0 < sig->naddmembers) { + PMIX_CONSTRUCT(&plist, pmix_list_t); + for (n=0; n < sig->naddmembers; n++) { + // see if we already have this proc + found = false; + for (nmb=0; nmb < coll->sig->naddmembers; nmb++) { + if (PMIX_CHECK_PROCID(&sig->addmembers[n], &coll->sig->addmembers[nmb])) { + // yes, we do + found = true; + // check for wildcard as that needs to be 
retained + if (PMIX_RANK_WILDCARD == sig->addmembers[n].rank) { + coll->sig->addmembers[nmb].rank = PMIX_RANK_WILDCARD; + } + break; + } + } + if (!found) { + // cache the proc + nm = PMIX_NEW(prte_namelist_t); + memcpy(&nm->name, &sig->addmembers[n], sizeof(pmix_proc_t)); + pmix_list_append(&plist, &nm->super); + } + } + // add any missing procs to the addmembers + if (0 < pmix_list_get_size(&plist)) { + n = coll->sig->naddmembers + pmix_list_get_size(&plist); + PMIX_PROC_CREATE(p, n); + if (NULL != coll->sig->addmembers) { + memcpy(p, coll->sig->addmembers, coll->sig->naddmembers * sizeof(pmix_proc_t)); + } + n = coll->sig->naddmembers; + PMIX_LIST_FOREACH(nm, &plist, prte_namelist_t) { + memcpy(&p[n], &nm->name, sizeof(pmix_proc_t)); + ++n; + } + PMIX_LIST_DESTRUCT(&plist); + if (NULL != coll->sig->addmembers) { + PMIX_PROC_FREE(coll->sig->addmembers, coll->sig->naddmembers); + } + coll->sig->addmembers = p; + coll->sig->naddmembers = n; + } + } + if (!coll->sig->assignID && sig->assignID) { + coll->sig->assignID = true; + } + return coll; + } + } + + /* if we get here, then this is a new collective - so create + * the tracker for it */ + if (!create) { + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:base: not creating new coll", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + + return NULL; + } + coll = PMIX_NEW(prte_grpcomm_group_t); + coll->sig = PMIX_NEW(prte_grpcomm_direct_group_signature_t); + coll->sig->op = sig->op; + coll->sig->groupID = strdup(sig->groupID); + coll->sig->assignID = sig->assignID; + // save the participating procs + coll->sig->nmembers = sig->nmembers; + if (0 < sig->nmembers) { + coll->sig->members = (pmix_proc_t *) malloc(coll->sig->nmembers * sizeof(pmix_proc_t)); + memcpy(coll->sig->members, sig->members, coll->sig->nmembers * sizeof(pmix_proc_t)); + } + coll->sig->naddmembers = sig->naddmembers; + if (0 < sig->naddmembers) { + coll->sig->addmembers = (pmix_proc_t *) malloc(coll->sig->naddmembers * sizeof(pmix_proc_t)); + memcpy(coll->sig->addmembers, sig->addmembers, coll->sig->naddmembers * sizeof(pmix_proc_t)); + } + + // need to know the bootstrap in case one is ongoing + coll->sig->bootstrap = sig->bootstrap; + pmix_list_append(&prte_mca_grpcomm_direct_component.group_ops, &coll->super); + + /* if this is a bootstrap operation, then there is no "rollup" + * collective - each daemon reports directly to the DVM controller */ + if (0 < coll->sig->bootstrap) { + coll->nexpected = coll->sig->bootstrap; + return coll; + } + + /* now get the daemons involved */ + if (PRTE_SUCCESS != (rc = create_dmns(sig, &coll->dmns, &coll->ndmns))) { + PRTE_ERROR_LOG(rc); + pmix_list_remove_item(&prte_mca_grpcomm_direct_component.group_ops, &coll->super); + PMIX_RELEASE(coll); + return NULL; + } + + /* count the number of contributions we should get */ + coll->nexpected = prte_rml_get_num_contributors(coll->dmns, coll->ndmns); + + /* see if I am in the array of participants - note that I may + * be in the rollup tree even though I'm not participating + * in the collective itself */ + for (n = 0; n < coll->ndmns; n++) { + if (coll->dmns[n] == PRTE_PROC_MY_NAME->rank) { + coll->nexpected++; + break; + } + } + + return coll; +} + +static int create_dmns(prte_grpcomm_direct_group_signature_t *sig, + pmix_rank_t **dmns, size_t *ndmns) +{ + size_t n; + prte_job_t *jdata; + prte_proc_t *proc; + prte_node_t *node; + prte_job_map_t *map; + int i; + pmix_list_t ds; + prte_namelist_t *nm; + pmix_rank_t vpid; + bool found; + size_t nds = 0; + pmix_rank_t
*dns = NULL; + int rc = PRTE_SUCCESS; + + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:group:create_dmns called with %s signature size %" PRIsize_t "", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + (NULL == sig->members) ? "NULL" : "NON-NULL", sig->nmembers)); + + PMIX_CONSTRUCT(&ds, pmix_list_t); + for (n = 0; n < sig->nmembers; n++) { + if (NULL == (jdata = prte_get_job_data_object(sig->members[n].nspace))) { + PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); + rc = PRTE_ERR_NOT_FOUND; + break; + } + map = (prte_job_map_t*)jdata->map; + if (NULL == map || 0 == map->num_nodes) { + /* we haven't generated a job map yet - if we are the HNP, + * then we should only involve ourselves. Otherwise, we have + * no choice but to abort to avoid hangs */ + if (PRTE_PROC_IS_MASTER) { + rc = PRTE_SUCCESS; + break; + } + PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); + rc = PRTE_ERR_NOT_FOUND; + break; + } + if (PMIX_RANK_WILDCARD == sig->members[n].rank) { + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:fence::create_dmns called for all procs in job %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + PRTE_JOBID_PRINT(sig->members[0].nspace))); + /* all daemons hosting this jobid are participating */ + for (i = 0; i < map->nodes->size; i++) { + if (NULL == (node = pmix_pointer_array_get_item(map->nodes, i))) { + continue; + } + if (NULL == node->daemon) { + PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); + rc = PRTE_ERR_NOT_FOUND; + goto done; + } + found = false; + PMIX_LIST_FOREACH(nm, &ds, prte_namelist_t) + { + if (nm->name.rank == node->daemon->name.rank) { + found = true; + break; + } + } + if (!found) { + PMIX_OUTPUT_VERBOSE((5, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:fence::create_dmns adding daemon %s to list", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + PRTE_NAME_PRINT(&node->daemon->name))); + nm = PMIX_NEW(prte_namelist_t); + PMIX_LOAD_PROCID(&nm->name, PRTE_PROC_MY_NAME->nspace, node->daemon->name.rank); + pmix_list_append(&ds, &nm->super); + } + } + } else { + /* lookup the daemon for this proc and add it to the list */ + PMIX_OUTPUT_VERBOSE((5, prte_grpcomm_base_framework.framework_output, + "%s sign: GETTING PROC OBJECT FOR %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + PRTE_NAME_PRINT(&sig->members[n]))); + proc = (prte_proc_t *) pmix_pointer_array_get_item(jdata->procs, + sig->members[n].rank); + if (NULL == proc) { + PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); + rc = PRTE_ERR_NOT_FOUND; + goto done; + } + if (NULL == proc->node || NULL == proc->node->daemon) { + PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); + rc = PRTE_ERR_NOT_FOUND; + goto done; + } + vpid = proc->node->daemon->name.rank; + found = false; + PMIX_LIST_FOREACH(nm, &ds, prte_namelist_t) + { + if (nm->name.rank == vpid) { + found = true; + break; + } + } + if (!found) { + nm = PMIX_NEW(prte_namelist_t); + PMIX_LOAD_PROCID(&nm->name, PRTE_PROC_MY_NAME->nspace, vpid); + pmix_list_append(&ds, &nm->super); + } + } + } + +done: + if (0 < pmix_list_get_size(&ds)) { + dns = (pmix_rank_t *) malloc(pmix_list_get_size(&ds) * sizeof(pmix_rank_t)); + nds = 0; + while (NULL != (nm = (prte_namelist_t *) pmix_list_remove_first(&ds))) { + PMIX_OUTPUT_VERBOSE((5, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:fence::create_dmns adding daemon %s to array", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&nm->name))); + dns[nds++] = nm->name.rank; + PMIX_RELEASE(nm); + } + } + PMIX_LIST_DESTRUCT(&ds); + *dmns = dns; + *ndmns = nds; + return rc; +} + +static int 
pack_signature(pmix_data_buffer_t *bkt, + prte_grpcomm_direct_group_signature_t *sig) +{ + pmix_status_t rc; + + // pack the operation + rc = PMIx_Data_pack(NULL, bkt, &sig->op, 1, PMIX_INT); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return prte_pmix_convert_status(rc); + } + + // add the groupID + rc = PMIx_Data_pack(NULL, bkt, &sig->groupID, 1, PMIX_STRING); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return prte_pmix_convert_status(rc); + } + + // pack the flag to assign context ID + rc = PMIx_Data_pack(NULL, bkt, &sig->assignID, 1, PMIX_BOOL); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return prte_pmix_convert_status(rc); + } + + // pack the context ID, if one was given + rc = PMIx_Data_pack(NULL, bkt, &sig->ctxid_assigned, 1, PMIX_BOOL); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return prte_pmix_convert_status(rc); + } + if (sig->ctxid_assigned) { + rc = PMIx_Data_pack(NULL, bkt, &sig->ctxid, 1, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return prte_pmix_convert_status(rc); + } + } + + // pack members, if given + rc = PMIx_Data_pack(NULL, bkt, &sig->nmembers, 1, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return prte_pmix_convert_status(rc); + } + if (0 < sig->nmembers) { + rc = PMIx_Data_pack(NULL, bkt, sig->members, sig->nmembers, PMIX_PROC); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return prte_pmix_convert_status(rc); + } + } + + // pack bootstrap number + rc = PMIx_Data_pack(NULL, bkt, &sig->bootstrap, 1, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return prte_pmix_convert_status(rc); + } + + // pack added membership, if given + rc = PMIx_Data_pack(NULL, bkt, &sig->naddmembers, 1, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return prte_pmix_convert_status(rc); + } + if (0 < sig->naddmembers) { + rc = PMIx_Data_pack(NULL, bkt, sig->addmembers, sig->naddmembers, PMIX_PROC); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return prte_pmix_convert_status(rc); + } + } + + return PRTE_SUCCESS; +} + +static int unpack_signature(pmix_data_buffer_t *buffer, + prte_grpcomm_direct_group_signature_t **sig) +{ + pmix_status_t rc; + int32_t cnt; + prte_grpcomm_direct_group_signature_t *s; + + s = PMIX_NEW(prte_grpcomm_direct_group_signature_t); + + // unpack the op + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &s->op, &cnt, PMIX_INT); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(s); + return prte_pmix_convert_status(rc); + } + + // unpack the groupID + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &s->groupID, &cnt, PMIX_STRING); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(s); + return prte_pmix_convert_status(rc); + } + + // unpack whether or not to assign a context ID + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &s->assignID, &cnt, PMIX_BOOL); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(s); + return prte_pmix_convert_status(rc); + } + + // unpack the context ID, if one was assigned + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &s->ctxid_assigned, &cnt, PMIX_BOOL); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(s); + return prte_pmix_convert_status(rc); + } + if (s->ctxid_assigned) { + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &s->ctxid, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(s); + return prte_pmix_convert_status(rc); + } + } + + // unpack the membership + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &s->nmembers, &cnt, 
PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(s); + return prte_pmix_convert_status(rc); + } + if (0 < s->nmembers) { + PMIX_PROC_CREATE(s->members, s->nmembers); + cnt = s->nmembers; + rc = PMIx_Data_unpack(NULL, buffer, s->members, &cnt, PMIX_PROC); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(s); + return prte_pmix_convert_status(rc); + } + } + + // unpack the bootstrap count + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &s->bootstrap, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(s); + return prte_pmix_convert_status(rc); + } + + // unpack the added members + cnt = 1; + rc = PMIx_Data_unpack(NULL, buffer, &s->naddmembers, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(s); + return prte_pmix_convert_status(rc); + } + if (0 < s->naddmembers) { + PMIX_PROC_CREATE(s->addmembers, s->naddmembers); + cnt = s->naddmembers; + rc = PMIx_Data_unpack(NULL, buffer, s->addmembers, &cnt, PMIX_PROC); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(s); + return prte_pmix_convert_status(rc); + } + } + + *sig = s; + return PRTE_SUCCESS; +} diff --git a/src/mca/grpcomm/direct/grpcomm_direct_xcast.c b/src/mca/grpcomm/direct/grpcomm_direct_xcast.c new file mode 100644 index 0000000000..f42d01a550 --- /dev/null +++ b/src/mca/grpcomm/direct/grpcomm_direct_xcast.c @@ -0,0 +1,354 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2007 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved + * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All + * rights reserved. + * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
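
[Aside] unpack_signature above mirrors pack_signature field for field; optional fields are guarded by a boolean presence flag so both sides stay in lock-step. A minimal sketch of that convention, not part of the patch; pack_optional_ctxid is a hypothetical helper:

    static pmix_status_t pack_optional_ctxid(pmix_data_buffer_t *bkt,
                                             bool assigned, size_t ctxid)
    {
        pmix_status_t rc;

        /* always pack the flag so the unpacker knows what follows */
        rc = PMIx_Data_pack(NULL, bkt, &assigned, 1, PMIX_BOOL);
        if (PMIX_SUCCESS != rc) {
            return rc;
        }
        /* pack the value only when the flag says it is present */
        if (assigned) {
            rc = PMIx_Data_pack(NULL, bkt, &ctxid, 1, PMIX_SIZE);
        }
        return rc;
    }
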
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "prte_config.h" +#include "constants.h" +#include "types.h" + +#include + +#include "src/class/pmix_list.h" +#include "src/pmix/pmix-internal.h" + +#include "src/prted/pmix/pmix_server_internal.h" +#include "src/mca/errmgr/errmgr.h" +#include "src/rml/rml.h" +#include "src/mca/state/state.h" +#include "src/util/name_fns.h" +#include "src/util/nidmap.h" +#include "src/util/proc_info.h" +#include "src/util/pmix_show_help.h" + +#include "grpcomm_direct.h" +#include "src/mca/grpcomm/base/base.h" + + +static int pack_xcast(pmix_data_buffer_t *buffer, + pmix_data_buffer_t *message, prte_rml_tag_t tag); + +int prte_grpcomm_direct_xcast(prte_rml_tag_t tag, + pmix_data_buffer_t *msg) +{ + int rc; + pmix_data_buffer_t *buf; + + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:xcast: with %d bytes", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (int) msg->bytes_used)); + + /* this function does not access any framework-global data, and + * so it does not require us to push it into the event library */ + + /* prep the output buffer */ + PMIX_DATA_BUFFER_CREATE(buf); + + /* setup the payload */ + if (PRTE_SUCCESS != (rc = pack_xcast(buf, msg, tag))) { + PRTE_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(buf); + return rc; + } + + /* send it to the HNP (could be myself) for relay */ + PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP->rank, buf, PRTE_RML_TAG_XCAST); + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); + PMIX_DATA_BUFFER_RELEASE(buf); + return rc; + } + return PRTE_SUCCESS; +} + +void prte_grpcomm_direct_xcast_recv(int status, pmix_proc_t *sender, + pmix_data_buffer_t *buffer, + prte_rml_tag_t tg, void *cbdata) +{ + prte_routed_tree_t *nm; + int ret, cnt; + pmix_data_buffer_t *relay = NULL, *rly, *rlycopy; + pmix_data_buffer_t datbuf, *data; + bool compressed; + prte_job_t *daemons; + pmix_list_t coll; + prte_rml_tag_t tag; + pmix_byte_object_t bo, pbo; + pmix_value_t val; + pmix_proc_t dmn; + PRTE_HIDE_UNUSED_PARAMS(status, sender, tg, cbdata); + + PMIX_OUTPUT_VERBOSE((1, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:xcast:recv: with %d bytes", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (int) buffer->bytes_used)); + + /* we need a passthru buffer to send to our children - we leave it + * as compressed data */ + PMIX_DATA_BUFFER_CREATE(rly); + ret = PMIx_Data_copy_payload(rly, buffer); + if (PMIX_SUCCESS != ret) { + PMIX_ERROR_LOG(ret); + PMIX_DATA_BUFFER_RELEASE(rly); + return; + } + PMIX_DATA_BUFFER_CONSTRUCT(&datbuf); + /* setup the relay list */ + PMIX_CONSTRUCT(&coll, pmix_list_t); + + /* unpack the flag to see if this payload is compressed */ + cnt = 1; + ret = PMIx_Data_unpack(NULL, buffer, &compressed, &cnt, PMIX_BOOL); + if (PMIX_SUCCESS != ret) { + PMIX_ERROR_LOG(ret); + PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); + PMIX_DATA_BUFFER_DESTRUCT(&datbuf); + PMIX_DESTRUCT(&coll); + PMIX_DATA_BUFFER_RELEASE(rly); + return; + } + /* unpack the data blob */ + cnt = 1; + ret = PMIx_Data_unpack(NULL, buffer, &pbo, &cnt, PMIX_BYTE_OBJECT); + if (PMIX_SUCCESS != ret) { + PMIX_ERROR_LOG(ret); + PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); + PMIX_DESTRUCT(&coll); + PMIX_DATA_BUFFER_RELEASE(rly); + return; + } + if (compressed) { + /* decompress the data */ + if (PMIx_Data_decompress((uint8_t *) pbo.bytes, pbo.size, + (uint8_t **) &bo.bytes, &bo.size)) { + /* the data has been uncompressed */ + ret = PMIx_Data_load(&datbuf, &bo); + if 
(PMIX_SUCCESS != ret) { + PMIX_BYTE_OBJECT_DESTRUCT(&pbo); + PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); + PMIX_DATA_BUFFER_DESTRUCT(&datbuf); + PMIX_DESTRUCT(&coll); + PMIX_DATA_BUFFER_RELEASE(rly); + return; + } + } else { + pmix_show_help("help-prte-runtime.txt", "failed-to-uncompress", + true, prte_process_info.nodename); + PMIX_BYTE_OBJECT_DESTRUCT(&pbo); + PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); + PMIX_DATA_BUFFER_DESTRUCT(&datbuf); + PMIX_DESTRUCT(&coll); + PMIX_DATA_BUFFER_RELEASE(rly); + return; + } + } else { + ret = PMIx_Data_load(&datbuf, &pbo); + if (PMIX_SUCCESS != ret) { + PMIX_BYTE_OBJECT_DESTRUCT(&pbo); + PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); + PMIX_DATA_BUFFER_DESTRUCT(&datbuf); + PMIX_DESTRUCT(&coll); + PMIX_DATA_BUFFER_RELEASE(rly); + return; + } + } + PMIX_BYTE_OBJECT_DESTRUCT(&pbo); + data = &datbuf; + + /* get the target tag */ + cnt = 1; + ret = PMIx_Data_unpack(NULL, data, &tag, &cnt, PMIX_UINT32); + if (PMIX_SUCCESS != ret) { + PMIX_ERROR_LOG(ret); + PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); + PMIX_DATA_BUFFER_DESTRUCT(&datbuf); + PMIX_DESTRUCT(&coll); + PMIX_DATA_BUFFER_RELEASE(rly); + return; + } + + /* copy the msg for relay to ourselves */ + PMIX_DATA_BUFFER_CREATE(relay); + ret = PMIx_Data_copy_payload(relay, data); + if (PMIX_SUCCESS != ret) { + PMIX_ERROR_LOG(ret); + PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); + PMIX_DATA_BUFFER_DESTRUCT(&datbuf); + PMIX_DESTRUCT(&coll); + PMIX_DATA_BUFFER_RELEASE(rly); + PMIX_DATA_BUFFER_RELEASE(relay); + return; + } + + if (PRTE_RML_TAG_WIREUP == tag && !PRTE_PROC_IS_MASTER) { + if (PRTE_SUCCESS != (ret = prte_util_decode_nidmap(data))) { + PRTE_ERROR_LOG(ret); + PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); + PMIX_DATA_BUFFER_DESTRUCT(&datbuf); + PMIX_DESTRUCT(&coll); + PMIX_DATA_BUFFER_RELEASE(rly); + PMIX_DATA_BUFFER_RELEASE(relay); + return; + } + /* unpack the wireup info */ + cnt = 1; + while (PMIX_SUCCESS == (ret = PMIx_Data_unpack(NULL, data, &dmn, &cnt, PMIX_PROC))) { + PMIX_VALUE_CONSTRUCT(&val); + val.type = PMIX_STRING; + cnt = 1; + ret = PMIx_Data_unpack(NULL, data, &val.data.string, &cnt, PMIX_STRING); + if (PMIX_SUCCESS != ret) { + PMIX_ERROR_LOG(ret); + PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); + PMIX_DATA_BUFFER_DESTRUCT(&datbuf); + PMIX_DESTRUCT(&coll); + PMIX_DATA_BUFFER_RELEASE(rly); + PMIX_DATA_BUFFER_RELEASE(relay); + return; + } + + if (!PMIX_CHECK_PROCID(&dmn, PRTE_PROC_MY_HNP) && + !PMIX_CHECK_PROCID(&dmn, PRTE_PROC_MY_NAME) && + !PMIX_CHECK_PROCID(&dmn, PRTE_PROC_MY_PARENT)) { + /* store it locally */ + ret = PMIx_Store_internal(&dmn, PMIX_PROC_URI, &val); + PMIX_VALUE_DESTRUCT(&val); + if (PMIX_SUCCESS != ret) { + PMIX_ERROR_LOG(ret); + PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); + PMIX_DATA_BUFFER_DESTRUCT(&datbuf); + PMIX_DESTRUCT(&coll); + PMIX_DATA_BUFFER_RELEASE(rly); + PMIX_DATA_BUFFER_RELEASE(relay); + return; + } + } + } + if (PMIX_ERR_UNPACK_READ_PAST_END_OF_BUFFER != ret) { + PMIX_ERROR_LOG(ret); + } + } + + daemons = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace); + if (!prte_get_attribute(&daemons->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL)) { + /* send the message to each of our children */ + PMIX_LIST_FOREACH(nm, &prte_rml_base.children, prte_routed_tree_t) + { + PMIX_OUTPUT_VERBOSE((5, prte_grpcomm_base_framework.framework_output, + "%s grpcomm:direct:send_relay sending relay msg of %d bytes to %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), 
(int) rly->bytes_used, + PRTE_VPID_PRINT(nm->rank))); + /* copy the buffer for send */ + PMIX_DATA_BUFFER_CREATE(rlycopy); + ret = PMIx_Data_copy_payload(rlycopy, rly); + if (PMIX_SUCCESS != ret) { + PRTE_ERROR_LOG(ret); + PMIX_DATA_BUFFER_RELEASE(rlycopy); + PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); + continue; + } + PRTE_RML_SEND(ret, nm->rank, rlycopy, PRTE_RML_TAG_XCAST); + if (PRTE_SUCCESS != ret) { + PRTE_ERROR_LOG(ret); + PMIX_DATA_BUFFER_RELEASE(rlycopy); + PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); + continue; + } + } + } + + /* cleanup */ + PMIX_LIST_DESTRUCT(&coll); + PMIX_DATA_BUFFER_RELEASE(rly); // retain accounting + + /* now pass the relay buffer to myself for processing IFF it + * wasn't just a wireup message - don't + * inject it into the RML system via send as that will compete + * with the relay messages down in the OOB. Instead, pass it + * directly to the RML message processor */ + if (PRTE_RML_TAG_WIREUP != tag) { + PRTE_RML_POST_MESSAGE(PRTE_PROC_MY_NAME, tag, 1, relay->base_ptr, relay->bytes_used); + relay->base_ptr = NULL; + relay->bytes_used = 0; + } + if (NULL != relay) { + PMIX_DATA_BUFFER_RELEASE(relay); + } + PMIX_DATA_BUFFER_DESTRUCT(&datbuf); +} + +static int pack_xcast(pmix_data_buffer_t *buffer, + pmix_data_buffer_t *message, prte_rml_tag_t tag) +{ + int rc; + pmix_data_buffer_t data; + bool compressed; + pmix_byte_object_t bo; + size_t sz; + + /* setup an intermediate buffer */ + PMIX_DATA_BUFFER_CONSTRUCT(&data); + + /* pass the final tag */ + rc = PMIx_Data_pack(NULL, &data, &tag, 1, PRTE_RML_TAG); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_DESTRUCT(&data); + return rc; + } + + /* copy the payload into the new buffer - this is non-destructive, so our + * caller is still responsible for releasing any memory in the buffer they + * gave to us + */ + rc = PMIx_Data_copy_payload(&data, message); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_DATA_BUFFER_DESTRUCT(&data); + return rc; + } + + /* see if we want to compress this message */ + if (PMIx_Data_compress((uint8_t *) data.base_ptr, data.bytes_used, + (uint8_t **) &bo.bytes, &sz)) { + /* the data was compressed - mark that we compressed it */ + compressed = true; + bo.size = sz; + } else { + /* mark that it was not compressed */ + compressed = false; + bo.bytes = data.base_ptr; + bo.size = data.bytes_used; + data.base_ptr = NULL; + data.bytes_used = 0; + } + PMIX_DATA_BUFFER_DESTRUCT(&data); + rc = PMIx_Data_pack(NULL, buffer, &compressed, 1, PMIX_BOOL); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_BYTE_OBJECT_DESTRUCT(&bo); + return rc; + } + rc = PMIx_Data_pack(NULL, buffer, &bo, 1, PMIX_BYTE_OBJECT); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_BYTE_OBJECT_DESTRUCT(&bo); + return rc; + } + PMIX_BYTE_OBJECT_DESTRUCT(&bo); + + return PRTE_SUCCESS; +} + diff --git a/src/mca/grpcomm/grpcomm.h b/src/mca/grpcomm/grpcomm.h index a34c41ff8b..9b37ca72aa 100644 --- a/src/mca/grpcomm/grpcomm.h +++ b/src/mca/grpcomm/grpcomm.h @@ -53,84 +53,26 @@ BEGIN_C_DECLS -/* define a callback function to be invoked upon - * collective completion */ -typedef void (*prte_grpcomm_cbfunc_t)(int status, pmix_data_buffer_t *buf, void *cbdata); - -typedef int (*prte_grpcomm_rbcast_cb_t)(pmix_data_buffer_t *buffer); - -/* Define a collective signature so we don't need to - * track global collective id's */ -typedef struct { - pmix_object_t super; - char *groupID; - size_t ctxid; - bool ctxid_assigned; - pmix_proc_t *signature; - size_t sz; 
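
[Aside] The xcast wire format shown in pack_xcast is a one-bool envelope: compress when zlib wins, otherwise ship the raw bytes. A condensed sketch of that encode step, not part of the patch; wrap_payload is a hypothetical helper:

    static pmix_status_t wrap_payload(pmix_data_buffer_t *out,
                                      pmix_data_buffer_t *payload)
    {
        bool compressed;
        pmix_byte_object_t bo;
        size_t sz;
        pmix_status_t rc;

        if (PMIx_Data_compress((uint8_t *)payload->base_ptr, payload->bytes_used,
                               (uint8_t **)&bo.bytes, &sz)) {
            compressed = true;          /* compression made it smaller */
            bo.size = sz;
        } else {
            compressed = false;         /* ship the original bytes */
            bo.bytes = payload->base_ptr;
            bo.size = payload->bytes_used;
            payload->base_ptr = NULL;   /* ownership moved into bo */
            payload->bytes_used = 0;
        }
        rc = PMIx_Data_pack(NULL, out, &compressed, 1, PMIX_BOOL);
        if (PMIX_SUCCESS == rc) {
            rc = PMIx_Data_pack(NULL, out, &bo, 1, PMIX_BYTE_OBJECT);
        }
        PMIX_BYTE_OBJECT_DESTRUCT(&bo);  /* pack copied the bytes */
        return rc;
    }
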
- pmix_proc_t *addmembers; - size_t nmembers; - size_t bootstrap; - pmix_proc_t *finalmembership; - size_t nfinal; -} prte_grpcomm_signature_t; -PRTE_EXPORT PMIX_CLASS_DECLARATION(prte_grpcomm_signature_t); - -/* Internal component object for tracking ongoing - * allgather operations */ -typedef struct { - pmix_list_item_t super; - /* collective's signature */ - prte_grpcomm_signature_t *sig; - pmix_status_t status; - /* collection bucket */ - pmix_data_buffer_t bucket; - /* participating daemons */ - pmix_rank_t *dmns; - /** number of participating daemons */ - size_t ndmns; - /** my index in the dmns array */ - unsigned long my_rank; - /* number of buckets expected */ - size_t nexpected; - /* number reported in */ - size_t nreported; - /* controls values */ - bool assignID; - int timeout; - size_t memsize; - pmix_list_t addmembers; - /* distance masks for receive */ - pmix_bitmap_t distance_mask_recv; - /* received buckets */ - pmix_data_buffer_t **buffers; - /* callback function */ - prte_grpcomm_cbfunc_t cbfunc; - /* user-provided callback data */ - void *cbdata; -} prte_grpcomm_coll_t; -PMIX_CLASS_DECLARATION(prte_grpcomm_coll_t); - typedef struct { pmix_object_t super; prte_event_t ev; - prte_grpcomm_signature_t *sig; + pmix_lock_t lock; pmix_group_operation_t op; char *grpid; - pmix_data_buffer_t *buf; - pmix_byte_object_t ctrls; - pmix_proc_t *procs; + const pmix_proc_t *procs; size_t nprocs; + const pmix_info_t *directives; + size_t ndirs; pmix_info_t *info; size_t ninfo; - prte_grpcomm_cbfunc_t grpcbfunc; - pmix_modex_cbfunc_t mdxcbfunc; - pmix_info_cbfunc_t infocbfunc; - pmix_op_cbfunc_t opcbfunc; + pmix_info_cbfunc_t cbfunc; void *cbdata; - void *relcbdata; -} prte_pmix_mdx_caddy_t; -PMIX_CLASS_DECLARATION(prte_pmix_mdx_caddy_t); +} prte_pmix_grp_caddy_t; +PMIX_CLASS_DECLARATION(prte_pmix_grp_caddy_t); + +/* define a callback function to be invoked upon + * collective completion */ +typedef void (*prte_grpcomm_cbfunc_t)(int status, pmix_data_buffer_t *buf, void *cbdata); /* * Component functions - all MUST be provided! @@ -142,88 +84,41 @@ typedef int (*prte_grpcomm_base_module_init_fn_t)(void); /* finalize the selected module */ typedef void (*prte_grpcomm_base_module_finalize_fn_t)(void); -/* Scalably send a message. Caller will provide an array - * of daemon vpids that are to receive the message. A NULL - * pointer indicates that all daemons are participating. */ -typedef int (*prte_grpcomm_base_module_xcast_fn_t)(pmix_rank_t *vpids, size_t nprocs, - pmix_data_buffer_t *msg); -/* allgather - gather data from all specified daemons. Barrier operations - * will provide a zero-byte buffer. Caller will provide an array - * of daemon vpids that are participating in the allgather via the - * prte_grpcomm_coll_t object. A NULL pointer indicates that all daemons - * are participating. - * - * NOTE: this is a non-blocking call. The callback function cached in - * the prte_grpcomm_coll_t will be invoked upon completion. */ -typedef int (*prte_grpcomm_base_module_allgather_fn_t)(prte_grpcomm_coll_t *coll, - prte_pmix_mdx_caddy_t *cd); - -/* Reliable broadcast a message thru BMG. - * only need to provide a message buffer, dont need create dmns - */ -typedef int (*prte_grpcomm_base_module_rbcast_fn_t)(pmix_data_buffer_t *msg); - -typedef int (*prte_grpcomm_base_module_rbcast_register_cb_fn_t)(prte_grpcomm_rbcast_cb_t callback); +/* Scalably send a message. 
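
[Aside] The prte_pmix_grp_caddy_t introduced just above carries a prte_event_t so requests can be thread-shifted into the PRTE event base before any framework-global data is touched. A minimal sketch of that standard shift, not part of the patch; process_grp_op and shift are hypothetical names:

    static void process_grp_op(int sd, short args, void *cbdata)
    {
        prte_pmix_grp_caddy_t *cd = (prte_pmix_grp_caddy_t *)cbdata;
        PRTE_HIDE_UNUSED_PARAMS(sd, args);

        PMIX_ACQUIRE_OBJECT(cd);
        /* ... now safe to touch global state ... */
        PMIX_RELEASE(cd);
    }

    static void shift(prte_pmix_grp_caddy_t *cd)
    {
        /* hand the caddy to the event library; the callback runs
         * in the PRTE progress thread */
        prte_event_set(prte_event_base, &cd->ev, -1, PRTE_EV_WRITE,
                       process_grp_op, cd);
        PMIX_POST_OBJECT(cd);
        prte_event_active(&cd->ev, PRTE_EV_WRITE, 1);
    }
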
*/ +typedef int (*prte_grpcomm_base_module_xcast_fn_t)(prte_rml_tag_t tag, + pmix_data_buffer_t *msg); -typedef int (*prte_grpcomm_base_module_rbcast_unregister_cb_fn_t)(int type); +/* fence - gather data from all specified procs. Barrier operations + * will provide NULL data. + * + * NOTE: this is a non-blocking call. The callback function + * will be invoked upon completion. */ +typedef int (*prte_grpcomm_base_module_fence_fn_t)(const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t info[], size_t ninfo, char *data, + size_t ndata, pmix_modex_cbfunc_t cbfunc, void *cbdata); + + +/* support group operations - this is basically a fence + * operation, but there are enough differences to warrant keeping it + * separate to avoid over-complicating the fence code */ +typedef int (*prte_grpcomm_base_module_grp_fn_t)(pmix_group_operation_t op, char *grpid, + const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t directives[], size_t ndirs, + pmix_info_cbfunc_t cbfunc, void *cbdata); /* - * Ver 3.0 - internal modules + * Ver 4.0 */ typedef struct { - prte_grpcomm_base_module_init_fn_t init; - prte_grpcomm_base_module_finalize_fn_t finalize; + prte_grpcomm_base_module_init_fn_t init; + prte_grpcomm_base_module_finalize_fn_t finalize; /* collective operations */ - prte_grpcomm_base_module_xcast_fn_t xcast; - prte_grpcomm_base_module_allgather_fn_t allgather; - prte_grpcomm_base_module_rbcast_fn_t rbcast; - prte_grpcomm_base_module_rbcast_register_cb_fn_t register_cb; - prte_grpcomm_base_module_rbcast_unregister_cb_fn_t unregister_cb; + prte_grpcomm_base_module_xcast_fn_t xcast; + prte_grpcomm_base_module_fence_fn_t fence; + prte_grpcomm_base_module_grp_fn_t group; } prte_grpcomm_base_module_t; -/* the Public APIs */ -/* Scalably send a message. Caller will provide an array - * of process names that are to receive the message. A NULL - * pointer indicates that all known procs are to receive - * the message. A pointer to a name that includes PRTE_VPID_WILDCARD - * will send the message to all procs in the specified jobid. - * The message will be sent to the daemons hosting the specified - * procs for processing and relay. */ -typedef int (*prte_grpcomm_base_API_xcast_fn_t)(prte_grpcomm_signature_t *sig, prte_rml_tag_t tag, - pmix_data_buffer_t *msg); - -/* allgather - gather data from all specified procs. Barrier operations - * will provide a zero-byte buffer. Caller will provide an array - * of application proc vpids that are participating in the allgather. A NULL - * pointer indicates that all known procs are participating. A pointer - * to a name that includes PRTE_VPID_WILDCARD indicates that all procs - * in the specified jobid are contributing. - * - * NOTE: this is a non-blocking call. The provided callback function - * will be invoked upon completion. */ -typedef int (*prte_grpcomm_base_API_allgather_fn_t)(prte_pmix_mdx_caddy_t *cd); - -/* Reliable broadcast a message. Caller will provide an array - * of daemon. A NULL pointer indicates that all known daemons are in the BMG. 
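
[Aside] The new fence entry point declared above is non-blocking and reports through a pmix_modex_cbfunc_t. A minimal caller sketch, not part of the patch; fence_done and start_fence are hypothetical names:

    static void fence_done(pmix_status_t status, const char *data, size_t ndata,
                           void *cbdata, pmix_release_cbfunc_t rel, void *relcbdata)
    {
        PRTE_HIDE_UNUSED_PARAMS(status, data, ndata, cbdata);

        /* consume the aggregated blob, then let the owner reclaim it */
        if (NULL != rel) {
            rel(relcbdata);
        }
    }

    static int start_fence(const pmix_proc_t procs[], size_t nprocs,
                           char *localdata, size_t ndata)
    {
        /* returns immediately; fence_done fires once every
         * participating daemon has reported in */
        return prte_grpcomm.fence(procs, nprocs, NULL, 0,
                                  localdata, ndata, fence_done, NULL);
    }
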
- * A pointer to a name that includes ORTE_VPID_WILDCARD - * all daemons in the specified jobid.*/ -typedef int (*prte_grpcomm_base_API_rbcast_fn_t)(prte_grpcomm_signature_t *sig, prte_rml_tag_t tag, - pmix_data_buffer_t *msg); - -typedef int (*prte_grpcomm_base_API_rbcast_register_cb_fn_t)(prte_grpcomm_rbcast_cb_t callback); - -typedef int (*prte_grpcomm_base_API_rbcast_unregister_cb_fn_t)(int type); - -typedef struct { - /* collective operations */ - prte_grpcomm_base_API_xcast_fn_t xcast; - prte_grpcomm_base_API_allgather_fn_t allgather; - prte_grpcomm_base_API_rbcast_fn_t rbcast; - prte_grpcomm_base_API_rbcast_register_cb_fn_t register_cb; - prte_grpcomm_base_API_rbcast_unregister_cb_fn_t unregister_cb; -} prte_grpcomm_API_module_t; - /* * the standard component data structure */ @@ -232,12 +127,12 @@ typedef pmix_mca_base_component_t prte_grpcomm_base_component_t; /* * Macro for use in components that are of type grpcomm v3.0.0 */ -#define PRTE_GRPCOMM_BASE_VERSION_3_0_0 \ - /* grpcomm v3.0 is chained to MCA v2.0 */ \ - PRTE_MCA_BASE_VERSION_3_0_0("grpcomm", 3, 0, 0) +#define PRTE_GRPCOMM_BASE_VERSION_4_0_0 \ + /* grpcomm v4.0 is chained to MCA v2.0 */ \ + PRTE_MCA_BASE_VERSION_3_0_0("grpcomm", 4, 0, 0) /* Global structure for accessing grpcomm functions */ -PRTE_EXPORT extern prte_grpcomm_API_module_t prte_grpcomm; +PRTE_EXPORT extern prte_grpcomm_base_module_t prte_grpcomm; END_C_DECLS diff --git a/src/mca/iof/hnp/iof_hnp_send.c b/src/mca/iof/hnp/iof_hnp_send.c index af7047dbd0..9653b2a414 100644 --- a/src/mca/iof/hnp/iof_hnp_send.c +++ b/src/mca/iof/hnp/iof_hnp_send.c @@ -13,7 +13,7 @@ * Copyright (c) 2012 Los Alamos National Security, LLC * All rights reserved * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -49,7 +49,6 @@ int prte_iof_hnp_send_data_to_endpoint(const pmix_proc_t *host, { pmix_data_buffer_t *buf; int rc; - prte_grpcomm_signature_t sig; /* if the host is a daemon and we are in the process of aborting, * then ignore this request. 
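
[Aside] With the signature argument gone, callers decide between a DVM-wide relay and a point-to-point send themselves, as the surrounding IOF hunk does. A minimal sketch of that choice, not part of the patch; deliver is a hypothetical helper taking a daemon procID:

    static int deliver(const pmix_proc_t *daemon, pmix_data_buffer_t *buf,
                       prte_rml_tag_t tag)
    {
        int rc;

        if (PMIX_CHECK_NSPACE(PRTE_PROC_MY_NAME->nspace, daemon->nspace) &&
            PMIX_RANK_WILDCARD == daemon->rank) {
            /* every daemon must see it - relay down the xcast tree */
            rc = prte_grpcomm.xcast(tag, buf);
            PMIX_DATA_BUFFER_RELEASE(buf);
            return rc;
        }
        /* single daemon - send point-to-point; RML owns buf on success */
        PRTE_RML_SEND(rc, daemon->rank, buf, tag);
        return rc;
    }
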
We leave it alone if the host is not @@ -96,12 +95,8 @@ int prte_iof_hnp_send_data_to_endpoint(const pmix_proc_t *host, if (PMIX_CHECK_NSPACE(PRTE_PROC_MY_NAME->nspace, host->nspace) && PMIX_RANK_WILDCARD == host->rank) { /* xcast this to everyone - the local daemons will know how to handle it */ - PMIX_PROC_CREATE(sig.signature, 1); - sig.sz = 1; - PMIX_LOAD_PROCID(&sig.signature[0], PRTE_PROC_MY_NAME->nspace, PMIX_RANK_WILDCARD); - (void) prte_grpcomm.xcast(&sig, PRTE_RML_TAG_IOF_PROXY, buf); + (void) prte_grpcomm.xcast(PRTE_RML_TAG_IOF_PROXY, buf); PMIX_DATA_BUFFER_RELEASE(buf); - PMIX_PROC_FREE(sig.signature, 1); return PRTE_SUCCESS; } diff --git a/src/mca/plm/base/plm_base_launch_support.c b/src/mca/plm/base/plm_base_launch_support.c index a005619d19..8daf2db3f3 100644 --- a/src/mca/plm/base/plm_base_launch_support.c +++ b/src/mca/plm/base/plm_base_launch_support.c @@ -563,7 +563,6 @@ static void job_timeout_cb(int fd, short event, void *cbdata) * if we cannot do so */ prte_daemon_cmd_flag_t command = PRTE_DAEMON_GET_STACK_TRACES; pmix_data_buffer_t buffer; - prte_grpcomm_signature_t *sig; bo.bytes = "Waiting for stack traces (this may take a few moments)...\n"; bo.size = strlen(bo.bytes); @@ -590,18 +589,12 @@ static void job_timeout_cb(int fd, short event, void *cbdata) goto giveup; } /* goes to all daemons */ - sig = PMIX_NEW(prte_grpcomm_signature_t); - sig->signature = (pmix_proc_t *) malloc(sizeof(pmix_proc_t)); - PMIX_LOAD_PROCID(&sig->signature[0], PRTE_PROC_MY_NAME->nspace, PMIX_RANK_WILDCARD); - sig->sz = 1; - if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(sig, PRTE_RML_TAG_DAEMON, &buffer))) { + if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(PRTE_RML_TAG_DAEMON, &buffer))) { PRTE_ERROR_LOG(rc); PMIX_DATA_BUFFER_DESTRUCT(&buffer); goto giveup; } PMIX_DATA_BUFFER_DESTRUCT(&buffer); - /* maintain accounting */ - PMIX_RELEASE(sig); /* we will terminate after we get the stack_traces, but set a timeout * just in case we never hear back from everyone */ if (prte_stack_trace_wait_timeout > 0) { @@ -801,7 +794,6 @@ void prte_plm_base_launch_apps(int fd, short args, void *cbdata) void prte_plm_base_send_launch_msg(int fd, short args, void *cbdata) { prte_state_caddy_t *caddy = (prte_state_caddy_t *) cbdata; - prte_grpcomm_signature_t *sig; prte_job_t *jdata; int rc; PRTE_HIDE_UNUSED_PARAMS(fd, args); @@ -833,21 +825,14 @@ void prte_plm_base_send_launch_msg(int fd, short args, void *cbdata) } /* goes to all daemons */ - sig = PMIX_NEW(prte_grpcomm_signature_t); - sig->signature = (pmix_proc_t *) malloc(sizeof(pmix_proc_t)); - PMIX_LOAD_PROCID(&sig->signature[0], PRTE_PROC_MY_NAME->nspace, PMIX_RANK_WILDCARD); - sig->sz = 1; - if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(sig, PRTE_RML_TAG_DAEMON, &jdata->launch_msg))) { + if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(PRTE_RML_TAG_DAEMON, &jdata->launch_msg))) { PRTE_ERROR_LOG(rc); - PMIX_RELEASE(sig); PRTE_ACTIVATE_JOB_STATE(caddy->jdata, PRTE_JOB_STATE_NEVER_LAUNCHED); PMIX_RELEASE(caddy); return; } PMIX_DATA_BUFFER_DESTRUCT(&jdata->launch_msg); PMIX_DATA_BUFFER_CONSTRUCT(&jdata->launch_msg); - /* maintain accounting */ - PMIX_RELEASE(sig); /* track that we automatically are considered to have reported - used * only to report launch progress diff --git a/src/mca/plm/base/plm_base_prted_cmds.c b/src/mca/plm/base/plm_base_prted_cmds.c index 9cf048930f..5089b907dd 100644 --- a/src/mca/plm/base/plm_base_prted_cmds.c +++ b/src/mca/plm/base/plm_base_prted_cmds.c @@ -15,7 +15,7 @@ * Copyright (c) 2019 Research Organization for Information Science 
* and Technology (RIST). All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -50,28 +50,11 @@ #include "src/mca/plm/base/base.h" #include "src/mca/plm/base/plm_private.h" -#if 0 -static void failed_cmd(int fd, short event, void *cbdata) -{ - prte_timer_t *tm = (prte_timer_t*)cbdata; - - /* we get called if an abnormal term - * don't complete in time - just force exit - */ - PMIX_OUTPUT_VERBOSE((5, prte_plm_base_framework.framework_output, - "%s plm:base:orted_cmd command timed out", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); - PMIX_RELEASE(tm); - PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); -} -#endif - int prte_plm_base_prted_exit(prte_daemon_cmd_flag_t command) { int rc; pmix_data_buffer_t cmd; prte_daemon_cmd_flag_t cmmnd; - prte_grpcomm_signature_t *sig; PMIX_OUTPUT_VERBOSE((5, prte_plm_base_framework.framework_output, "%s plm:base:prted_cmd sending prted_exit commands", @@ -106,24 +89,11 @@ int prte_plm_base_prted_exit(prte_daemon_cmd_flag_t command) return rc; } /* goes to all daemons */ - sig = PMIX_NEW(prte_grpcomm_signature_t); - sig->signature = (pmix_proc_t *) malloc(sizeof(pmix_proc_t)); - sig->sz = 1; - PMIX_LOAD_PROCID(&sig->signature[0], PRTE_PROC_MY_NAME->nspace, PMIX_RANK_WILDCARD); - if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(sig, PRTE_RML_TAG_DAEMON, &cmd))) { + if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(PRTE_RML_TAG_DAEMON, &cmd))) { PRTE_ERROR_LOG(rc); } PMIX_DATA_BUFFER_DESTRUCT(&cmd); - PMIX_RELEASE(sig); -#if 0 - /* if we are abnormally ordering the termination, then - * set a timeout in case it never finishes - */ - if (prte_abnormal_term_ordered) { - PRTE_DETECT_TIMEOUT(prte_process_info.num_procs, 100, 3, failed_cmd, NULL); - } -#endif return rc; } @@ -157,7 +127,6 @@ int prte_plm_base_prted_kill_local_procs(pmix_pointer_array_t *procs) prte_daemon_cmd_flag_t command = PRTE_DAEMON_KILL_LOCAL_PROCS; int v; prte_proc_t *proc; - prte_grpcomm_signature_t *sig; PMIX_OUTPUT_VERBOSE((5, prte_plm_base_framework.framework_output, "%s plm:base:orted_cmd sending kill_local_procs cmds", @@ -187,15 +156,10 @@ int prte_plm_base_prted_kill_local_procs(pmix_pointer_array_t *procs) } } /* goes to all daemons */ - sig = PMIX_NEW(prte_grpcomm_signature_t); - sig->signature = (pmix_proc_t *) malloc(sizeof(pmix_proc_t)); - sig->sz = 1; - PMIX_LOAD_PROCID(&sig->signature[0], PRTE_PROC_MY_NAME->nspace, PMIX_RANK_WILDCARD); - if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(sig, PRTE_RML_TAG_DAEMON, &cmd))) { + if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(PRTE_RML_TAG_DAEMON, &cmd))) { PRTE_ERROR_LOG(rc); } PMIX_DATA_BUFFER_DESTRUCT(&cmd); - PMIX_RELEASE(sig); /* we're done! 
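
[Aside] All of these plm hunks now share one shape: pack a daemon command flag, xcast it to every daemon, destruct the buffer. A condensed sketch, not part of the patch; broadcast_cmd is a hypothetical helper and packing the flag as PMIX_UINT8 is an assumption about prte_daemon_cmd_flag_t:

    static int broadcast_cmd(prte_daemon_cmd_flag_t command)
    {
        pmix_data_buffer_t cmd;
        int rc;

        PMIX_DATA_BUFFER_CONSTRUCT(&cmd);
        rc = PMIx_Data_pack(NULL, &cmd, &command, 1, PMIX_UINT8);
        if (PMIX_SUCCESS != rc) {
            PMIX_ERROR_LOG(rc);
            PMIX_DATA_BUFFER_DESTRUCT(&cmd);
            return prte_pmix_convert_status(rc);
        }
        /* no signature needed any more - it always goes to all daemons */
        rc = prte_grpcomm.xcast(PRTE_RML_TAG_DAEMON, &cmd);
        PMIX_DATA_BUFFER_DESTRUCT(&cmd);
        return rc;
    }
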
*/ return rc; @@ -206,7 +170,6 @@ int prte_plm_base_prted_signal_local_procs(pmix_nspace_t job, int32_t signal) int rc; pmix_data_buffer_t cmd; prte_daemon_cmd_flag_t command = PRTE_DAEMON_SIGNAL_LOCAL_PROCS; - prte_grpcomm_signature_t *sig; PMIX_OUTPUT_VERBOSE((5, prte_plm_base_framework.framework_output, "%s plm:base:prted_cmd sending signal_local_procs cmds", @@ -239,15 +202,10 @@ int prte_plm_base_prted_signal_local_procs(pmix_nspace_t job, int32_t signal) } /* goes to all daemons */ - sig = PMIX_NEW(prte_grpcomm_signature_t); - sig->signature = (pmix_proc_t *) malloc(sizeof(pmix_proc_t)); - sig->sz = 1; - PMIX_LOAD_PROCID(&sig->signature[0], PRTE_PROC_MY_NAME->nspace, PMIX_RANK_WILDCARD); - if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(sig, PRTE_RML_TAG_DAEMON, &cmd))) { + if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(PRTE_RML_TAG_DAEMON, &cmd))) { PRTE_ERROR_LOG(rc); } PMIX_DATA_BUFFER_DESTRUCT(&cmd); - PMIX_RELEASE(sig); /* we're done! */ return PRTE_SUCCESS; diff --git a/src/mca/plm/base/plm_base_receive.c b/src/mca/plm/base/plm_base_receive.c index 16f255a189..47e09980df 100644 --- a/src/mca/plm/base/plm_base_receive.c +++ b/src/mca/plm/base/plm_base_receive.c @@ -270,7 +270,7 @@ void prte_plm_base_recv(int status, pmix_proc_t *sender, /* try defaulting to parent session */ if (NULL != (parent = prte_get_job_data_object(nptr->nspace))) { session = parent->session; - if (NULL == session) { + if (NULL == session) { rc = PRTE_ERR_NOT_FOUND; goto ANSWER_LAUNCH; } @@ -290,19 +290,6 @@ void prte_plm_base_recv(int status, pmix_proc_t *sender, } } -#if 0 - // (RHC) I'm not sure the following is true - merits some thought - - /* Jobs are only allowed to be spawned in the the session of the requestor - * or one of its child sessions. */ - if (NULL == session || - !prte_sessions_related(prte_get_job_data_object(nptr->nspace)->session, session)) { - PRTE_ERROR_LOG(PRTE_ERR_PERM); - rc = PRTE_ERR_PERM; - goto ANSWER_LAUNCH; - } -#endif - jdata->session = session; pmix_pointer_array_add(jdata->session->jobs, jdata); diff --git a/src/mca/state/dvm/state_dvm.c b/src/mca/state/dvm/state_dvm.c index 1f316d7d32..824cde45cc 100644 --- a/src/mca/state/dvm/state_dvm.c +++ b/src/mca/state/dvm/state_dvm.c @@ -262,7 +262,6 @@ static void vm_ready(int fd, short args, void *cbdata) prte_state_caddy_t *caddy = (prte_state_caddy_t *) cbdata; int rc, i; pmix_data_buffer_t buf; - prte_grpcomm_signature_t sig; prte_job_t *jptr; prte_proc_t *dmn; int32_t v; @@ -321,19 +320,13 @@ static void vm_ready(int fd, short args, void *cbdata) } /* goes to all daemons */ - PMIX_CONSTRUCT(&sig, prte_grpcomm_signature_t); - PMIX_PROC_CREATE(sig.signature, 1); - PMIX_LOAD_PROCID(&sig.signature[0], PRTE_PROC_MY_NAME->nspace, PMIX_RANK_WILDCARD); - sig.sz = 1; - if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(&sig, PRTE_RML_TAG_WIREUP, &buf))) { + if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(PRTE_RML_TAG_WIREUP, &buf))) { PRTE_ERROR_LOG(rc); PMIX_DATA_BUFFER_DESTRUCT(&buf); - PMIX_PROC_FREE(sig.signature, 1); PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); return; } PMIX_DATA_BUFFER_DESTRUCT(&buf); - PMIX_PROC_FREE(sig.signature, 1); } } if (PMIX_CHECK_NSPACE(PRTE_PROC_MY_NAME->nspace, caddy->jdata->nspace)) { @@ -898,7 +891,6 @@ static void dvm_notify(int sd, short args, void *cbdata) int rc; pmix_data_buffer_t *reply; prte_daemon_cmd_flag_t command; - prte_grpcomm_signature_t sig; bool notify = true, flag; pmix_proc_t *proc, pnotify; pmix_info_t *info; @@ -1043,14 +1035,9 @@ static void dvm_notify(int sd, short args, 
void *cbdata) /* we have to send the notification to all daemons so that * anyone watching for it can receive it */ - PMIX_CONSTRUCT(&sig, prte_grpcomm_signature_t); - PMIX_PROC_CREATE(sig.signature, 1); - PMIX_LOAD_PROCID(&sig.signature[0], PRTE_PROC_MY_NAME->nspace, PMIX_RANK_WILDCARD); - sig.sz = 1; - if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(&sig, PRTE_RML_TAG_NOTIFICATION, reply))) { + if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(PRTE_RML_TAG_NOTIFICATION, reply))) { PRTE_ERROR_LOG(rc); PMIX_DATA_BUFFER_RELEASE(reply); - PMIX_PROC_FREE(sig.signature, 1); PMIX_RELEASE(caddy); return; } @@ -1058,8 +1045,6 @@ static void dvm_notify(int sd, short args, void *cbdata) "%s state:dvm:dvm_notify notification sent", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); PMIX_DATA_BUFFER_RELEASE(reply); - /* maintain accounting */ - PMIX_PROC_FREE(sig.signature, 1); } if (prte_persistent) { @@ -1083,12 +1068,8 @@ static void dvm_notify(int sd, short args, void *cbdata) PMIX_DATA_BUFFER_RELEASE(reply); return; } - PMIX_PROC_CREATE(sig.signature, 1); - PMIX_LOAD_PROCID(&sig.signature[0], PRTE_PROC_MY_NAME->nspace, PMIX_RANK_WILDCARD); - sig.sz = 1; - prte_grpcomm.xcast(&sig, PRTE_RML_TAG_DAEMON, reply); + prte_grpcomm.xcast(PRTE_RML_TAG_DAEMON, reply); PMIX_DATA_BUFFER_RELEASE(reply); - PMIX_PROC_FREE(sig.signature, 1); } // We are done with our use of job data and have notified the other daemons diff --git a/src/prted/pmix/pmix_server.c b/src/prted/pmix/pmix_server.c index 6c27e9d1c9..98f07c5cb5 100644 --- a/src/prted/pmix/pmix_server.c +++ b/src/prted/pmix/pmix_server.c @@ -2053,39 +2053,6 @@ PMIX_CLASS_INSTANCE(pmix_server_req_t, pmix_object_t, rqcon, rqdes); -static void mdcon(prte_pmix_mdx_caddy_t *p) -{ - p->sig = NULL; - p->grpid = NULL; - p->buf = NULL; - PMIX_BYTE_OBJECT_CONSTRUCT(&p->ctrls); - p->procs = NULL; - p->nprocs = 0; - p->info = NULL; - p->ninfo = 0; - p->cbdata = NULL; - p->grpcbfunc = NULL; - p->mdxcbfunc = NULL; - p->infocbfunc = NULL; - p->opcbfunc = NULL; -} -static void mddes(prte_pmix_mdx_caddy_t *p) -{ - if (NULL != p->sig) { - PMIX_RELEASE(p->sig); - } - if (NULL != p->grpid) { - free(p->grpid); - } - if (NULL != p->buf) { - PMIX_DATA_BUFFER_RELEASE(p->buf); - } - PMIX_BYTE_OBJECT_DESTRUCT(&p->ctrls); -} -PMIX_CLASS_INSTANCE(prte_pmix_mdx_caddy_t, - pmix_object_t, - mdcon, mddes); - static void pscon(pmix_server_pset_t *p) { p->name = NULL; diff --git a/src/prted/pmix/pmix_server_dyn.c b/src/prted/pmix/pmix_server_dyn.c index ee3b232ebe..e2721a131a 100644 --- a/src/prted/pmix/pmix_server_dyn.c +++ b/src/prted/pmix/pmix_server_dyn.c @@ -927,398 +927,86 @@ int pmix_server_spawn_fn(const pmix_proc_t *proc, const pmix_info_t job_info[], return PRTE_SUCCESS; } -static void _cnct(int sd, short args, void *cbdata); - -static void opcbfunc(pmix_status_t status, void *cbdata) -{ - prte_pmix_lock_t *lock = (prte_pmix_lock_t *) cbdata; - lock->status = status; - PRTE_PMIX_WAKEUP_THREAD(lock); -} - -static void _cnlk(pmix_status_t status, pmix_pdata_t data[], size_t ndata, void *cbdata) +// modex callback func for connect +static void connect_release(pmix_status_t status, + const char *data, size_t sz, + void *cbdata, + pmix_release_cbfunc_t rel, void *relcbdata) { - prte_pmix_server_op_caddy_t *cd = (prte_pmix_server_op_caddy_t *) cbdata; - int cnt; - prte_job_t *jdata; - pmix_status_t ret; + pmix_server_req_t *md = (pmix_server_req_t*)cbdata; + pmix_byte_object_t bo; pmix_data_buffer_t pbkt; - prte_pmix_lock_t lock; - pmix_info_t *info = NULL; - size_t ninfo; + pmix_info_t *info = NULL, 
infostat; + pmix_proc_t *procID; + pmix_status_t rc; + int cnt; - PMIX_ACQUIRE_OBJECT(cd); + PMIX_ACQUIRE_OBJECT(md); - /* if we failed to get the required data, then just inform - * the embedded server that the connect cannot succeed */ + rc = status; if (PMIX_SUCCESS != status) { - ret = status; - goto release; - } - if (NULL == data) { - ret = PMIX_ERR_NOT_FOUND; goto release; } - /* if we have more than one data returned, that's an error */ - if (1 != ndata) { - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - ret = PMIX_ERR_BAD_PARAM; - goto release; - } - - /* the data will consist of a byte object containing - * a packed buffer of the job data */ - PMIX_DATA_BUFFER_CONSTRUCT(&pbkt); - ret = PMIx_Data_load(&pbkt, &data[0].value.data.bo); - if (PMIX_SUCCESS != ret) { - goto release; - } - data[0].value.data.bo.bytes = NULL; - data[0].value.data.bo.size = 0; - - /* extract the number of returned info */ - cnt = 1; - if (PMIX_SUCCESS != (ret = PMIx_Data_unpack(&data[0].proc, &pbkt, &ninfo, &cnt, PMIX_SIZE))) { - PMIX_DATA_BUFFER_DESTRUCT(&pbkt); - goto release; - } - if (0 < ninfo) { - PMIX_INFO_CREATE(info, ninfo); - cnt = ninfo; - if (PMIX_SUCCESS != (ret = PMIx_Data_unpack(&data[0].proc, &pbkt, info, &cnt, PMIX_INFO))) { - PMIX_DATA_BUFFER_DESTRUCT(&pbkt); - PMIX_INFO_FREE(info, ninfo); + /* process returned data */ + if (NULL != data && 0 != sz) { + /* prep for unpacking */ + PMIx_Byte_object_load(&bo, (char*)data, sz); + PMIX_DATA_BUFFER_CONSTRUCT(&pbkt); + rc = PMIx_Data_embed(&pbkt, &bo); + if (PMIX_SUCCESS != rc) { goto release; } - } - PMIX_DATA_BUFFER_DESTRUCT(&pbkt); - - /* we have to process the data to convert it into an prte_job_t - * that describes this job as we didn't already have it */ - jdata = PMIX_NEW(prte_job_t); - - /* register the data with the local server */ - PRTE_PMIX_CONSTRUCT_LOCK(&lock); - ret = PMIx_server_register_nspace(data[0].proc.nspace, jdata->num_local_procs, info, ninfo, - opcbfunc, &lock); - if (PMIX_SUCCESS != ret) { - PMIX_ERROR_LOG(ret); - PMIX_INFO_FREE(info, ninfo); - PRTE_PMIX_DESTRUCT_LOCK(&lock); - goto release; - } - PRTE_PMIX_WAIT_THREAD(&lock); - ret = lock.status; - PRTE_PMIX_DESTRUCT_LOCK(&lock); - PMIX_INFO_FREE(info, ninfo); - - /* restart the cnct processor */ - PRTE_PMIX_OPERATION(cd->procs, cd->nprocs, cd->info, cd->ninfo, _cnct, cd->cbfunc, cd->cbdata); - /* we don't need to protect the re-referenced data as - * the prte_pmix_server_op_caddy_t does not have - * a destructor! 
*/ - PMIX_RELEASE(cd); - return; - -release: - if (NULL != cd->cbfunc) { - cd->cbfunc(ret, cd->cbdata); - } - PMIX_RELEASE(cd); -} - -static void cndbfunc(pmix_status_t status, void *cbdata) -{ - prte_pmix_lock_t *lock = (prte_pmix_lock_t*)cbdata; - lock->status = status; - PRTE_PMIX_WAKEUP_THREAD(lock); -} - -static void connect_release(int status, pmix_data_buffer_t *buf, void *cbdata) -{ - prte_pmix_mdx_caddy_t *md = (prte_pmix_mdx_caddy_t*)cbdata; - pmix_nspace_t nspace; - pmix_info_t *info = NULL, infostat; - size_t ninfo = 1; - int rc = PMIX_SUCCESS; - int cnt, n=0; - prte_pmix_lock_t lock; - bool assignedID = false; - uint32_t ctxid; - bool first = true; - char *payload; - - PMIX_ACQUIRE_OBJECT(md); - - /* process returned data */ - if (NULL != buf && 0 != buf->bytes_used) { - /* check for any directives */ - payload = buf->unpack_ptr; + // the payload consists of packed info containing + // endpoint info for each involved process - we have to + // convert each entry into an array of info, and then + // load that into a PMIX_PROC_DATA info cnt = 1; - rc = PMIx_Data_unpack(NULL, buf, &infostat, &cnt, PMIX_INFO); + rc = PMIx_Data_unpack(NULL, &pbkt, &infostat, &cnt, PMIX_INFO); while (PMIX_SUCCESS == rc) { - if (PMIX_CHECK_KEY(&infostat, PMIX_GROUP_CONTEXT_ID)) { - PMIX_VALUE_GET_NUMBER(rc, &infostat.value, ctxid, uint32_t); - if (PMIX_SUCCESS != rc) { + // only interested in the endpt data + if (PMIX_CHECK_KEY(&infostat, PMIX_PROC_DATA)) { + // contains an array of info + info = (pmix_info_t*)infostat.value.data.darray->array; + // procID is in first place + procID = info[0].value.data.proc; + // register this data + rc = PMIx_server_register_nspace(procID->nspace, -1, &infostat, 1, NULL, NULL); + if (PMIX_SUCCESS != rc && PMIX_OPERATION_SUCCEEDED != rc) { PMIX_ERROR_LOG(rc); - } else { - assignedID = true; - ++ninfo; } } - /* save where we are */ - payload = buf->unpack_ptr; - /* cleanup */ PMIX_INFO_DESTRUCT(&infostat); - /* get the next object */ cnt = 1; - rc = PMIx_Data_unpack(NULL, buf, &infostat, &cnt, PMIX_INFO); + rc = PMIx_Data_unpack(NULL, &pbkt, &infostat, &cnt, PMIX_INFO); } - /* restore the unpack location as the last unsuccessful attempt will - * have moved it */ - buf->unpack_ptr = payload; - - /* create space for the info array that will be passed down */ - PMIX_INFO_CREATE(info, ninfo); - /* we will put the proc data in the first position, so put anything - * else towards the back of the array */ - n = 1; - if (assignedID) { - PMIX_INFO_LOAD(&info[n], PMIX_GROUP_CONTEXT_ID, &ctxid, PMIX_UINT32); - ++n; - } - - /* there is a byte object for each proc in the connect operation */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buf, &nspace, &cnt, PMIX_PROC_NSPACE); - while (PMIX_SUCCESS == rc) { - ++n; - /* unpack the proc data for this entry */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buf, &info[0], &cnt, PMIX_INFO); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto next; - } - if (first) { - cnt = 2; - first = false; - } else { - cnt = 1; - } - /* use the register_nspace API to enter this information - it - * will see that it already knows the nspace, and so it will - * simply cache the data for later retrieval when requested */ - PRTE_PMIX_CONSTRUCT_LOCK(&lock); - rc = PMIx_server_register_nspace(nspace, -1, info, cnt, cndbfunc, &lock); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PRTE_PMIX_DESTRUCT_LOCK(&lock); - } else { - PRTE_PMIX_WAIT_THREAD(&lock); - rc = lock.status; - PRTE_PMIX_DESTRUCT_LOCK(&lock); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - 
} - } - PMIX_INFO_DESTRUCT(&info[0]); - next: - /* get the next nspace */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buf, &nspace, &cnt, PMIX_PROC_NSPACE); + if (PMIX_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { + PMIX_ERROR_LOG(rc); + } else { + rc = PMIX_SUCCESS; } - PMIX_INFO_DESTRUCT(&info[1]); } +release: /* now release the connect call */ if (NULL != md->opcbfunc) { - md->opcbfunc(status, md->cbdata); + md->opcbfunc(rc, md->cbdata); } + if (NULL != rel) { + rel(relcbdata); + } PMIX_RELEASE(md); } -static void _cnct(int sd, short args, void *cbdata) -{ - prte_pmix_server_op_caddy_t *cd = (prte_pmix_server_op_caddy_t *) cbdata; - char **keys = NULL; - prte_job_t *jdata; - int rc = PRTE_SUCCESS; - size_t k, n, ninfo; - int m; - uint32_t uid; - pmix_value_t *val; - pmix_info_t info[2], *isrc, *idest, procdata; - prte_proc_t *proc; - pmix_data_buffer_t dbuf; - pmix_data_array_t *darray; - pmix_scope_t scope; - prte_pmix_mdx_caddy_t *md; - PRTE_HIDE_UNUSED_PARAMS(sd, args); - - PMIX_ACQUIRE_OBJECT(cd); - - /* at some point, we need to add bookeeping to track which - * procs are "connected" so we know who to notify upon - * termination or failure. For now, we have to ensure - * that we have registered all participating nspaces so - * the embedded PMIx server can provide them to the client. - * Otherwise, the client will receive an error as it won't - * be able to resolve any of the required data for the - * missing nspaces */ - - /* cycle thru the procs */ - PMIX_DATA_BUFFER_CONSTRUCT(&dbuf); - PMIX_INFO_LOAD(&info[0], PMIX_OPTIONAL, NULL, PMIX_BOOL); - scope = PMIX_REMOTE; - PMIX_INFO_LOAD(&info[1], PMIX_DATA_SCOPE, &scope, PMIX_SCOPE); - for (n = 0; n < cd->nprocs; n++) { - /* see if we have the job object for this job */ - if (NULL == (jdata = prte_get_job_data_object(cd->procs[n].nspace))) { - /* we don't know about this job. 
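
[Aside] The new connect_release earlier in this file expects each fenced entry to be a PMIX_PROC_DATA info whose array leads with the proc identity, followed by that proc's endpoint keys. A minimal composing sketch, not part of the patch; wrap_endpt is hypothetical and the PMIX_PROCID key for the leading element is an assumption:

    static pmix_status_t wrap_endpt(const pmix_proc_t *proc,
                                    pmix_info_t *endpt,   /* e.g. PMIX_PROC_URI */
                                    pmix_info_t *result)
    {
        pmix_data_array_t *darray;
        pmix_info_t *array;

        /* element 0 identifies the proc, element 1 carries its endpoint */
        PMIX_DATA_ARRAY_CREATE(darray, 2, PMIX_INFO);
        array = (pmix_info_t *)darray->array;
        PMIX_INFO_LOAD(&array[0], PMIX_PROCID, proc, PMIX_PROC);
        PMIX_INFO_XFER(&array[1], endpt);
        /* the outer info is what rides the fence */
        PMIX_INFO_LOAD(result, PMIX_PROC_DATA, darray, PMIX_DATA_ARRAY);
        return PMIX_SUCCESS;
    }
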
If our "global" data - * server is just our HNP, then we have no way of finding - * out about it, and all we can do is return an error */ - if (PMIX_CHECK_PROCID(&prte_pmix_server_globals.server, PRTE_PROC_MY_HNP)) { - rc = PRTE_ERR_NOT_SUPPORTED; - goto release; - } - /* ask the global data server for the data - if we get it, - * then we can complete the request */ - PMIX_ARGV_APPEND_NOSIZE_COMPAT(&keys, cd->procs[n].nspace); - /* we have to add the user's id to the directives */ - cd->ndirs = 1; - PMIX_INFO_CREATE(cd->directives, cd->ndirs); - uid = geteuid(); - PMIX_INFO_LOAD(&cd->directives[0], PMIX_USERID, &uid, PMIX_UINT32); - if (PRTE_SUCCESS - != (rc = pmix_server_lookup_fn(&cd->procs[n], keys, cd->directives, cd->ndirs, - _cnlk, cd))) { - PMIX_ARGV_FREE_COMPAT(keys); - PMIX_INFO_FREE(cd->directives, cd->ndirs); - goto release; - } - PMIX_ARGV_FREE_COMPAT(keys); - /* the callback function on this lookup will return us to this - * routine so we can continue the process */ - return; - } - /* we know about the job - check to ensure it has been - * registered with the local PMIx server */ - if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_NSPACE_REGISTERED, NULL, PMIX_BOOL)) { - /* it hasn't been registered yet, so register it now */ - if (PRTE_SUCCESS != (rc = prte_pmix_server_register_nspace(jdata))) { - goto release; - } - } - /* cycle thru our local children and collect any info they have posted */ - for (m=0; m < prte_local_children->size; m++) { - if (NULL == (proc = (prte_proc_t*)pmix_pointer_array_get_item(prte_local_children, m))) { - continue; - } - if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { - continue; - } - /* we cannot use PMIx_server_dmodex_request here as it will wait - * for the requested data to be posted by the process - this can lead - * to a "hang" should that process never post something. So use - * PMIx_Get instead as we can then pass an "optional" flag to it. - * Note also that we don't need any local data as the server can - * already provide it */ - rc = PMIx_Get(&proc->name, NULL, info, 2, &val); - if (PMIX_SUCCESS != rc) { - /* if the proc didn't post any remote data, then we will - * get a "not found" response - this is okay */ - continue; - } - /* we should have received a data array containing all the requested info */ - /* first pack the nspace */ - rc = PMIx_Data_pack(NULL, &dbuf, &jdata->nspace, 1, PMIX_PROC_NSPACE); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_DESTRUCT(&dbuf); - PMIX_VALUE_RELEASE(val); - goto release; - } - /* transfer the returned data to a data array suitable for PMIX_PROC_DATA */ - ninfo = 1 + val->data.darray->size; - PMIX_DATA_ARRAY_CREATE(darray, ninfo, PMIX_INFO); - idest = (pmix_info_t*)darray->array; - /* the array starts with the proc's rank */ - PMIX_INFO_LOAD(&idest[0], PMIX_RANK, &proc->name.rank, PMIX_PROC_RANK); - /* now transfer the returned data */ - isrc = (pmix_info_t*)val->data.darray->array; - for (k=1; k < ninfo; k++) { - PMIX_INFO_XFER(&idest[k], &isrc[k-1]); - } - PMIX_VALUE_RELEASE(val); - /* load the proc_data info */ - PMIX_INFO_LOAD(&procdata, PMIX_PROC_DATA, darray, PMIX_DATA_ARRAY); - /* now pack it */ - rc = PMIx_Data_pack(NULL, &dbuf, &procdata, 1, PMIX_INFO); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_DESTRUCT(&dbuf); - goto release; - } - } - } - /* PMIx server only calls us if this is a multi-node operation. 
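
[Aside] The removed code here relies on PMIx_Get with the "optional" directive so a proc that never posted remote data yields PMIX_ERR_NOT_FOUND instead of a hang, which is why it avoids PMIx_server_dmodex_request. A condensed sketch of that query, not part of the patch; try_fetch_remote is a hypothetical name:

    static pmix_status_t try_fetch_remote(const pmix_proc_t *proc,
                                          pmix_value_t **val)
    {
        pmix_info_t info[2];
        pmix_scope_t scope = PMIX_REMOTE;

        /* optional: fail fast rather than wait for a post */
        PMIX_INFO_LOAD(&info[0], PMIX_OPTIONAL, NULL, PMIX_BOOL);
        /* only data the proc published for remote consumers */
        PMIX_INFO_LOAD(&info[1], PMIX_DATA_SCOPE, &scope, PMIX_SCOPE);
        return PMIx_Get(proc, NULL, info, 2, val);
    }
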
In this - * case, we also need to (a) execute a "fence" across the participating - * nodes, (b) send along any information posted by the participants - * for "remote" scope, and (c) request assignment of a unique context ID - * that the app can use for things like a communicator ID */ - md = PMIX_NEW(prte_pmix_mdx_caddy_t); - md->sig = PMIX_NEW(prte_grpcomm_signature_t); - md->sig->sz = cd->nprocs; - md->sig->signature = (pmix_proc_t *) malloc(md->sig->sz * sizeof(pmix_proc_t)); - memcpy(md->sig->signature, cd->procs, md->sig->sz * sizeof(pmix_proc_t)); - md->buf = PMIx_Data_buffer_create(); - rc = PMIx_Data_copy_payload(md->buf, &dbuf); - PMIX_DATA_BUFFER_DESTRUCT(&dbuf); - if (PMIX_SUCCESS != rc) { - PRTE_ERROR_LOG(rc); - PMIX_RELEASE(md); - goto release; - } - /* create a buffer and load it with all the controls - * info (e.g., timeout and size estimates) the PMIx - * server provided */ - rc = prte_pack_ctrl_options(&md->ctrls, cd->info, cd->ninfo); - if (PRTE_SUCCESS != rc) { - PMIX_RELEASE(md); - goto release; - } - md->grpcbfunc = connect_release; - md->opcbfunc = cd->cbfunc; - md->cbdata = md; - md->cbdata = cd->cbdata; - - /* pass it to the global collective algorithm */ - /* pass along any data that was collected locally */ - rc = prte_grpcomm.allgather(md); - if (PMIX_SUCCESS != rc) { - PRTE_ERROR_LOG(rc); - PMIX_RELEASE(md); - goto release; - } - PMIX_RELEASE(cd); - return; - -release: - rc = prte_pmix_convert_rc(rc); - if (NULL != cd->cbfunc) { - cd->cbfunc(rc, cd->cbdata); - } - PMIX_RELEASE(cd); -} pmix_status_t pmix_server_connect_fn(const pmix_proc_t procs[], size_t nprocs, const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata) { - prte_pmix_server_op_caddy_t *op; + pmix_server_req_t *cd; + size_t n; + pmix_status_t rc; pmix_output_verbose(2, prte_pmix_server_globals.output, "%s connect called with %d procs", @@ -1328,30 +1016,24 @@ pmix_status_t pmix_server_connect_fn(const pmix_proc_t procs[], size_t nprocs, if (NULL == procs || 0 == nprocs) { return PMIX_ERR_BAD_PARAM; } - /* must thread shift this as we will be accessing global data */ - op = PMIX_NEW(prte_pmix_server_op_caddy_t); - op->procs = (pmix_proc_t *) procs; - op->nprocs = nprocs; - op->info = (pmix_info_t *) info; - op->ninfo = ninfo; -#ifdef PMIX_LOCAL_COLLECTIVE_STATUS - if (NULL != info) { - if (PMIX_CHECK_KEY(&info[ninfo-1], PMIX_LOCAL_COLLECTIVE_STATUS)) { - op->status = info[ninfo-1].value.data.status; + /* PMIx server only calls us if this is a multi-node operation. 
In this + * case, we also need to (a) execute a "fence" across the participating + * nodes, and (b) send along any endpt information posted by the participants + * for "remote" scope */ + + cd = PMIX_NEW(pmix_server_req_t); + for (n=0; n < ninfo; n++) { + if (PMIX_CHECK_KEY(&info[n], PMIX_PROC_DATA)) { + PMIx_Data_pack(NULL, &cd->msg, (pmix_info_t*)&info[n], 1, PMIX_INFO); } - } else { - op->status = PMIX_SUCCESS; } -#else - op->status = PMIX_SUCCESS; -#endif - op->cbfunc = cbfunc; - op->cbdata = cbdata; - prte_event_set(prte_event_base, &(op->ev), -1, PRTE_EV_WRITE, _cnct, op); - PMIX_POST_OBJECT(op); - prte_event_active(&(op->ev), PRTE_EV_WRITE, 1); + cd->opcbfunc = cbfunc; + cd->cbdata = cbdata; - return PMIX_SUCCESS; + rc = prte_grpcomm.fence(procs, nprocs, info, ninfo, + cd->msg.unpack_ptr, cd->msg.bytes_used, + connect_release, cd); + return rc; } static void mdxcbfunc(pmix_status_t status, diff --git a/src/prted/pmix/pmix_server_fence.c b/src/prted/pmix/pmix_server_fence.c index 11623ea55b..176cfe561a 100644 --- a/src/prted/pmix/pmix_server_fence.c +++ b/src/prted/pmix/pmix_server_fence.c @@ -48,34 +48,6 @@ #include "src/prted/pmix/pmix_server.h" #include "src/prted/pmix/pmix_server_internal.h" -static void relcb(void *cbdata) -{ - uint8_t *data = (uint8_t *) cbdata; - - if (NULL != data) { - free(data); - } -} -static void pmix_server_release(int status, pmix_data_buffer_t *buf, void *cbdata) -{ - prte_pmix_mdx_caddy_t *cd = (prte_pmix_mdx_caddy_t *) cbdata; - pmix_byte_object_t bo; - int rc = PRTE_SUCCESS; - - PMIX_ACQUIRE_OBJECT(cd); - - /* unload the buffer */ - PMIX_BYTE_OBJECT_CONSTRUCT(&bo); - if (NULL != buf) { - rc = PMIx_Data_unload(buf, &bo); - } - if (PRTE_SUCCESS == rc) { - rc = status; - } - cd->mdxcbfunc(rc, bo.bytes, bo.size, cd->cbdata, relcb, bo.bytes); - PMIX_RELEASE(cd); -} - /* this function is called when all the local participants have * called fence - thus, the collective is already locally * complete at this point. 
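
For reference, a sketch of how a server-side caller can drive the consolidated fence entry
point introduced above. The tracker type and function names are hypothetical; the completion
callback uses the standard pmix_modex_cbfunc_t signature that prte_grpcomm.fence() is handed
in this patch:

    #include "src/mca/grpcomm/grpcomm.h"   /* assumed PRTE-internal header */

    typedef struct {
        pmix_status_t status;   /* hypothetical completion tracker */
    } my_tracker_t;

    static void my_fence_done(pmix_status_t status, const char *data, size_t ndata,
                              void *cbdata, pmix_release_cbfunc_t relfn, void *relcbd)
    {
        my_tracker_t *trk = (my_tracker_t *)cbdata;

        trk->status = status;       /* record the collective's outcome */
        (void)data; (void)ndata;    /* the modex payload would be consumed here */
        if (NULL != relfn) {
            relfn(relcbd);          /* let the owner release the payload */
        }
    }

    /* start a fence across "procs", contributing no local payload */
    static pmix_status_t start_fence(const pmix_proc_t procs[], size_t nprocs,
                                     const pmix_info_t info[], size_t ninfo,
                                     my_tracker_t *trk)
    {
        return prte_grpcomm.fence(procs, nprocs, info, ninfo,
                                  NULL, 0, my_fence_done, trk);
    }
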
diff --git a/src/prted/pmix/pmix_server_gen.c b/src/prted/pmix/pmix_server_gen.c
index 7e0f60f19c..ad83c95be3 100644
--- a/src/prted/pmix/pmix_server_gen.c
+++ b/src/prted/pmix/pmix_server_gen.c
@@ -345,7 +345,6 @@ pmix_status_t pmix_server_notify_event(pmix_status_t code, const pmix_proc_t *so
                                        pmix_op_cbfunc_t cbfunc, void *cbdata)
 {
     int rc;
-    prte_grpcomm_signature_t *sig;
     pmix_data_buffer_t pbkt;
     pmix_status_t ret;
     size_t n;
@@ -426,29 +425,12 @@ pmix_status_t pmix_server_notify_event(pmix_status_t code, const pmix_proc_t *so
         }
     }
 
-    /* goes to all daemons */
-    sig = PMIX_NEW(prte_grpcomm_signature_t);
-    if (NULL == sig) {
-        PMIX_DATA_BUFFER_DESTRUCT(&pbkt);
-        return PMIX_ERR_NOMEM;
-    }
-    sig->signature = (pmix_proc_t *) malloc(sizeof(pmix_proc_t));
-    if (NULL == sig->signature) {
-        PMIX_DATA_BUFFER_DESTRUCT(&pbkt);
-        PMIX_RELEASE(sig);
-        return PMIX_ERR_NOMEM;
-    }
-    PMIX_LOAD_PROCID(&sig->signature[0], PRTE_PROC_MY_NAME->nspace, PMIX_RANK_WILDCARD);
-    sig->sz = 1;
-    if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(sig, PRTE_RML_TAG_NOTIFICATION, &pbkt))) {
+    if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(PRTE_RML_TAG_NOTIFICATION, &pbkt))) {
         PRTE_ERROR_LOG(rc);
         PMIX_DATA_BUFFER_DESTRUCT(&pbkt);
-        PMIX_RELEASE(sig);
         return PMIX_ERROR;
     }
     PMIX_DATA_BUFFER_DESTRUCT(&pbkt);
-    /* maintain accounting */
-    PMIX_RELEASE(sig);
 
 done:
     /* we do not need to execute a callback as we did this atomically */
@@ -819,7 +801,6 @@ pmix_status_t pmix_server_job_ctrl_fn(const pmix_proc_t *requestor, const pmix_p
     pmix_pointer_array_t parray, *ptrarray;
     pmix_data_buffer_t *cmd;
     prte_daemon_cmd_flag_t cmmnd;
-    prte_grpcomm_signature_t *sig;
     pmix_proc_t *proct;
     PRTE_HIDE_UNUSED_PARAMS(cbfunc, cbdata);
 
@@ -882,16 +863,10 @@ pmix_status_t pmix_server_job_ctrl_fn(const pmix_proc_t *requestor, const pmix_p
             PMIX_DATA_BUFFER_RELEASE(cmd);
             return rc;
         }
-        /* goes to all daemons */
-        sig = PMIX_NEW(prte_grpcomm_signature_t);
-        sig->signature = (pmix_proc_t *) malloc(sizeof(pmix_proc_t));
-        sig->sz = 1;
-        PMIX_LOAD_PROCID(&sig->signature[0], PRTE_PROC_MY_NAME->nspace, PMIX_RANK_WILDCARD);
-        if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(sig, PRTE_RML_TAG_DAEMON, cmd))) {
+        if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(PRTE_RML_TAG_DAEMON, cmd))) {
            PRTE_ERROR_LOG(rc);
         }
         PMIX_DATA_BUFFER_RELEASE(cmd);
-        PMIX_RELEASE(sig);
         if (PMIX_SUCCESS != rc) {
             return rc;
         }
@@ -934,16 +909,10 @@ pmix_status_t pmix_server_job_ctrl_fn(const pmix_proc_t *requestor, const pmix_p
             PMIX_DATA_BUFFER_RELEASE(cmd);
             return rc;
         }
-        /* goes to all daemons */
-        sig = PMIX_NEW(prte_grpcomm_signature_t);
-        sig->signature = (pmix_proc_t *) malloc(sizeof(pmix_proc_t));
-        sig->sz = 1;
-        PMIX_LOAD_PROCID(&sig->signature[0], PRTE_PROC_MY_NAME->nspace, PMIX_RANK_WILDCARD);
-        if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(sig, PRTE_RML_TAG_DAEMON, cmd))) {
+        if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(PRTE_RML_TAG_DAEMON, cmd))) {
            PRTE_ERROR_LOG(rc);
         }
         PMIX_DATA_BUFFER_RELEASE(cmd);
-        PMIX_RELEASE(sig);
         if (PMIX_SUCCESS != rc) {
             return rc;
         }
@@ -983,16 +952,10 @@ pmix_status_t pmix_server_job_ctrl_fn(const pmix_proc_t *requestor, const pmix_p
             PMIX_DATA_BUFFER_RELEASE(cmd);
             return rc;
         }
-        /* goes to all daemons */
-        sig = PMIX_NEW(prte_grpcomm_signature_t);
-        sig->signature = (pmix_proc_t *) malloc(sizeof(pmix_proc_t));
-        sig->sz = 1;
-        PMIX_LOAD_PROCID(&sig->signature[0], PRTE_PROC_MY_NAME->nspace, PMIX_RANK_WILDCARD);
-        if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(sig, PRTE_RML_TAG_DAEMON, cmd))) {
+        if (PRTE_SUCCESS != (rc = prte_grpcomm.xcast(PRTE_RML_TAG_DAEMON, cmd))) {
            PRTE_ERROR_LOG(rc);
         }
         PMIX_DATA_BUFFER_RELEASE(cmd);
-        PMIX_RELEASE(sig);
         if (PMIX_SUCCESS != rc) {
             return rc;
         }
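
With the signature argument gone, broadcasting to the daemons reduces to packing a buffer
and handing the tag plus buffer to xcast, as the hunks above show. A hedged sketch, assuming
the daemon command continues to be packed as PMIX_UINT8 as elsewhere in the tree
(broadcast_halt_cmd is a hypothetical helper):

    #include "src/prted/pmix/pmix_server_internal.h"   /* assumed PRTE-internal header */

    static int broadcast_halt_cmd(void)
    {
        pmix_data_buffer_t *cmd;
        prte_daemon_cmd_flag_t cmmnd = PRTE_DAEMON_HALT_VM_CMD;
        pmix_status_t rc;
        int ret;

        PMIX_DATA_BUFFER_CREATE(cmd);
        rc = PMIx_Data_pack(NULL, cmd, &cmmnd, 1, PMIX_UINT8);
        if (PMIX_SUCCESS != rc) {
            PMIX_ERROR_LOG(rc);
            PMIX_DATA_BUFFER_RELEASE(cmd);
            return prte_pmix_convert_status(rc);
        }
        /* the refactored xcast implicitly targets all daemons */
        ret = prte_grpcomm.xcast(PRTE_RML_TAG_DAEMON, cmd);
        if (PRTE_SUCCESS != ret) {
            PRTE_ERROR_LOG(ret);
        }
        PMIX_DATA_BUFFER_RELEASE(cmd);
        return ret;
    }
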
diff --git a/src/prted/pmix/pmix_server_group.c b/src/prted/pmix/pmix_server_group.c
index c9f39fb0f4..37cbdc10a7 100644
--- a/src/prted/pmix/pmix_server_group.c
+++ b/src/prted/pmix/pmix_server_group.c
@@ -60,220 +60,130 @@ static void relcb(void *cbdata)
 {
-    prte_pmix_mdx_caddy_t *cd = (prte_pmix_mdx_caddy_t *) cbdata;
-
-    if (NULL != cd->info) {
-        PMIX_INFO_FREE(cd->info, cd->ninfo);
-    }
+    prte_pmix_grp_caddy_t *cd = (prte_pmix_grp_caddy_t*)cbdata;
     PMIX_RELEASE(cd);
 }
 
-static void group_release(int status, pmix_data_buffer_t *buf, void *cbdata)
-{
-    prte_pmix_mdx_caddy_t *cd = (prte_pmix_mdx_caddy_t *) cbdata;
-    int32_t cnt;
-    pmix_status_t rc = PMIX_SUCCESS;
-    bool assignedID = false;
-    size_t cid;
-    pmix_proc_t *members = NULL, *finmembers = NULL;
-    size_t num_members, nfinmembers;
-    pmix_data_array_t darray;
-    pmix_info_t info;
-    pmix_data_buffer_t dbuf;
-    pmix_byte_object_t bo;
-    int32_t byused;
-    pmix_server_pset_t *pset;
-    void *ilist;
-
-    PMIX_ACQUIRE_OBJECT(cd);
-
-    pmix_output_verbose(2, prte_pmix_server_globals.output,
-                        "%s group request complete",
-                        PRTE_NAME_PRINT(PRTE_PROC_MY_NAME));
-
-    if (PRTE_SUCCESS != status) {
-        rc = prte_pmix_convert_rc(status);
-        goto complete;
-    }
-
-    /* if this was a destruct operation, then there is nothing
-     * further we need do */
-    if (PMIX_GROUP_DESTRUCT == cd->op) {
-        /* find this group ID on our list of groups */
-        PMIX_LIST_FOREACH(pset, &prte_pmix_server_globals.groups, pmix_server_pset_t)
-        {
-            if (0 == strcmp(pset->name, cd->grpid)) {
-                pmix_list_remove_item(&prte_pmix_server_globals.groups, &pset->super);
-                PMIX_RELEASE(pset);
-                break;
-            }
-        }
-        rc = status;
-        goto complete;
-    }
-
-    /* check for any directives */
-    cnt = 1;
-    rc = PMIx_Data_unpack(NULL, buf, &bo, &cnt, PMIX_BYTE_OBJECT);
-    if (PMIX_SUCCESS != rc) {
-        PMIX_ERROR_LOG(rc);
-        goto complete;
-    }
-    PMIX_DATA_BUFFER_CONSTRUCT(&dbuf);
-    PMIX_DATA_BUFFER_LOAD(&dbuf, bo.bytes, bo.size);
-
-    cnt = 1;
-    rc = PMIx_Data_unpack(NULL, &dbuf, &info, &cnt, PMIX_INFO);
-    while (PMIX_SUCCESS == rc) {
-        if (PMIX_CHECK_KEY(&info, PMIX_GROUP_CONTEXT_ID)) {
-            PMIX_VALUE_GET_NUMBER(rc, &info.value, cid, size_t);
-            if (PMIX_SUCCESS != rc) {
-                PMIX_ERROR_LOG(rc);
-                PMIX_DATA_BUFFER_DESTRUCT(&dbuf);
-                goto complete;
-            }
-            assignedID = true;
-
-        } else if (PMIX_CHECK_KEY(&info, PMIX_GROUP_ADD_MEMBERS)) {
-            num_members = info.value.data.darray->size;
-            PMIX_PROC_CREATE(members, num_members);
-            memcpy(members, info.value.data.darray->array, num_members * sizeof(pmix_proc_t));
-
-        } else if (PMIX_CHECK_KEY(&info, PMIX_GROUP_MEMBERSHIP)) {
-            nfinmembers = info.value.data.darray->size;
-            PMIX_PROC_CREATE(finmembers, nfinmembers);
-            memcpy(finmembers, info.value.data.darray->array, nfinmembers * sizeof(pmix_proc_t));
-        }
-        /* cleanup */
-        PMIX_INFO_DESTRUCT(&info);
-        /* get the next object */
-        cnt = 1;
-        rc = PMIx_Data_unpack(NULL, &dbuf, &info, &cnt, PMIX_INFO);
-    }
-    PMIX_DATA_BUFFER_DESTRUCT(&dbuf);
-
-    /* the unpacking loop will have ended when the unpack either
-     * went past the end of the buffer */
-    if (PMIX_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
-        PMIX_ERROR_LOG(rc);
-        goto complete;
-    }
-    rc = PMIX_SUCCESS;
-
-    if (PMIX_GROUP_CONSTRUCT == cd->op) {
-        /* add it to our list of known groups */
-        pset = PMIX_NEW(pmix_server_pset_t);
-        pset->name = strdup(cd->grpid);
-        if (NULL != finmembers) {
-            pset->num_members = nfinmembers;
-            PMIX_PROC_CREATE(pset->members, pset->num_members);
-            memcpy(pset->members, finmembers, nfinmembers * sizeof(pmix_proc_t));
-        } else {
-            pset->num_members = cd->nprocs;
-            PMIX_PROC_CREATE(pset->members, pset->num_members);
-            memcpy(pset->members, cd->procs, cd->nprocs * sizeof(pmix_proc_t));
-        }
-        pmix_list_append(&prte_pmix_server_globals.groups, &pset->super);
-    }
-
-    /* if anything is left in the buffer, then it is
-     * modex data that needs to be stored */
-    PMIX_BYTE_OBJECT_CONSTRUCT(&bo);
-    byused = buf->bytes_used - (buf->unpack_ptr - buf->base_ptr);
-    if (0 < byused) {
-        bo.bytes = buf->unpack_ptr;
-        bo.size = byused;
-    }
-
-    PMIX_INFO_LIST_START(ilist);
-    // pass back the final group membership
-    darray.type = PMIX_PROC;
-    if (NULL != finmembers) {
-        darray.array = finmembers;
-        darray.size = nfinmembers;
-    } else {
-        darray.array = cd->procs;
-        darray.size = cd->nprocs;
-    }
-    PMIX_INFO_LIST_ADD(rc, ilist, PMIX_GROUP_MEMBERSHIP, &darray, PMIX_DATA_ARRAY);
-
-    if (assignedID) {
-        PMIX_INFO_LIST_ADD(rc, ilist, PMIX_GROUP_CONTEXT_ID, &cid, PMIX_SIZE);
-    }
-
-    if (NULL != bo.bytes && 0 < bo.size) {
-        PMIX_INFO_LIST_ADD(rc, ilist, PMIX_GROUP_ENDPT_DATA, &bo, PMIX_BYTE_OBJECT);
-    }
+static void opcbfunc(int status, void *cbdata)
+{
+    prte_pmix_grp_caddy_t *cd = (prte_pmix_grp_caddy_t*)cbdata;
+    PRTE_HIDE_UNUSED_PARAMS(status);
 
-    if (NULL != members) {
-        darray.array = members;
-        darray.size = num_members;
-        PMIX_INFO_LIST_ADD(rc, ilist, PMIX_GROUP_ADD_MEMBERS, &darray, PMIX_DATA_ARRAY);
-    }
-    PMIX_INFO_LIST_CONVERT(rc, ilist, &darray);
-    cd->info = (pmix_info_t*)darray.array;
-    cd->ninfo = darray.size;
-    PMIX_INFO_LIST_RELEASE(ilist);
-
-complete:
-    if (NULL != cd->procs) {
-        PMIX_PROC_FREE(cd->procs, cd->nprocs);
-    }
-    if (NULL != finmembers) {
-        PMIX_PROC_FREE(finmembers, nfinmembers);
-    }
-    if (NULL != members) {
-        PMIX_PROC_FREE(members, num_members);
-    }
-    /* return to the local procs in the collective */
-    if (NULL != cd->infocbfunc) {
-        cd->infocbfunc(rc, cd->info, cd->ninfo, cd->cbdata, relcb, cd);
-    } else {
-        if (NULL != cd->info) {
-            PMIX_INFO_FREE(cd->info, cd->ninfo);
-        }
-        PMIX_RELEASE(cd);
-    }
+    PMIX_RELEASE(cd);
 }
 
 static void local_complete(int sd, short args, void *cbdata)
 {
-    prte_pmix_mdx_caddy_t *cd = (prte_pmix_mdx_caddy_t*)cbdata;
+    prte_pmix_grp_caddy_t *cd = (prte_pmix_grp_caddy_t*)cbdata;
+    prte_pmix_grp_caddy_t *cd2;
     pmix_server_pset_t *pset;
-    pmix_data_array_t *members;
+    pmix_data_array_t members = PMIX_DATA_ARRAY_STATIC_INIT;
+    pmix_proc_t *addmembers = NULL;
+    size_t nmembers = 0, naddmembers = 0;
     pmix_proc_t *p;
+    void *ilist;
+    pmix_status_t rc;
+    size_t n;
+    pmix_data_array_t darray;
+    pmix_data_buffer_t dbuf;
+    pmix_byte_object_t bo;
     PRTE_HIDE_UNUSED_PARAMS(sd, args);
 
     if (PMIX_GROUP_CONSTRUCT == cd->op) {
-        // construct the group membership
-        members = PMIx_Data_array_create(cd->nprocs, PMIX_PROC);
-        p = (pmix_proc_t*)members->array;
+        PMIX_INFO_LIST_START(ilist);
+
+        for (n=0; n < cd->ndirs; n++) {
+            // check if they gave us any grp or endpt info
+            if (PMIX_CHECK_KEY(&cd->directives[n], PMIX_PROC_DATA) ||
+                PMIX_CHECK_KEY(&cd->directives[n], PMIX_GROUP_INFO)) {
+                rc = PMIx_Info_list_add_value(ilist, cd->directives[n].key, &cd->directives[n].value);
+                if (PMIX_SUCCESS != rc) {
+                    PMIX_ERROR_LOG(rc);
+                }
+                // check for add members - server lib would have aggregated them
+            } else if (PMIX_CHECK_KEY(&cd->directives[n], PMIX_GROUP_ADD_MEMBERS)) {
+                naddmembers = cd->directives[n].value.data.darray->size;
+                addmembers = (pmix_proc_t*)cd->directives[n].value.data.darray->array;
+            }
+        }
+
+        // construct the final group membership
+        nmembers = cd->nprocs + naddmembers;
+        PMIX_DATA_ARRAY_CONSTRUCT(&members, nmembers, PMIX_PROC);
+        p = (pmix_proc_t*)members.array;
         memcpy(p, cd->procs, cd->nprocs * sizeof(pmix_proc_t));
-        cd->ninfo = 2;
-        PMIX_INFO_CREATE(cd->info, cd->ninfo);
-        PMIX_LOAD_KEY(cd->info[0].key, PMIX_GROUP_MEMBERSHIP);
-        cd->info[0].value.type = PMIX_DATA_ARRAY;
-        cd->info[0].value.data.darray = members;
+        if (0 < naddmembers) {
+            memcpy(&p[cd->nprocs], addmembers, naddmembers * sizeof(pmix_proc_t));
+        }
+        PMIX_INFO_LIST_ADD(rc, ilist, PMIX_GROUP_MEMBERSHIP, &members, PMIX_DATA_ARRAY);
+        if (PMIX_SUCCESS != rc) {
+            PMIX_ERROR_LOG(rc);
+        }
 
-        PMIX_LOAD_KEY(cd->info[1].key, PMIX_GROUP_ID);
-        cd->info[1].value.type = PMIX_STRING;
-        cd->info[1].value.data.string = strdup(cd->grpid);
+        PMIX_INFO_LIST_ADD(rc, ilist, PMIX_GROUP_ID, cd->grpid, PMIX_STRING);
+        if (PMIX_SUCCESS != rc) {
+            PMIX_ERROR_LOG(rc);
+        }
 
         /* add it to our list of known groups */
         pset = PMIX_NEW(pmix_server_pset_t);
         pset->name = strdup(cd->grpid);
-        pset->num_members = cd->nprocs;
-        PMIX_PROC_CREATE(pset->members, pset->num_members);
-        memcpy(pset->members, cd->procs, cd->nprocs * sizeof(pmix_proc_t));
+        pset->num_members = nmembers;
+        PMIX_PROC_CREATE(pset->members, nmembers);
+        memcpy(pset->members, p, nmembers * sizeof(pmix_proc_t));
         pmix_list_append(&prte_pmix_server_globals.groups, &pset->super);
 
-        // protect the procs array
-        cd->procs = NULL;
-        cd->nprocs = 0;
+        // convert the info list
+        PMIX_INFO_LIST_CONVERT(rc, ilist, &darray);
+        cd->info = (pmix_info_t*)darray.array;
+        cd->ninfo = darray.size;
+        PMIX_INFO_LIST_RELEASE(ilist);
+
+        // generate events for any add members as they are waiting for notification
+        if (NULL != addmembers) {
+
+            cd2 = PMIX_NEW(prte_pmix_grp_caddy_t);
+            cd2->ninfo = cd->ninfo + 3;
+            PMIX_INFO_CREATE(cd2->info, cd2->ninfo);
+            // carry over the info we created
+            for (n=0; n < cd->ninfo; n++) {
+                rc = PMIx_Info_xfer(&cd2->info[n], &cd->info[n]);
+                if (PMIX_SUCCESS != rc) {
+                    PMIX_ERROR_LOG(rc);
+                }
+            }
+
+            // set the range to be only procs that were added
+            darray.type = PMIX_PROC;
+            darray.array = addmembers;
+            darray.size = naddmembers;
+            // load the array - note: this copies the array!
+            PMIX_INFO_LOAD(&cd2->info[n], PMIX_EVENT_CUSTOM_RANGE, &darray, PMIX_DATA_ARRAY);
+            ++n;
+
+            // mark that this event stays local and does not go up to the host
+            PMIX_INFO_LOAD(&cd2->info[n], PMIX_EVENT_STAYS_LOCAL, NULL, PMIX_BOOL);
+            ++n;
+
+            // add the job-level info
+            PMIX_DATA_BUFFER_CONSTRUCT(&dbuf);
+            rc = PMIx_server_collect_job_info(p, nmembers, &dbuf);
+            if (PMIX_SUCCESS == rc) {
+                PMIx_Data_buffer_unload(&dbuf, &bo.bytes, &bo.size);
+                PMIX_INFO_LOAD(&cd2->info[n], PMIX_GROUP_JOB_INFO, &bo, PMIX_BYTE_OBJECT);
+                PMIX_BYTE_OBJECT_DESTRUCT(&bo);
+            }
+            PMIX_DATA_BUFFER_DESTRUCT(&dbuf);
+
+            // notify local procs
+            PMIx_Notify_event(PMIX_GROUP_INVITED, &prte_process_info.myproc,
+                              PMIX_RANGE_CUSTOM,
+                              cd2->info, cd2->ninfo, opcbfunc, cd2);
+        }
 
-        // return this to them
-        cd->infocbfunc(PMIX_SUCCESS, cd->info, cd->ninfo, cd->cbdata, relcb, cd);
+        // return this to the PMIx server
+        cd->cbfunc(PMIX_SUCCESS, cd->info, cd->ninfo, cd->cbdata, relcb, cd);
 
     } else {
         /* find this group ID on our list of groups and remove it */
@@ -285,13 +195,9 @@ static void local_complete(int sd, short args, void *cbdata)
                 break;
             }
         }
-        // return their callback
-        cd->infocbfunc(PMIX_SUCCESS, NULL, 0, cd->cbdata, NULL, NULL);
-        // protect the procs array
-        cd->procs = NULL;
-        cd->nprocs = 0;
-        PMIX_RELEASE(cd);
+        // return their callback
+        cd->cbfunc(PMIX_SUCCESS, NULL, 0, cd->cbdata, relcb, cd);
     }
 }
 
@@ -300,19 +206,12 @@ pmix_status_t pmix_server_group_fn(pmix_group_operation_t op, char *grpid,
                                    const pmix_proc_t procs[], size_t nprocs,
                                    const pmix_info_t directives[], size_t ndirs,
                                    pmix_info_cbfunc_t cbfunc, void *cbdata)
 {
-    prte_pmix_mdx_caddy_t *cd;
+    prte_pmix_grp_caddy_t *cd;
     int rc;
     size_t i;
     bool assignID = false;
     bool fence = false;
    bool force_local = false;
-    pmix_proc_t *members = NULL;
-    pmix_proc_t *mbrs, *p;
-    size_t num_members = 0;
-    size_t nmembers;
-    size_t bootstrap = 0;
-    bool copied = false;
-    pmix_byte_object_t *bo = NULL;
     struct timeval tv = {0, 0};
 
     pmix_output_verbose(2, prte_pmix_server_globals.output,
@@ -333,134 +232,46 @@ pmix_status_t pmix_server_group_fn(pmix_group_operation_t op, char *grpid,
 
         } else if (PMIX_CHECK_KEY(&directives[i], PMIX_EMBED_BARRIER)) {
             fence = PMIX_INFO_TRUE(&directives[i]);
 
-        } else if (PMIX_CHECK_KEY(&directives[i], PMIX_GROUP_ENDPT_DATA)) {
-            bo = (pmix_byte_object_t *) &directives[i].value.data.bo;
-
         } else if (PMIX_CHECK_KEY(&directives[i], PMIX_TIMEOUT)) {
             tv.tv_sec = directives[i].value.data.uint32;
 
         } else if (PMIX_CHECK_KEY(&directives[i], PMIX_GROUP_LOCAL_ONLY)) {
             force_local = PMIX_INFO_TRUE(&directives[i]);
-
-#ifdef PMIX_GROUP_BOOTSTRAP
-        } else if (PMIX_CHECK_KEY(&directives[i], PMIX_GROUP_BOOTSTRAP)) {
-            PMIX_VALUE_GET_NUMBER(rc, &directives[i].value, bootstrap, size_t);
-            if (PMIX_SUCCESS != rc) {
-                return rc;
-            }
-#endif
-
-        } else if (PMIX_CHECK_KEY(&directives[i], PMIX_GROUP_ADD_MEMBERS)) {
-            // there can be more than one entry here as this is the aggregate
-            // of info keys from local procs that called group_construct
-            if (NULL == members) {
-                members = (pmix_proc_t*)directives[i].value.data.darray->array;
-                num_members = directives[i].value.data.darray->size;
-            } else {
-                copied = true;
-                // need to aggregate these
-                mbrs = (pmix_proc_t*)directives[i].value.data.darray->array;
-                nmembers = directives[i].value.data.darray->size;
-                // create a new array
-                PMIX_PROC_CREATE(p, nmembers * num_members);
-                // xfer data across
-                memcpy(p, members, num_members * sizeof(pmix_proc_t));
-                memcpy(&p[num_members], mbrs, nmembers * sizeof(pmix_proc_t));
-                // release the old array
-                PMIX_PROC_FREE(members, num_members);
-                // complete the xfer
-                members = p;
-                num_members = num_members + nmembers;
-            }
         }
     }
 
     if (0 < tv.tv_sec) {
-        if (copied) {
-            PMIX_PROC_FREE(members, num_members);
-        }
-        return PMIX_ERR_NOT_SUPPORTED;
+        return PMIX_ERR_NOT_SUPPORTED;
     }
 
     /* if they don't want us to do a fence and they don't want a
      * context id assigned and they aren't adding members, or they
      * insist on forcing local completion of the operation, then
      * we are done */
-    if ((!fence && !assignID && NULL == members) || force_local) {
+    if ((!fence && !assignID) || force_local) {
         pmix_output_verbose(2, prte_pmix_server_globals.output,
                             "%s group request - purely local",
                             PRTE_NAME_PRINT(PRTE_PROC_MY_NAME));
         if (force_local && assignID) {
             // we cannot do that
-            if (copied) {
-                PMIX_PROC_FREE(members, num_members);
-            }
             return PMIX_ERR_BAD_PARAM;
         }
-        cd = PMIX_NEW(prte_pmix_mdx_caddy_t);
+        cd = PMIX_NEW(prte_pmix_grp_caddy_t);
         cd->op = op;
         cd->grpid = strdup(grpid);
-        cd->procs = (pmix_proc_t*)procs;
+        cd->procs = procs;
         cd->nprocs = nprocs;
-        cd->infocbfunc = cbfunc;
+        cd->directives = directives;
+        cd->ndirs = ndirs;
+        cd->cbfunc = cbfunc;
         cd->cbdata = cbdata;
         PRTE_PMIX_THREADSHIFT(cd, prte_event_base, local_complete);
-        if (copied) {
-            PMIX_PROC_FREE(members, num_members);
-        }
         return PMIX_SUCCESS;
     }
 
-    cd = PMIX_NEW(prte_pmix_mdx_caddy_t);
-    cd->op = op;
-    cd->grpid = strdup(grpid);
-    /* have to copy the procs in case we add members */
-    PMIX_PROC_CREATE(cd->procs, nprocs);
-    memcpy(cd->procs, procs, nprocs * sizeof(pmix_proc_t));
-    cd->nprocs = nprocs;
-    cd->grpcbfunc = group_release;
-    cd->infocbfunc = cbfunc;
-    cd->cbdata = cbdata;
-
-    /* compute the signature of this collective */
-    cd->sig = PMIX_NEW(prte_grpcomm_signature_t);
-    cd->sig->groupID = strdup(grpid);
-    if (NULL != procs) {
-        cd->sig->sz = nprocs;
-        cd->sig->signature = (pmix_proc_t *) malloc(cd->sig->sz * sizeof(pmix_proc_t));
-        memcpy(cd->sig->signature, procs, cd->sig->sz * sizeof(pmix_proc_t));
-    }
-    cd->sig->bootstrap = bootstrap;
-    if (NULL != members) {
-        cd->sig->nmembers = num_members;
-        if (copied) {
-            cd->sig->addmembers = members;
-        } else {
-            cd->sig->addmembers = (pmix_proc_t *) malloc(num_members * sizeof(pmix_proc_t));
-            memcpy(cd->sig->addmembers, members, num_members * sizeof(pmix_proc_t));
-        }
-    }
-    /* setup the ctrls blob - this will include any "add_members" directive */
-    rc = prte_pack_ctrl_options(&cd->ctrls, directives, ndirs);
-    if (PMIX_SUCCESS != rc) {
-        PMIX_RELEASE(cd);
-        return rc;
-    }
-    PMIX_DATA_BUFFER_CREATE(cd->buf);
-    /* if they provided us with a data blob, send it along */
-    if (NULL != bo) {
-        /* We don't own the byte_object and so we have to
-         * copy it here */
-        rc = PMIx_Data_embed(cd->buf, bo);
-        if (PMIX_SUCCESS != rc) {
-            PMIX_ERROR_LOG(rc);
-        }
-    }
-
-    /* pass it to the global collective algorithm */
-    if (PRTE_SUCCESS != (rc = prte_grpcomm.allgather(cd))) {
-        PRTE_ERROR_LOG(rc);
-        PMIX_RELEASE(cd);
-        return PMIX_ERROR;
+    rc = prte_grpcomm.group(op, grpid, procs, nprocs,
+                            directives, ndirs, cbfunc, cbdata);
+    if (PRTE_SUCCESS != rc) {
+        rc = prte_pmix_convert_rc(rc);
     }
-    return PMIX_SUCCESS;
+    return rc;
 }
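
The group operation is likewise reduced to a single grpcomm entry point. A sketch of the
pass-through pattern used by pmix_server_group_fn() above; the group ID string is illustrative,
and cbfunc/cbdata are the pmix_info_cbfunc_t pair received from the PMIx server upcall:

    #include "src/mca/grpcomm/grpcomm.h"   /* assumed PRTE-internal header */

    /* forward a construct request for group "mygrp" to the grpcomm framework */
    static pmix_status_t construct_group(const pmix_proc_t procs[], size_t nprocs,
                                         const pmix_info_t directives[], size_t ndirs,
                                         pmix_info_cbfunc_t cbfunc, void *cbdata)
    {
        int rc;

        rc = prte_grpcomm.group(PMIX_GROUP_CONSTRUCT, "mygrp", procs, nprocs,
                                directives, ndirs, cbfunc, cbdata);
        if (PRTE_SUCCESS != rc) {
            /* grpcomm returns PRTE codes - convert before handing back to PMIx */
            return prte_pmix_convert_rc(rc);
        }
        return PMIX_SUCCESS;
    }
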
diff --git a/src/prted/pmix/pmix_server_internal.h b/src/prted/pmix/pmix_server_internal.h
index fdd1099440..f78e5adba7 100644
--- a/src/prted/pmix/pmix_server_internal.h
+++ b/src/prted/pmix/pmix_server_internal.h
@@ -335,6 +335,7 @@ PRTE_EXPORT extern pmix_status_t prte_pmix_set_scheduler(void);
 
 PRTE_EXPORT extern pmix_status_t prte_server_send_request(uint8_t cmd, pmix_server_req_t *req);
 
+
 #define PRTE_PMIX_ALLOC_REQ 0
 #define PRTE_PMIX_SESSION_CTRL 1
 
diff --git a/src/prted/pmix/pmix_server_session.c b/src/prted/pmix/pmix_server_session.c
index 25d9476194..8d97cc07aa 100644
--- a/src/prted/pmix/pmix_server_session.c
+++ b/src/prted/pmix/pmix_server_session.c
@@ -142,8 +142,7 @@ static int process_directive(pmix_server_req_t *req)
                 PMIX_LOAD_PROCID(&jdata->originator, requestor->nspace, requestor->rank);
             }
 
-        } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_PROVISION) ||
-                   PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_PROVISION_NODES) ||
+        } else if (PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_PROVISION_NODES) ||
                    PMIX_CHECK_KEY(&req->info[n], PMIX_SESSION_PROVISION_IMAGE)) {
             // we don't support these directives
             rc = PMIX_ERR_NOT_SUPPORTED;
diff --git a/src/rml/rml_types.h b/src/rml/rml_types.h
index a22975e9d1..dd8e3e1f6d 100644
--- a/src/rml/rml_types.h
+++ b/src/rml/rml_types.h
@@ -106,14 +106,8 @@ typedef void (*prte_rml_buffer_callback_fn_t)(int status, pmix_proc_t *peer,
 #define PRTE_RML_TAG_ROLLUP                  11
 #define PRTE_RML_TAG_REPORT_REMOTE_LAUNCH    12
 
-#define PRTE_RML_TAG_CKPT                    13
-
-#define PRTE_RML_TAG_RML_ROUTE               14
-
 #define PRTE_RML_TAG_XCAST                   15
 
-#define PRTE_RML_TAG_UPDATE_ROUTE_ACK        19
-#define PRTE_RML_TAG_SYNC                    20
-
 /* For FileM Base */
 #define PRTE_RML_TAG_FILEM_BASE              21
 #define PRTE_RML_TAG_FILEM_BASE_RESP         22
@@ -123,23 +117,13 @@ typedef void (*prte_rml_buffer_callback_fn_t)(int status, pmix_proc_t *peer,
 
 #define PRTE_RML_TAG_JOBID_RESP              24
 
-/* For tools */
-#define PRTE_RML_TAG_TOOL                    26
-
 /* support data store/lookup */
 #define PRTE_RML_TAG_DATA_SERVER             27
 #define PRTE_RML_TAG_DATA_CLIENT             28
 
-/* timing related */
-#define PRTE_RML_TAG_COLLECTIVE_TIMER        29
-
 /* collectives */
-#define PRTE_RML_TAG_COLLECTIVE              30
-#define PRTE_RML_TAG_COLL_RELEASE            31
-#define PRTE_RML_TAG_DAEMON_COLL             32
-#define PRTE_RML_TAG_ALLGATHER_DIRECT        33
-#define PRTE_RML_TAG_ALLGATHER_BRUCKS        34
-#define PRTE_RML_TAG_ALLGATHER_RCD           35
+#define PRTE_RML_TAG_FENCE_RELEASE           31
+#define PRTE_RML_TAG_FENCE                   33
 
 /* debugger release */
 #define PRTE_RML_TAG_DEBUGGER_RELEASE        37
@@ -150,28 +134,6 @@ typedef void (*prte_rml_buffer_callback_fn_t)(int status, pmix_proc_t *peer,
 /* report a missed msg */
 #define PRTE_RML_TAG_MISSED_MSG              39
 
-/* tag for receiving ack of abort msg */
-#define PRTE_RML_TAG_ABORT                   40
-
-/* tag for receiving heartbeats */
-#define PRTE_RML_TAG_HEARTBEAT               41
-
-/* Process Migration Tool Tag */
-#define PRTE_RML_TAG_MIGRATE                 42
-
-/* For SStore Framework */
-#define PRTE_RML_TAG_SSTORE                  43
-#define PRTE_RML_TAG_SSTORE_INTERNAL         44
-
-#define PRTE_RML_TAG_SUBSCRIBE               45
-
-/* Notify of failed processes */
-#define PRTE_RML_TAG_FAILURE_NOTICE          46
-
-/* distributed file system */
-#define PRTE_RML_TAG_DFS_CMD                 47
-#define PRTE_RML_TAG_DFS_DATA                48
-
 /* sensor data */
 #define PRTE_RML_TAG_SENSOR_DATA             49
 
@@ -179,17 +141,6 @@ typedef void (*prte_rml_buffer_callback_fn_t)(int status, pmix_proc_t *peer,
 #define PRTE_RML_TAG_DIRECT_MODEX            50
 #define PRTE_RML_TAG_DIRECT_MODEX_RESP       51
 
-/* notifier support */
-#define PRTE_RML_TAG_NOTIFIER_HNP            52
-#define PRTE_RML_TAG_NOTIFY_COMPLETE         53
-
-/*** QOS specific RML TAGS ***/
-#define PRTE_RML_TAG_OPEN_CHANNEL_REQ        54
-#define PRTE_RML_TAG_OPEN_CHANNEL_RESP       55
-#define PRTE_RML_TAG_MSG_ACK                 56
-#define PRTE_RML_TAG_CLOSE_CHANNEL_REQ       57
-#define PRTE_RML_TAG_CLOSE_CHANNEL_ACCEPT    58
-
 /* error notifications */
 #define PRTE_RML_TAG_NOTIFICATION            59
 
@@ -211,19 +162,13 @@ typedef void (*prte_rml_buffer_callback_fn_t)(int status, pmix_proc_t *peer,
 /* pmix log requests */
 #define PRTE_RML_TAG_LOGGING                 65
 
-/* error propagate */
-#define PRTE_RML_TAG_RBCAST                  66
-
-/* heartbeat request */
-#define PRTE_RML_TAG_HEARTBEAT_REQUEST       70
-
-/* error propagate */
-#define PRTE_RML_TAG_PROPAGATE               71
-
 /* scheduler requests */
 #define PRTE_RML_TAG_SCHED                   72
 #define PRTE_RML_TAG_SCHED_RESP              73
 
+/* group construct */
+#define PRTE_RML_TAG_GROUP                   74
+#define PRTE_RML_TAG_GROUP_RELEASE           75
 
 #define PRTE_RML_TAG_MAX                     100
 
diff --git a/src/runtime/data_type_support/prte_dt_copy_fns.c b/src/runtime/data_type_support/prte_dt_copy_fns.c
index 46795525f3..5846fd94da 100644
--- a/src/runtime/data_type_support/prte_dt_copy_fns.c
+++ b/src/runtime/data_type_support/prte_dt_copy_fns.c
@@ -161,14 +161,3 @@ int prte_map_copy(struct prte_job_map_t **d, struct prte_job_map_t *s)
 
     return PRTE_SUCCESS;
 }
-
-/*
- * GRPCOMM SIGNATURE
- */
-int prte_grpcomm_sig_copy(prte_grpcomm_signature_t **d,
-                          prte_grpcomm_signature_t *s)
-{
-    *d = s;
-    PMIX_RETAIN(s);
-    return PRTE_SUCCESS;
-}
diff --git a/src/runtime/data_type_support/prte_dt_packing_fns.c b/src/runtime/data_type_support/prte_dt_packing_fns.c
index dff79c14c3..e3b0848616 100644
--- a/src/runtime/data_type_support/prte_dt_packing_fns.c
+++ b/src/runtime/data_type_support/prte_dt_packing_fns.c
@@ -601,84 +601,3 @@ int prte_map_pack(pmix_data_buffer_t *bkt, struct prte_job_map_t *mp)
 
     return PRTE_SUCCESS;
 }
-
-/*
- * GRPCOMM SIGNATURE
- */
-int prte_grpcomm_sig_pack(pmix_data_buffer_t *bkt,
-                          prte_grpcomm_signature_t *sig)
-{
-    pmix_status_t rc;
-
-    // always send the participating procs
-    rc = PMIx_Data_pack(NULL, bkt, &sig->sz, 1, PMIX_SIZE);
-    if (PMIX_SUCCESS != rc) {
-        PMIX_ERROR_LOG(rc);
-        return prte_pmix_convert_status(rc);
-    }
-    if (0 < sig->sz) {
-        rc = PMIx_Data_pack(NULL, bkt, sig->signature, sig->sz, PMIX_PROC);
-        if (PMIX_SUCCESS != rc) {
-            PMIX_ERROR_LOG(rc);
-            return prte_pmix_convert_status(rc);
-        }
-    }
-
-    // pack the context ID, if one was given
-    rc = PMIx_Data_pack(NULL, bkt, &sig->ctxid_assigned, 1, PMIX_BOOL);
-    if (PMIX_SUCCESS != rc) {
-        PMIX_ERROR_LOG(rc);
-        return prte_pmix_convert_status(rc);
-    }
-    if (sig->ctxid_assigned) {
-        rc = PMIx_Data_pack(NULL, bkt, &sig->ctxid, 1, PMIX_SIZE);
-        if (PMIX_SUCCESS != rc) {
-            PMIX_ERROR_LOG(rc);
-            return prte_pmix_convert_status(rc);
-        }
-    }
-
-    // pack added members, if given
-    rc = PMIx_Data_pack(NULL, bkt, &sig->nmembers, 1, PMIX_SIZE);
-    if (PMIX_SUCCESS != rc) {
-        PMIX_ERROR_LOG(rc);
-        return prte_pmix_convert_status(rc);
-    }
-    if (0 < sig->nmembers) {
-        rc = PMIx_Data_pack(NULL, bkt, sig->addmembers, sig->nmembers, PMIX_PROC);
-        if (PMIX_SUCCESS != rc) {
-            PMIX_ERROR_LOG(rc);
-            return prte_pmix_convert_status(rc);
-        }
-    }
-
-    // pack bootstrap number
-    rc = PMIx_Data_pack(NULL, bkt, &sig->bootstrap, 1, PMIX_SIZE);
-    if (PMIX_SUCCESS != rc) {
-        PMIX_ERROR_LOG(rc);
-        return prte_pmix_convert_status(rc);
-    }
-
-    // add the groupID if one is given
-    rc = PMIx_Data_pack(NULL, bkt, &sig->groupID, 1, PMIX_STRING);
-    if (PMIX_SUCCESS != rc) {
-        PMIX_ERROR_LOG(rc);
-        return prte_pmix_convert_status(rc);
-    }
-
-    // pack final membership, if given
-    rc = PMIx_Data_pack(NULL, bkt, &sig->nfinal, 1, PMIX_SIZE);
-    if (PMIX_SUCCESS != rc) {
-        PMIX_ERROR_LOG(rc);
-        return prte_pmix_convert_status(rc);
-    }
-    if (0 < sig->nfinal) {
-        rc = PMIx_Data_pack(NULL, bkt, sig->finalmembership, sig->nfinal, PMIX_PROC);
-        if (PMIX_SUCCESS != rc) {
-            PMIX_ERROR_LOG(rc);
-            return prte_pmix_convert_status(rc);
-        }
-    }
-
-    return PRTE_SUCCESS;
-}
diff --git a/src/runtime/data_type_support/prte_dt_print_fns.c b/src/runtime/data_type_support/prte_dt_print_fns.c
index 087bc51e7c..efad1a49d7 100644
--- a/src/runtime/data_type_support/prte_dt_print_fns.c
+++ b/src/runtime/data_type_support/prte_dt_print_fns.c
@@ -641,17 +641,3 @@ void prte_map_print(char **output, prte_job_t *jdata)
 
     return;
 }
-
-/* GRPCOMM SIG */
-void prte_grpcomm_sig_print(char **output, prte_grpcomm_signature_t *s)
-{
-    char *tmp;
-
-    if (NULL != s->groupID) {
-        pmix_asprintf(&tmp, "Group ID: %s", s->groupID);
-    } else {
-        tmp = strdup("No GroupID - signature is array of procs");
-    }
-    *output = tmp;
-    return;
-}
diff --git a/src/runtime/data_type_support/prte_dt_unpacking_fns.c b/src/runtime/data_type_support/prte_dt_unpacking_fns.c
index bb2b183e7e..c2c427be7d 100644
--- a/src/runtime/data_type_support/prte_dt_unpacking_fns.c
+++ b/src/runtime/data_type_support/prte_dt_unpacking_fns.c
@@ -704,109 +704,3 @@ int prte_map_unpack(pmix_data_buffer_t *bkt, struct prte_job_map_t **mp)
 
     *mp = map;
     return PRTE_SUCCESS;
 }
-
-int prte_grpcomm_sig_unpack(pmix_data_buffer_t *buffer,
-                            prte_grpcomm_signature_t **sig)
-{
-    pmix_status_t rc;
-    int32_t cnt;
-    prte_grpcomm_signature_t *s;
-
-    s = PMIX_NEW(prte_grpcomm_signature_t);
-
-    // unpack the participating procs
-    cnt = 1;
-    rc = PMIx_Data_unpack(NULL, buffer, &s->sz, &cnt, PMIX_SIZE);
-    if (PMIX_SUCCESS != rc) {
-        PMIX_ERROR_LOG(rc);
-        PMIX_RELEASE(s);
-        return prte_pmix_convert_status(rc);
-    }
-    if (0 < s->sz) {
-        PMIX_PROC_CREATE(s->signature, s->sz);
-        cnt = s->sz;
-        rc = PMIx_Data_unpack(NULL, buffer, s->signature, &cnt, PMIX_PROC);
-        if (PMIX_SUCCESS != rc) {
-            PMIX_ERROR_LOG(rc);
-            PMIX_RELEASE(s);
-            return prte_pmix_convert_status(rc);
-        }
-    }
-
-    // unpack the context ID, if one was assigned
-    cnt = 1;
-    rc = PMIx_Data_unpack(NULL, buffer, &s->ctxid_assigned, &cnt, PMIX_BOOL);
-    if (PMIX_SUCCESS != rc) {
-        PMIX_ERROR_LOG(rc);
-        PMIX_RELEASE(s);
-        return prte_pmix_convert_status(rc);
-    }
-    if (s->ctxid_assigned) {
-        cnt = 1;
-        rc = PMIx_Data_unpack(NULL, buffer, &s->ctxid, &cnt, PMIX_SIZE);
-        if (PMIX_SUCCESS != rc) {
-            PMIX_ERROR_LOG(rc);
-            PMIX_RELEASE(s);
-            return prte_pmix_convert_status(rc);
-        }
-    }
-
-    // unpack the added members
-    cnt = 1;
-    rc = PMIx_Data_unpack(NULL, buffer, &s->nmembers, &cnt, PMIX_SIZE);
-    if (PMIX_SUCCESS != rc) {
-        PMIX_ERROR_LOG(rc);
-        PMIX_RELEASE(s);
-        return prte_pmix_convert_status(rc);
-    }
-    if (0 < s->nmembers) {
-        PMIX_PROC_CREATE(s->addmembers, s->nmembers);
-        cnt = s->nmembers;
-        rc = PMIx_Data_unpack(NULL, buffer, s->addmembers, &cnt, PMIX_PROC);
-        if (PMIX_SUCCESS != rc) {
-            PMIX_ERROR_LOG(rc);
-            PMIX_RELEASE(s);
-            return prte_pmix_convert_status(rc);
-        }
-    }
-
-    // unpack the bootstrap count
-    cnt = 1;
-    rc = PMIx_Data_unpack(NULL, buffer, &s->bootstrap, &cnt, PMIX_SIZE);
-    if (PMIX_SUCCESS != rc) {
-        PMIX_ERROR_LOG(rc);
-        PMIX_RELEASE(s);
-        return prte_pmix_convert_status(rc);
-    }
-
-    // unpack the groupID
-    cnt = 1;
-    rc = PMIx_Data_unpack(NULL, buffer, &s->groupID, &cnt, PMIX_STRING);
-    if (PMIX_SUCCESS != rc) {
-        PMIX_ERROR_LOG(rc);
-        PMIX_RELEASE(s);
-        return prte_pmix_convert_status(rc);
-    }
-
-    // unpack the final membership
-    cnt = 1;
-    rc = PMIx_Data_unpack(NULL, buffer, &s->nfinal, &cnt, PMIX_SIZE);
-    if (PMIX_SUCCESS != rc) {
-        PMIX_ERROR_LOG(rc);
-        PMIX_RELEASE(s);
-        return prte_pmix_convert_status(rc);
-    }
-    if (0 < s->nfinal) {
-        PMIX_PROC_CREATE(s->finalmembership, s->nfinal);
-        cnt = s->nfinal;
-        rc = PMIx_Data_unpack(NULL, buffer, s->finalmembership, &cnt, PMIX_PROC);
-        if (PMIX_SUCCESS != rc) {
-            PMIX_ERROR_LOG(rc);
-            PMIX_RELEASE(s);
-            return prte_pmix_convert_status(rc);
-        }
-    }
-
-    *sig = s;
-    return PRTE_SUCCESS;
-}
diff --git a/src/runtime/prte_globals.h b/src/runtime/prte_globals.h
index b4e4af24ef..7686e5dbed 100644
--- a/src/runtime/prte_globals.h
+++ b/src/runtime/prte_globals.h
@@ -505,15 +505,6 @@ PRTE_EXPORT int prte_node_unpack(pmix_data_buffer_t *bkt, prte_node_t **node);
 PRTE_EXPORT int prte_node_copy(prte_node_t **dest, prte_node_t *src);
 PRTE_EXPORT void prte_node_print(char **output, prte_job_t *jdata, prte_node_t *src);
 
-/** grpcomm signature */
-PRTE_EXPORT int prte_grpcomm_sig_pack(pmix_data_buffer_t *bkt,
-                                      prte_grpcomm_signature_t *sig);
-PRTE_EXPORT int prte_grpcomm_sig_unpack(pmix_data_buffer_t *bkt,
-                                        prte_grpcomm_signature_t **sig);
-PRTE_EXPORT int prte_grpcomm_sig_copy(prte_grpcomm_signature_t **d,
-                                      prte_grpcomm_signature_t *s);
-PRTE_EXPORT void prte_grpcomm_sig_print(char **output, prte_grpcomm_signature_t *s);
-
 /**
  * Get a proc data object
  */