diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h index 85f9d9b33a8..6501d23ce72 100644 --- a/opal/mca/btl/uct/btl_uct.h +++ b/opal/mca/btl/uct/btl_uct.h @@ -139,9 +139,15 @@ struct mca_btl_uct_component_t { /** allowed UCT memory domains */ char *memory_domains; + mca_btl_uct_include_list_t memory_domain_list; /** allowed transports */ char *allowed_transports; + mca_btl_uct_include_list_t allowed_transport_list; + + /** memory domains to consider for connection-only modules */ + char *connection_domains; + mca_btl_uct_include_list_t connection_domain_list; /** number of worker contexts to create */ int num_contexts_per_module; @@ -153,6 +159,10 @@ struct mca_btl_uct_component_t { /** disable UCX memory hooks */ bool disable_ucx_memory_hooks; + + /** alternate connection-only module that can be used if no suitable + * connection tl is found. this is usually a tcp tl. */ + mca_btl_uct_module_t *conn_module; }; typedef struct mca_btl_uct_component_t mca_btl_uct_component_t; @@ -289,7 +299,8 @@ struct mca_btl_base_endpoint_t *mca_btl_uct_get_ep(struct mca_btl_base_module_t opal_proc_t *proc); int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, - uct_tl_resource_desc_t *tl_descs, unsigned tl_count); + uct_tl_resource_desc_t *tl_descs, unsigned tl_count, + bool evaluate_for_conn_only); int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, mca_btl_uct_conn_req_t *req); @@ -336,5 +347,15 @@ static inline bool mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_tl_t *tl) return !(MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE); } +/** + * @brief Find the rank of `name` in the include list `list`. + * + * @param[in] name name to find + * @param[in] list list to search + * + * A negative result means the name is rejected: it is absent from an include list or named by a negated (^) list.
+ */ +int mca_btl_uct_include_list_rank (const char *name, const mca_btl_uct_include_list_t *list); + END_C_DECLS #endif diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index d0c8c72e58d..4f7d67f65a1 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -48,7 +48,7 @@ static int mca_btl_uct_component_register(void) { mca_btl_uct_module_t *module = &mca_btl_uct_module_template; - mca_btl_uct_component.memory_domains = "mlx5_0,mlx4_0"; + mca_btl_uct_component.memory_domains = "mlx5_0,mlx4_0,rocep0s4"; (void) mca_base_component_var_register( &mca_btl_uct_component.super.btl_version, "memory_domains", "Comma-delimited list of memory domains of the form " @@ -67,6 +67,15 @@ static int mca_btl_uct_component_register(void) MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.allowed_transports); + mca_btl_uct_component.connection_domains = "tcp"; + (void) mca_base_component_var_register( + &mca_btl_uct_component.super.btl_version, "connection_domains", + "Comma-delimited list of connection-only domains to use sorted by increasing " + "priority. The list of transports available can be queried using ucx_info. 
Special " + "values: any (any available) (default: tcp)", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.connection_domains); + mca_btl_uct_component.num_contexts_per_module = 0; (void) mca_base_component_var_register( &mca_btl_uct_component.super.btl_version, "num_contexts_per_module", @@ -113,6 +122,48 @@ static void mca_btl_uct_mem_release_cb(void *buf, size_t length, void *cbdata, b ucm_vm_munmap(buf, length); } +/* Split a comma-delimited MCA value into an include list. A leading '^' turns it into an exclude list. */ +static void mca_btl_uct_component_parse_include_list (const char *value, mca_btl_uct_include_list_t *list) { + list->list = NULL; + list->include = true; + + if (value == NULL) { + return; + } + + if (value[0] == '^') { + list->include = false; + value++; + } + + list->list = opal_argv_split(value, ','); +} + +/* Release the argv held by an include list and leave the list empty. */ +static void mca_btl_uct_include_list_free (mca_btl_uct_include_list_t *list) { + opal_argv_free (list->list); + list->list = NULL; +} + +/* Rank name against list: positive when the name is accepted, negative when it is rejected. "any" (or the legacy "all") matches every name. */ +int mca_btl_uct_include_list_rank (const char *name, const mca_btl_uct_include_list_t *list) { + if (list->list == NULL) { + return -1; + } + + for (int i = 0; list->list[i]; ++i) { + BTL_VERBOSE(("evaluating %s vs %s-list item %s", name, list->include ? "include" : "exclude", list->list[i])); + if (0 == strcasecmp(name, list->list[i])) { + return list->include ? i + 1 : -(i + 1); + } + if (0 == strcasecmp("any", list->list[i]) || 0 == strcasecmp("all", list->list[i])) { + return list->include ? i + 1 : -(i + 1); + } + } + + return list->include ? 
-1 : 1; +} + static int mca_btl_uct_component_open(void) { if (0 == mca_btl_uct_component.num_contexts_per_module) { @@ -156,10 +204,19 @@ static int mca_btl_uct_component_open(void) */ static int mca_btl_uct_component_close(void) { + if (NULL != mca_btl_uct_component.conn_module) { + mca_btl_uct_finalize (&mca_btl_uct_component.conn_module->super); + mca_btl_uct_component.conn_module = NULL; + } + if (mca_btl_uct_component.disable_ucx_memory_hooks) { opal_mem_hooks_unregister_release(mca_btl_uct_mem_release_cb); } + mca_btl_uct_include_list_free (&mca_btl_uct_component.memory_domain_list); + mca_btl_uct_include_list_free (&mca_btl_uct_component.allowed_transport_list); + mca_btl_uct_include_list_free (&mca_btl_uct_component.connection_domain_list); + return OPAL_SUCCESS; } @@ -224,6 +281,35 @@ static size_t mca_btl_uct_tl_modex_pack(mca_btl_uct_tl_t *tl, uint8_t *modex_dat return modex_size; } +/* Pack one module's modex entry (size, md name, then each distinct tl) and return the next free byte. */ +static uint8_t *mca_btl_uct_modex_pack(mca_btl_uct_module_t *module, uint8_t *modex_data) +{ + size_t name_len = strlen(module->md_name); + + /* pack the size */ + *((uint32_t *) modex_data) = (uint32_t) mca_btl_uct_module_modex_size(module); + + modex_data += 4; + + strcpy((char *) modex_data, module->md_name); + modex_data += name_len + 1; + + if (module->rdma_tl) { + modex_data += mca_btl_uct_tl_modex_pack(module->rdma_tl, modex_data); + } + + if (module->am_tl && module->am_tl != module->rdma_tl) { + modex_data += mca_btl_uct_tl_modex_pack(module->am_tl, modex_data); + } + + if (module->conn_tl && module->conn_tl != module->rdma_tl + && module->conn_tl != module->am_tl) { + modex_data += mca_btl_uct_tl_modex_pack(module->conn_tl, modex_data); + } + + return modex_data; +} + static int mca_btl_uct_modex_send(void) { size_t modex_size = sizeof(mca_btl_uct_modex_t); @@ -235,35 +320,24 @@ static int mca_btl_uct_modex_send(void) modex_size += 
mca_btl_uct_module_modex_size(mca_btl_uct_component.modules[i]); } + if (mca_btl_uct_component.conn_module != NULL) { + modex_size += 
mca_btl_uct_module_modex_size(mca_btl_uct_component.conn_module); + } + modex = alloca(modex_size); modex_data = modex->data; modex->module_count = mca_btl_uct_component.module_count; + if (mca_btl_uct_component.conn_module != NULL) { + ++modex->module_count; + } for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { - mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; - size_t name_len = strlen(module->md_name); - - /* pack the size */ - *((uint32_t *) modex_data) = (uint32_t) mca_btl_uct_module_modex_size(module); - - modex_data += 4; - - strcpy((char *) modex_data, module->md_name); - modex_data += name_len + 1; - - if (module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->rdma_tl, modex_data); - } - - if (module->am_tl && module->am_tl != module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->am_tl, modex_data); - } + modex_data = mca_btl_uct_modex_pack (mca_btl_uct_component.modules[i], modex_data); + } - if (module->conn_tl && module->conn_tl != module->rdma_tl - && module->conn_tl != module->am_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->conn_tl, modex_data); - } + if (mca_btl_uct_component.conn_module != NULL) { + modex_data = mca_btl_uct_modex_pack (mca_btl_uct_component.conn_module, modex_data); } OPAL_MODEX_SEND(rc, PMIX_GLOBAL, &mca_btl_uct_component.super.btl_version, modex, modex_size); @@ -332,11 +406,9 @@ ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsign #if UCT_API >= UCT_VERSION(1, 7) static int mca_btl_uct_component_process_uct_md(uct_component_h component, - uct_md_resource_desc_t *md_desc, - char **allowed_ifaces) + uct_md_resource_desc_t *md_desc) #else -static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, - char **allowed_ifaces) +static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) #endif { mca_rcache_base_resources_t rcache_resources; @@ -345,29 +417,35 @@ static int 
mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, uct_md_config_t *uct_config; uct_md_attr_t md_attr; mca_btl_uct_md_t *md; - bool found = false; + int list_rank; unsigned num_tls; char *tmp; + int connection_list_rank = -1; + bool consider_for_connection_module = false; ucs_status_t ucs_status; + BTL_VERBOSE(("processing memory domain %s", md_desc->md_name)); + if (MCA_BTL_UCT_MAX_MODULES == mca_btl_uct_component.module_count) { BTL_VERBOSE(("created the maximum number of allowable modules")); return OPAL_ERR_NOT_AVAILABLE; } - BTL_VERBOSE(("processing memory domain %s", md_desc->md_name)); + BTL_VERBOSE(("checking if %s should be used for communication", md_desc->md_name)); + list_rank = mca_btl_uct_include_list_rank (md_desc->md_name, &mca_btl_uct_component.memory_domain_list); - for (int j = 0; allowed_ifaces[j]; ++j) { - if (0 == strncmp(allowed_ifaces[j], md_desc->md_name, strlen(md_desc->md_name)) - || 0 == strcmp(allowed_ifaces[j], "all")) { - found = true; - break; + if (list_rank < 0) { + BTL_VERBOSE(("checking if %s should be used for connections", md_desc->md_name)); + connection_list_rank = mca_btl_uct_include_list_rank (md_desc->md_name, &mca_btl_uct_component.connection_domain_list); + + if (connection_list_rank < 0) { + /* nothing to do */ + BTL_VERBOSE(("not continuing with memory domain %s", md_desc->md_name)); + return OPAL_SUCCESS; } - } - if (!found) { - /* nothing to do */ - return OPAL_SUCCESS; + BTL_VERBOSE(("will be considering domain %s for connections only", md_desc->md_name)); + consider_for_connection_module = true; } md = OBJ_NEW(mca_btl_uct_md_t); @@ -414,7 +492,9 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, return OPAL_ERR_OUT_OF_RESOURCE; } - (void) mca_btl_uct_query_tls(module, md, tl_desc, num_tls); + /* if this module is not to be used for communication check if it has a transport suitable + * for forming connections. 
*/ + (void) mca_btl_uct_query_tls(module, md, tl_desc, num_tls, consider_for_connection_module); uct_release_tl_resource_list(tl_desc); @@ -422,7 +502,7 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, * remain open until those modules are finalized. */ OBJ_RELEASE(md); - if (NULL == module->am_tl && NULL == module->rdma_tl) { + if (NULL == module->am_tl && NULL == module->rdma_tl && (NULL == module->conn_tl || !consider_for_connection_module)) { BTL_VERBOSE(("uct memory domain %s does not have any appropriate tls", md_desc->md_name)); mca_btl_uct_finalize(&module->super); return OPAL_ERR_NOT_AVAILABLE; @@ -432,35 +512,44 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, module->uct_component = component; #endif - mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; - - /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable - * performance benefits to using rcache/grdma instead of assuming UCT will do the right - * thing. 
*/ - (void) opal_asprintf(&tmp, "uct.%s", module->md_name); - - rcache_resources.cache_name = tmp; - rcache_resources.reg_data = (void *) module; - rcache_resources.sizeof_reg = sizeof(mca_btl_uct_reg_t) - + module->super.btl_registration_handle_size; - rcache_resources.register_mem = mca_btl_uct_reg_mem; - rcache_resources.deregister_mem = mca_btl_uct_dereg_mem; - - module->rcache = mca_rcache_base_module_create("grdma", module, &rcache_resources); - free(tmp); - if (NULL == module->rcache) { - /* something when horribly wrong */ - BTL_VERBOSE(("could not allocate a registration cache for this btl module")); - mca_btl_uct_finalize(&module->super); - return OPAL_ERROR; + if (!consider_for_connection_module) { + mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; + + /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable + * performance benefits to using rcache/grdma instead of assuming UCT will do the right + * thing. */ + (void) opal_asprintf(&tmp, "uct.%s", module->md_name); + + rcache_resources.cache_name = tmp; + rcache_resources.reg_data = (void *) module; + rcache_resources.sizeof_reg = sizeof(mca_btl_uct_reg_t) + + module->super.btl_registration_handle_size; + rcache_resources.register_mem = mca_btl_uct_reg_mem; + rcache_resources.deregister_mem = mca_btl_uct_dereg_mem; + + module->rcache = mca_rcache_base_module_create("grdma", module, &rcache_resources); + free(tmp); + if (NULL == module->rcache) { + /* something went horribly wrong. 
take the module back out of the component array before it is finalized */ + mca_btl_uct_component.modules[--mca_btl_uct_component.module_count] = NULL; + BTL_VERBOSE(("could not allocate a registration cache for this btl module")); + mca_btl_uct_finalize(&module->super); + return OPAL_ERROR; + } + } else { + if (NULL == mca_btl_uct_component.conn_module) { + BTL_VERBOSE(("memory domain %s may be used for connections", md_desc->md_name)); + mca_btl_uct_component.conn_module = module; + } else { + mca_btl_uct_finalize(&module->super); + } } return OPAL_SUCCESS; } #if UCT_API >= UCT_VERSION(1, 7) -static int 
mca_btl_uct_component_process_uct_component(uct_component_h component, - char **allowed_ifaces) +static int mca_btl_uct_component_process_uct_component(uct_component_h component) { uct_component_attr_t attr = {.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME | UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT}; @@ -482,7 +570,7 @@ static int mca_btl_uct_component_process_uct_component(uct_component_h component } for (unsigned i = 0; i < attr.md_resource_count; ++i) { - rc = mca_btl_uct_component_process_uct_md(component, attr.md_resources + i, allowed_ifaces); + rc = mca_btl_uct_component_process_uct_md(component, attr.md_resources + i); if (OPAL_SUCCESS != rc) { break; } @@ -494,6 +582,56 @@ static int mca_btl_uct_component_process_uct_component(uct_component_h component } #endif /* UCT_API >= UCT_VERSION(1, 7) */ +/* Drop the connection-only module when no module needs it, and drop modules that have no way to form connections. */ +static void mca_btl_uct_component_validate_modules(void) { + if (mca_btl_uct_component.conn_module != NULL) { + /* verify that a connection-only module is required. this might be the case in some systems + * where rc verbs is available but ud is not.
*/ + bool need_conn_module = false; + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; + if (module->conn_tl != NULL) { + continue; + } + if ((module->rdma_tl && mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)) || + (module->am_tl && mca_btl_uct_tl_requires_connection_tl(module->am_tl))) { + need_conn_module = true; + break; + } + } + + if (!need_conn_module) { + mca_btl_uct_finalize (&mca_btl_uct_component.conn_module->super); + mca_btl_uct_component.conn_module = NULL; + } + } else { + int usable_module_count = 0; + + /* keep only the modules that can form connections, packing them at the front of the array */ + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; + + if (NULL == module->conn_tl && + ((module->rdma_tl && mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)) || + (module->am_tl && mca_btl_uct_tl_requires_connection_tl(module->am_tl)))) { + /* module can not be used */ + BTL_VERBOSE(("module for memory domain %s can not be used due to missing connection transport", + module->md_name)); + mca_btl_uct_finalize (&module->super); + continue; + } + + mca_btl_uct_component.modules[usable_module_count++] = module; + } + + /* clear the slots vacated by discarded modules */ + for (int i = usable_module_count ; i < mca_btl_uct_component.module_count ; ++i) { + mca_btl_uct_component.modules[i] = NULL; + } + mca_btl_uct_component.module_count = usable_module_count; + } +} + /* * UCT component initialization: * (1) read interface list from kernel and compare against component parameters @@ -510,7 +655,6 @@ static mca_btl_base_module_t 
**mca_btl_uct_component_init(int *num_btl_modules, */ struct mca_btl_base_module_t **base_modules; ucs_status_t ucs_status; - char **allowed_ifaces; int rc; BTL_VERBOSE(("initializing uct btl")); @@ -522,10 +666,9 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, return NULL; } - allowed_ifaces = opal_argv_split(mca_btl_uct_component.memory_domains, ','); - if (NULL == allowed_ifaces) { - return NULL; - } + mca_btl_uct_component_parse_include_list(mca_btl_uct_component.memory_domains, &mca_btl_uct_component.memory_domain_list); + mca_btl_uct_component_parse_include_list(mca_btl_uct_component.allowed_transports, &mca_btl_uct_component.allowed_transport_list); + mca_btl_uct_component_parse_include_list(mca_btl_uct_component.connection_domains, &mca_btl_uct_component.connection_domain_list); mca_btl_uct_component.module_count = 0; @@ -541,7 +684,7 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, /* generate all suitable btl modules */ for (unsigned i = 0; i < num_components; ++i) { - rc = mca_btl_uct_component_process_uct_component(components[i], allowed_ifaces); + rc = mca_btl_uct_component_process_uct_component(components[i]); if (OPAL_SUCCESS != rc) { break; } @@ -557,7 +700,7 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, /* generate all suitable btl modules */ for (unsigned i = 0; i < resource_count; ++i) { - rc = mca_btl_uct_component_process_uct_md(resources + i, allowed_ifaces); + rc = mca_btl_uct_component_process_uct_md(resources + i); if (OPAL_SUCCESS != rc) { break; } @@ -567,7 +710,9 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, #endif /* UCT_API >= UCT_VERSION(1, 7) */ - opal_argv_free(allowed_ifaces); + /* filter out unusable modules before sending the modex */ + mca_btl_uct_component_validate_modules(); + mca_btl_uct_modex_send(); /* pass module array back to caller */ @@ -633,6 +778,40 @@ static int
mca_btl_uct_component_progress_pending(mca_btl_uct_module_t *uct_btl) return completed; } +/* Progress the module's connection tl and hand each queued connection request to the module it names. */ +static int mca_btl_uct_component_progress_connections (mca_btl_uct_module_t *module) { + mca_btl_uct_pending_connection_request_t *request; + int ret; + + if (module->conn_tl == NULL) { + return 0; + } + + ret = mca_btl_uct_tl_progress(module->conn_tl, 0); + + while (NULL + != (request = (mca_btl_uct_pending_connection_request_t *) opal_fifo_pop_atomic( + &module->pending_connection_reqs))) { + mca_btl_uct_conn_req_t *conn_req = (mca_btl_uct_conn_req_t *) request->request_data; + /* the request may be addressed to another module. keep module itself untouched so the queue being drained (and any re-queue) stays the one the request arrived on */ + mca_btl_uct_module_t *target = module; + BTL_VERBOSE(("processing connection request....")); + for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { + if (0 == strcmp(mca_btl_uct_component.modules[i]->md_name, conn_req->module_name)) { + target = mca_btl_uct_component.modules[i]; + break; + } + } + int rc = mca_btl_uct_process_connection_request(target, conn_req); + if (rc != OPAL_SUCCESS) { + opal_fifo_push_atomic(&module->pending_connection_reqs, &request->super); + break; + } + OBJ_RELEASE(request); + } + + return ret; +} + /** * @brief UCT BTL progress function * @@ -654,27 +833,17 @@ static int mca_btl_uct_component_progress(void) ret += mca_btl_uct_tl_progress(module->am_tl, starting_index); } - if (module->conn_tl) { - mca_btl_uct_pending_connection_request_t *request; - - if (module->conn_tl != module->am_tl && module->conn_tl != module->rdma_tl) { - ret += mca_btl_uct_tl_progress(module->conn_tl, 0); - } - - while (NULL - != (request = (mca_btl_uct_pending_connection_request_t *) opal_fifo_pop_atomic( - &module->pending_connection_reqs))) { - mca_btl_uct_process_connection_request(module, (mca_btl_uct_conn_req_t *) - 
request->request_data); - OBJ_RELEASE(request); - } - } + ret += mca_btl_uct_component_progress_connections (module); if (0 != opal_list_get_size(&module->pending_frags)) { mca_btl_uct_component_progress_pending(module); } } + if (NULL != mca_btl_uct_component.conn_module) { + ret += 
mca_btl_uct_component_progress_connections (mca_btl_uct_component.conn_module); + } + return (int) ret; } diff --git a/opal/mca/btl/uct/btl_uct_endpoint.c b/opal/mca/btl/uct/btl_uct_endpoint.c index 04367ccf2f4..22bfd233292 100644 --- a/opal/mca/btl/uct/btl_uct_endpoint.c +++ b/opal/mca/btl/uct/btl_uct_endpoint.c @@ -201,20 +201,50 @@ static void mca_btl_uct_endpoint_flush_complete(uct_completion_t *self, ucs_stat static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, - mca_btl_uct_device_context_t *conn_tl_context, + mca_btl_uct_module_t *conn_module, + uint8_t *conn_tl_data, mca_btl_uct_conn_req_t *request, size_t request_length) { mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep; mca_btl_uct_conn_completion_t completion = {.super = {.count = 1, .func = mca_btl_uct_endpoint_flush_complete}, .complete = false}; + mca_btl_uct_tl_t *conn_tl = conn_module->conn_tl; + mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; ucs_status_t ucs_status; + if (NULL == conn_ep) { + uct_iface_addr_t *iface_addr = (uct_iface_addr_t *) conn_tl_data; + uct_device_addr_t *device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data + + MCA_BTL_UCT_TL_ATTR(conn_tl, 0).iface_addr_len); + + BTL_VERBOSE(("creating a temporary endpoint for handling connections to %s", + opal_process_name_print(endpoint->ep_proc->proc_name))); + + endpoint->conn_ep = conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t); + if (OPAL_UNLIKELY(NULL == conn_ep)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + /* create a temporary endpoint for setting up the rdma endpoint */ + MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { + ucs_status = mca_btl_uct_ep_create_connected_compat(conn_tl_context->uct_iface, + device_addr, iface_addr, + &conn_ep->uct_ep); + }); + + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("could not create an endpoint for forming connection to remote peer. 
code = %d", ucs_status)); + /* hard failure: the caller treats OPAL_ERR_OUT_OF_RESOURCE as "retry later" and would reuse the half-built conn_ep */ + return OPAL_ERROR; + } + } + BTL_VERBOSE( ("sending connection request to peer. 
context id: %d, type: %d, length: %" PRIsize_t, request->context_id, request->type, request_length)); - OBJ_RETAIN(endpoint->conn_ep); + OBJ_RETAIN(conn_ep); /* need to drop the lock to avoid hold-and-wait */ opal_mutex_unlock(&endpoint->ep_lock); @@ -258,56 +286,22 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl, } static int mca_btl_uct_endpoint_connect_endpoint( - mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, - mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, + mca_btl_uct_module_t *uct_btl, mca_btl_uct_module_t *conn_module, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, uint8_t *tl_data, uint8_t *conn_tl_data, void *ep_addr) { size_t request_length = sizeof(mca_btl_uct_conn_req_t) + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len; - mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep; - mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl; - mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; mca_btl_uct_conn_req_t *request = alloca(request_length); - uct_device_addr_t *device_addr = NULL; - uct_iface_addr_t *iface_addr; ucs_status_t ucs_status; int rc; - assert(NULL != conn_tl); - BTL_VERBOSE(("connecting endpoint to remote endpoint")); - if (NULL == conn_ep) { - BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p", - opal_process_name_print(endpoint->ep_proc->proc_name))); - - iface_addr = (uct_iface_addr_t *) conn_tl_data; - device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data - + MCA_BTL_UCT_TL_ATTR(conn_tl, 0).iface_addr_len); - - endpoint->conn_ep = conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t); - if (OPAL_UNLIKELY(NULL == conn_ep)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* create a temporary endpoint for setting up the rdma endpoint */ - MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, 
{ - ucs_status = mca_btl_uct_ep_create_connected_compat(conn_tl_context->uct_iface, - device_addr, iface_addr, - &conn_ep->uct_ep); - }); - if (UCS_OK != ucs_status) { - BTL_VERBOSE( - ("could not create an endpoint for forming connection to remote peer. code = %d", - ucs_status)); - return OPAL_ERROR; - } - } else { - OBJ_RETAIN(conn_ep); - } - /* fill in common request parameters */ request->proc_name = OPAL_PROC_MY_NAME; + strncpy(request->module_name, uct_btl->md_name, sizeof(request->module_name) - 1); + request->module_name[sizeof(request->module_name) - 1] = '\0'; request->context_id = tl_context->context_id; request->tl_index = tl->tl_index; request->type = !!(ep_addr); @@ -325,7 +318,8 @@ static int mca_btl_uct_endpoint_connect_endpoint( } } - if (ep_addr) { + + if (ep_addr && !(tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC)) { BTL_VERBOSE( ("using remote endpoint address to connect endpoint for tl %s, index %d. ep_addr = %p", tl->uct_tl_name, tl_context->context_id, ep_addr)); @@ -336,6 +330,7 @@ static int mca_btl_uct_endpoint_connect_endpoint( if (UCS_OK != ucs_status) { return OPAL_ERROR; } + mca_btl_uct_tl_endpoint_set_flag(tl_endpoint, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC); } /* fill in connection request */ @@ -350,17 +345,58 @@ static int mca_btl_uct_endpoint_connect_endpoint( /* let the remote side know that the connection has been established and * wait for the message to be sent */ - rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, conn_tl_context, request, - request_length); + rc = mca_btl_uct_endpoint_send_conn_req(conn_module, endpoint, conn_module, conn_tl_data, + request, request_length); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - OBJ_RELEASE(endpoint->conn_ep); - uct_ep_destroy(tl_endpoint->uct_ep); - tl_endpoint->uct_ep = NULL; - return OPAL_ERROR; + if (OPAL_ERR_OUT_OF_RESOURCE != rc) { + OBJ_RELEASE(endpoint->conn_ep); + uct_ep_destroy(tl_endpoint->uct_ep); + tl_endpoint->uct_ep = NULL; + } + + return rc; + } + + if 
(!mca_btl_uct_tl_endpoint_ready(tl_endpoint)) { + return 
OPAL_ERR_OUT_OF_RESOURCE; + } + + btl_uct_release_pending_frags(uct_btl, endpoint, tl_context->context_id); + + return OPAL_SUCCESS; +} + +/* Locate the modex entry whose md name matches uct_btl and hand its payload to mca_btl_uct_process_modex. */ +static int mca_btl_uct_find_modex(mca_btl_uct_module_t *uct_btl, mca_btl_uct_modex_t *modex, + uint8_t **rdma_tl_data, uint8_t **am_tl_data, uint8_t **conn_tl_data) { + uint8_t *modex_data = modex->data; + + /* look for matching transport in the modex */ + for (int i = 0; i < modex->module_count; ++i) { + uint32_t modex_size = *((uint32_t *) modex_data); + + BTL_VERBOSE(("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md_name)); + + modex_data += 4; + + if (0 != strcmp((char *) modex_data, uct_btl->md_name)) { + /* modex belongs to a different module, skip it and continue */ + modex_data += modex_size - 4; + continue; + } + + modex_data += strlen((char *) modex_data) + 1; + + mca_btl_uct_process_modex(uct_btl, modex_data, rdma_tl_data, am_tl_data, conn_tl_data); + + BTL_VERBOSE(("finished processing modex for %s", uct_btl->md_name)); + + return OPAL_SUCCESS; } - return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS - : OPAL_ERR_OUT_OF_RESOURCE; + BTL_ERROR(("could not find modex for %s", uct_btl->md_name)); + + return OPAL_ERR_NOT_FOUND; } int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint, @@ -375,7 +410,6 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp uint8_t *rdma_tl_data = NULL, *conn_tl_data = NULL, *am_tl_data = NULL, *tl_data; mca_btl_uct_connection_ep_t *conn_ep = NULL; mca_btl_uct_modex_t *modex; - uint8_t *modex_data; size_t msg_size; int rc; @@ -415,43 +449,43 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp BTL_VERBOSE(("received modex of size %lu for proc %s. 
module count %d", (unsigned long) msg_size, OPAL_NAME_PRINT(endpoint->ep_proc->proc_name), modex->module_count)); - modex_data = modex->data; - - /* look for matching transport in the modex */ - for (int i = 0; i < modex->module_count; ++i) { - uint32_t modex_size = *((uint32_t *) modex_data); - - BTL_VERBOSE( - ("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md_name)); - - modex_data += 4; - - if (0 != strcmp((char *) modex_data, uct_btl->md_name)) { - /* modex belongs to a different module, skip it and continue */ - modex_data += modex_size - 4; - continue; - } - modex_data += strlen((char *) modex_data) + 1; + rc = mca_btl_uct_find_modex (uct_btl, modex, &rdma_tl_data, &am_tl_data, &conn_tl_data); - mca_btl_uct_process_modex(uct_btl, modex_data, &rdma_tl_data, &am_tl_data, - &conn_tl_data); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { break; } tl_data = (tl == uct_btl->rdma_tl) ? rdma_tl_data : am_tl_data; - - if (NULL == tl_data) { - opal_mutex_unlock(&endpoint->ep_lock); - return OPAL_ERR_UNREACH; + if (OPAL_UNLIKELY(NULL == tl_data)) { + BTL_ERROR(("could not find modex data for this transport")); + rc = OPAL_ERR_UNREACH; + break; } /* connect the endpoint */ - if (!mca_btl_uct_tl_requires_connection_tl(tl)) { - rc = mca_btl_uct_endpoint_connect_iface(uct_btl, tl, tl_context, tl_endpoint, tl_data); - } else { - rc = mca_btl_uct_endpoint_connect_endpoint(uct_btl, endpoint, tl, tl_context, + if (mca_btl_uct_tl_requires_connection_tl(tl)) { + mca_btl_uct_module_t *conn_module = uct_btl; + if (NULL == uct_btl->conn_tl) { + if (OPAL_UNLIKELY(NULL == mca_btl_uct_component.conn_module)) { + BTL_ERROR(("no connection transport is available for this tl")); + rc = OPAL_ERR_UNREACH; + break; + } + rc = mca_btl_uct_find_modex (mca_btl_uct_component.conn_module, modex, /*rdma_tl_data=*/NULL, + /*am_tl_data=*/NULL, &conn_tl_data); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + BTL_ERROR(("could not find modex for connection module")); + break; + } + 
BTL_VERBOSE(("using separate connection module for tl")); + conn_module = mca_btl_uct_component.conn_module; + } + + rc = mca_btl_uct_endpoint_connect_endpoint(uct_btl, conn_module, endpoint, tl, 
tl_context, tl_endpoint, tl_data, conn_tl_data, ep_addr); + } else { + rc = mca_btl_uct_endpoint_connect_iface(uct_btl, tl, tl_context, tl_endpoint, tl_data); } } while (0); @@ -469,7 +497,11 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp OBJ_RELEASE(conn_ep); } - BTL_VERBOSE(("endpoint%s ready for use", (OPAL_ERR_OUT_OF_RESOURCE != rc) ? "" : " not yet")); + if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc || OPAL_SUCCESS == rc)) { + BTL_VERBOSE(("endpoint%s ready for use", (OPAL_ERR_OUT_OF_RESOURCE != rc) ? "" : " not yet")); + } else { + BTL_ERROR(("unable to connect endpoint, rc=%d", rc)); + } return rc; } diff --git a/opal/mca/btl/uct/btl_uct_endpoint.h b/opal/mca/btl/uct/btl_uct_endpoint.h index 49d1b941457..7a782d37121 100644 --- a/opal/mca/btl/uct/btl_uct_endpoint.h +++ b/opal/mca/btl/uct/btl_uct_endpoint.h @@ -34,6 +34,31 @@ mca_btl_base_endpoint_t *mca_btl_uct_endpoint_create(opal_proc_t *proc); int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *module, mca_btl_uct_endpoint_t *endpoint, int ep_index, void *ep_addr, int tl_index); +static inline bool mca_btl_uct_tl_endpoint_ready(mca_btl_uct_tl_endpoint_t *tl_endpoint) +{ + return !!(tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY); +} + +/** + * @brief Mark all frags associated with the endpoint/context_id pair as ready. + * + * @param[in] module UCT BTL module + * @param[in] endpoint UCT BTL endpoint + * @param[in] context_id context id + * + * Requires holding the endpoint mutex. 
+ */ +static inline void btl_uct_release_pending_frags(mca_btl_uct_module_t *module, mca_btl_base_endpoint_t *endpoint, + int context_id) +{ + mca_btl_uct_base_frag_t *frag; + OPAL_LIST_FOREACH (frag, &module->pending_frags, mca_btl_uct_base_frag_t) { + if (frag->context->context_id == context_id && endpoint == frag->endpoint) { + frag->ready = true; + } + } +} + static inline int mca_btl_uct_endpoint_test_am(mca_btl_uct_module_t *module, mca_btl_uct_endpoint_t *endpoint, mca_btl_uct_device_context_t *context, @@ -42,8 +67,7 @@ static inline int mca_btl_uct_endpoint_test_am(mca_btl_uct_module_t *module, int tl_index = module->am_tl->tl_index; int ep_index = context->context_id; - if (OPAL_LIKELY(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY - & endpoint->uct_eps[ep_index][tl_index].flags)) { + if (OPAL_LIKELY(mca_btl_uct_tl_endpoint_ready(endpoint->uct_eps[ep_index] + tl_index))) { *ep_handle = endpoint->uct_eps[ep_index][tl_index].uct_ep; return OPAL_SUCCESS; } @@ -51,6 +75,16 @@ static inline int mca_btl_uct_endpoint_test_am(mca_btl_uct_module_t *module, return OPAL_ERR_NOT_AVAILABLE; } +static inline void mca_btl_uct_tl_endpoint_set_flag(mca_btl_uct_tl_endpoint_t *tl_endpoint, int32_t flag) +{ + int32_t flags = opal_atomic_or_fetch_32(&tl_endpoint->flags, flag); + int32_t conn_ready_flags = MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REM_READY | MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC; + if ((flags & conn_ready_flags) == conn_ready_flags) { + /* remote side is ready and the local endpoint is connected */ + (void) opal_atomic_or_fetch_32(&tl_endpoint->flags, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY); + } +} + /** * @brief Check if the endpoint is connected and start the connection if not * @@ -72,8 +106,7 @@ static inline int mca_btl_uct_endpoint_check(mca_btl_uct_module_t *module, int ep_index = context->context_id; int rc; - if (OPAL_LIKELY(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY - & endpoint->uct_eps[ep_index][tl_index].flags)) { + if 
(OPAL_LIKELY(mca_btl_uct_tl_endpoint_ready(endpoint->uct_eps[ep_index] + tl_index))) { *ep_handle = endpoint->uct_eps[ep_index][tl_index].uct_ep; return OPAL_SUCCESS; } diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index 5669e88c061..061745e0934 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -196,7 +196,6 @@ int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, struct opal_proc_t *remote_proc = opal_proc_for_name(req->proc_name); mca_btl_base_endpoint_t *endpoint = mca_btl_uct_get_ep(&module->super, remote_proc); mca_btl_uct_tl_endpoint_t *tl_endpoint = endpoint->uct_eps[req->context_id] + req->tl_index; - int32_t ep_flags; int rc; BTL_VERBOSE(("got connection request for endpoint %p. type = %d. context id = %d", @@ -209,16 +208,12 @@ int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, assert(req->type < 2); - ep_flags = opal_atomic_fetch_or_32(&tl_endpoint->flags, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC); - - if (!(ep_flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC)) { - /* create any necessary resources */ - rc = mca_btl_uct_endpoint_connect(module, endpoint, req->context_id, req->ep_addr, - req->tl_index); - if (OPAL_SUCCESS != rc && OPAL_ERR_OUT_OF_RESOURCE != rc) { - BTL_ERROR(("could not setup rdma endpoint. rc = %d", rc)); - return rc; - } + /* create any necessary resources */ + rc = mca_btl_uct_endpoint_connect(module, endpoint, req->context_id, req->ep_addr, + req->tl_index); + if (OPAL_SUCCESS != rc && OPAL_ERR_OUT_OF_RESOURCE != rc) { + BTL_ERROR(("could not setup rdma endpoint. rc = %d", rc)); + return rc; } /* the connection is ready once we have received the connection data and also a connection ready @@ -226,20 +221,15 @@ int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, * an endpoint can be used. 
*/ if (req->type == 1) { /* remote side is ready */ - mca_btl_uct_base_frag_t *frag; /* to avoid a race with send adding pending frags grab the lock here */ OPAL_THREAD_SCOPED_LOCK(&endpoint->ep_lock, { BTL_VERBOSE(("connection ready. sending %" PRIsize_t " frags", opal_list_get_size(&module->pending_frags))); - (void) opal_atomic_or_fetch_32(&tl_endpoint->flags, - MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY); + mca_btl_uct_tl_endpoint_set_flag(tl_endpoint, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REM_READY); opal_atomic_wmb(); - - OPAL_LIST_FOREACH (frag, &module->pending_frags, mca_btl_uct_base_frag_t) { - if (frag->context->context_id == req->context_id && endpoint == frag->endpoint) { - frag->ready = true; - } + if (mca_btl_uct_tl_endpoint_ready(tl_endpoint)) { + btl_uct_release_pending_frags(module, endpoint, req->context_id); } }); } @@ -562,57 +552,30 @@ static int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_ } int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, - uct_tl_resource_desc_t *tl_descs, unsigned tl_count) + uct_tl_resource_desc_t *tl_descs, unsigned tl_count, + bool evaluate_for_conn_only) { - bool include = true, any = false; mca_btl_uct_tl_t *tl; opal_list_t tl_list; - char **tl_filter; - int any_priority = 0; OBJ_CONSTRUCT(&tl_list, opal_list_t); - tl_filter = opal_argv_split(mca_btl_uct_component.allowed_transports, ','); - - if ('^' == tl_filter[0][0]) { - /* user has negated the include list */ - char *tmp = strdup(tl_filter[0] + 1); + for (unsigned i = 0; i < tl_count; ++i) { + int priority = 0; - free(tl_filter[0]); - tl_filter[0] = tmp; - include = false; - } + BTL_VERBOSE(("processing tl %s, evaluate_for_conn_only=%d", tl_descs[i].tl_name, evaluate_for_conn_only)); + + if (!evaluate_for_conn_only) { + priority = mca_btl_uct_include_list_rank (tl_descs[i].tl_name, &mca_btl_uct_component.allowed_transport_list); - /* check for the any keyword */ - for (unsigned j = 0; tl_filter[j]; ++j) { - if (0 == 
strcmp(tl_filter[j], "any")) { - any_priority = j; - any = true; - break; - } - } + BTL_VERBOSE(("tl filter: tl_name = %s, priority = %d", tl_descs[i].tl_name, + priority)); - if (any && !include) { - opal_argv_free(tl_filter); - return OPAL_ERR_NOT_AVAILABLE; - } - - for (unsigned i = 0; i < tl_count; ++i) { - bool try_tl = any; - int priority = any_priority; - - for (unsigned j = 0; tl_filter[j]; ++j) { - if (0 == strcmp(tl_filter[j], tl_descs[i].tl_name)) { - try_tl = include; - priority = j; - break; + if (priority < 0) { + continue; } - } - - BTL_VERBOSE(("tl filter: tl_name = %s, use = %d, priority = %d", tl_descs[i].tl_name, - try_tl, priority)); - - if (!try_tl) { + } else if (tl_descs[i].dev_type != UCT_DEVICE_TYPE_NET) { + /* only network types are suitable for forming connections */ continue; } @@ -625,12 +588,23 @@ int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, tl = mca_btl_uct_create_tl(module, md, tl_descs + i, priority); if (tl) { - opal_list_append(&tl_list, &tl->super); + if (mca_btl_uct_tl_supports_conn(tl) && evaluate_for_conn_only) { + BTL_VERBOSE(("evaluating tl %s for forming connections", tl_descs[i].tl_name)); + int rc = mca_btl_uct_set_tl_conn(module, tl); + OBJ_RELEASE(tl); + + if (OPAL_SUCCESS == rc) { + mca_btl_uct_context_enable_progress(tl->uct_dev_contexts[0]); + return OPAL_SUCCESS; + } + + BTL_VERBOSE(("tl %s cannot be used for forming connections", tl_descs[i].tl_name)); + } else { + opal_list_append(&tl_list, &tl->super); + } } } - opal_argv_free(tl_filter); - if (0 == opal_list_get_size(&tl_list)) { BTL_VERBOSE(("no suitable tls match filter: %s", mca_btl_uct_component.allowed_transports)); OBJ_DESTRUCT(&tl_list); @@ -679,10 +653,6 @@ int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, /* no connection tl needed for selected transports */ OBJ_RELEASE(module->conn_tl); module->conn_tl = NULL; - } else if (NULL == module->conn_tl) { - BTL_VERBOSE(("a connection tl is required 
but no tls match the filter %s", - mca_btl_uct_component.allowed_transports)); - return OPAL_ERROR; } return OPAL_SUCCESS; diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index 156451fa307..7dde374754e 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -85,6 +85,9 @@ struct mca_btl_uct_conn_req_t { /** transport index that should be connected */ int tl_index; + /** name of the connecting module */ + char module_name[64]; + /** endpoint address data */ uint8_t ep_addr[]; }; @@ -181,6 +184,18 @@ union mca_btl_uct_am_header_t { typedef union mca_btl_uct_am_header_t mca_btl_uct_am_header_t; +/** + * @brief parsed include/exclude list + * + */ +struct mca_btl_uct_include_list_t { + /** argv-style (NULL terminated) array of strings */ + char **list; + /** is an inclusive list (vs exclusive) */ + bool include; +}; +typedef struct mca_btl_uct_include_list_t mca_btl_uct_include_list_t; + /** * @brief structure to keep track of btl callback *