diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h index 65bc69fddb2..b07a6efc9a6 100644 --- a/opal/mca/btl/uct/btl_uct.h +++ b/opal/mca/btl/uct/btl_uct.h @@ -141,9 +141,15 @@ struct mca_btl_uct_component_t { /** allowed UCT memory domains */ char *memory_domains; + mca_btl_uct_include_list_t memory_domain_list; /** allowed transports */ char *allowed_transports; + mca_btl_uct_include_list_t allowed_transport_list; + + /** transports to consider for forming connections */ + char *connection_domains; + mca_btl_uct_include_list_t connection_domain_list; /** number of worker contexts to create */ int num_contexts_per_module; @@ -158,6 +164,10 @@ struct mca_btl_uct_component_t { /** connection retry timeout */ unsigned int connection_retry_timeout; + + /** alternate connection-only module that can be used if no suitable + * connection tl is found. this is usually a tcp tl. */ + mca_btl_uct_module_t *conn_module; }; typedef struct mca_btl_uct_component_t mca_btl_uct_component_t; @@ -294,7 +304,8 @@ struct mca_btl_base_endpoint_t *mca_btl_uct_get_ep(struct mca_btl_base_module_t opal_proc_t *proc); int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, - uct_tl_resource_desc_t *tl_descs, unsigned tl_count); + uct_tl_resource_desc_t *tl_descs, unsigned tl_count, + bool evaluate_for_conn_only); int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, mca_btl_uct_conn_req_t *req); @@ -341,5 +352,15 @@ static inline bool mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_tl_t *tl) return !(MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE); } +/** + * @brief Find the rank of `name` in the include list `list`. + * + * @param[in] name name to find + * @param[in] list list to search + * + * A negative result means the name is rejected: the list is unset, the name matches no entry of an include list, or it matches an entry of a negated list.
+ */ +int mca_btl_uct_include_list_rank (const char *name, const mca_btl_uct_include_list_t *list); + END_C_DECLS #endif diff --git a/opal/mca/btl/uct/btl_uct_am.c b/opal/mca/btl/uct/btl_uct_am.c index 1aae456842c..85d89d2d734 100644 --- a/opal/mca/btl/uct/btl_uct_am.c +++ b/opal/mca/btl/uct/btl_uct_am.c @@ -55,7 +55,6 @@ static inline void _mca_btl_uct_send_pack(void *data, void *header, size_t heade { uint32_t iov_count = 1; struct iovec iov; - size_t length; if (header_size > 0) { assert(NULL != header); diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index 5eec97ec487..5f4ed3776c6 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -27,6 +27,8 @@ * $HEADER$ */ +#include + #include "opal_config.h" #include "opal/mca/btl/base/base.h" @@ -67,6 +69,15 @@ static int mca_btl_uct_component_register(void) MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.allowed_transports); + mca_btl_uct_component.connection_domains = "tcp"; + (void) mca_base_component_var_register( + &mca_btl_uct_component.super.btl_version, "connection_domains", + "Comma-delimited list of connection-only domains to use sorted by increasing " + "priority. The list of transports available can be queried using ucx_info. 
Special" + "values: any (any available) (default: tcp)", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.connection_domains); + mca_btl_uct_component.num_contexts_per_module = 0; (void) mca_base_component_var_register( &mca_btl_uct_component.super.btl_version, "num_contexts_per_module", @@ -124,6 +135,54 @@ static void mca_btl_uct_mem_release_cb(void *buf, size_t length, void *cbdata, b ucm_vm_munmap(buf, length); } +static void mca_btl_uct_component_parse_include_list (const char *value, mca_btl_uct_include_list_t *list) { + list->list = NULL; + list->include = true; + + if (value == NULL) { + return; + } + + if (value[0] == '^') { + list->include = false; + value++; + } + + list->list = opal_argv_split(value, ','); +} + +static void mca_btl_uct_include_list_free (mca_btl_uct_include_list_t *list) { + opal_argv_free (list->list); + list->list = NULL; +} + +int mca_btl_uct_include_list_rank (const char *name, const mca_btl_uct_include_list_t *list) { + if (list->list == NULL) { + return -1; + } + + for (int i = 0; list->list[i]; ++i) { + regex_t preg; + + BTL_VERBOSE(("evaluating %s vs %s-list item %s", name, list->include ? "include" : "exclude", list->list[i])); + int rc = regcomp(&preg, list->list[i], REG_ICASE); + if (0 != rc) { + char errbuf[256]; + regerror(rc, &preg, errbuf, sizeof(errbuf)); + BTL_ERROR(("when matching name, could not parse regular expression: %s, error: %s", list->list[i], errbuf)); + continue; + } + + int result = regexec(&preg, name, /*nmatch=*/0, /*pmatch=*/NULL, /*eflags=*/0); + regfree(&preg); + if (0 == result) { + return list->include ? i + 1 : -(i + 1); + } + } + + return list->include ? 
-1 : 1; +} + static int mca_btl_uct_component_open(void) { if (0 == mca_btl_uct_component.num_contexts_per_module) { @@ -167,10 +226,19 @@ static int mca_btl_uct_component_open(void) */ static int mca_btl_uct_component_close(void) { + if (NULL != mca_btl_uct_component.conn_module) { + mca_btl_uct_finalize (&mca_btl_uct_component.conn_module->super); + mca_btl_uct_component.conn_module = NULL; + } + if (mca_btl_uct_component.disable_ucx_memory_hooks) { opal_mem_hooks_unregister_release(mca_btl_uct_mem_release_cb); } + mca_btl_uct_include_list_free (&mca_btl_uct_component.memory_domain_list); + mca_btl_uct_include_list_free (&mca_btl_uct_component.allowed_transport_list); + mca_btl_uct_include_list_free (&mca_btl_uct_component.connection_domain_list); + return OPAL_SUCCESS; } @@ -235,6 +303,34 @@ static size_t mca_btl_uct_tl_modex_pack(mca_btl_uct_tl_t *tl, uint8_t *modex_dat return modex_size; } +static uint8_t *mca_btl_uct_modex_pack(mca_btl_uct_module_t *module, uint8_t *modex_data) +{ + size_t name_len = strlen(module->md_name); + + /* pack the size */ + *((uint32_t *) modex_data) = (uint32_t) mca_btl_uct_module_modex_size(module); + + modex_data += 4; + + strcpy((char *) modex_data, module->md_name); + modex_data += name_len + 1; + + if (module->rdma_tl) { + modex_data += mca_btl_uct_tl_modex_pack(module->rdma_tl, modex_data); + } + + if (module->am_tl && module->am_tl != module->rdma_tl) { + modex_data += mca_btl_uct_tl_modex_pack(module->am_tl, modex_data); + } + + if (module->conn_tl && module->conn_tl != module->rdma_tl + && module->conn_tl != module->am_tl) { + modex_data += mca_btl_uct_tl_modex_pack(module->conn_tl, modex_data); + } + + return modex_data; +} + static int mca_btl_uct_modex_send(void) { size_t modex_size = sizeof(mca_btl_uct_modex_t); @@ -246,35 +342,22 @@ static int mca_btl_uct_modex_send(void) modex_size += mca_btl_uct_module_modex_size(mca_btl_uct_component.modules[i]); } + if (mca_btl_uct_component.conn_module != NULL) { + modex_size += 
mca_btl_uct_module_modex_size(mca_btl_uct_component.conn_module); + } + modex = alloca(modex_size); modex_data = modex->data; modex->module_count = mca_btl_uct_component.module_count; for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { - mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; - size_t name_len = strlen(module->md_name); - - /* pack the size */ - *((uint32_t *) modex_data) = (uint32_t) mca_btl_uct_module_modex_size(module); - - modex_data += 4; - - strcpy((char *) modex_data, module->md_name); - modex_data += name_len + 1; - - if (module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->rdma_tl, modex_data); - } - - if (module->am_tl && module->am_tl != module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->am_tl, modex_data); - } + modex_data = mca_btl_uct_modex_pack (mca_btl_uct_component.modules[i], modex_data); + } - if (module->conn_tl && module->conn_tl != module->rdma_tl - && module->conn_tl != module->am_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->conn_tl, modex_data); - } + if (mca_btl_uct_component.conn_module != NULL) { + ++modex->module_count; + modex_data = mca_btl_uct_modex_pack (mca_btl_uct_component.conn_module, modex_data); } OPAL_MODEX_SEND(rc, PMIX_GLOBAL, &mca_btl_uct_component.super.btl_version, modex, modex_size); @@ -323,6 +406,10 @@ ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsign mca_btl_uct_device_context_t *tl_context = (mca_btl_uct_device_context_t *) arg; mca_btl_uct_module_t *uct_btl = tl_context->uct_btl; mca_btl_uct_am_header_t *header = (mca_btl_uct_am_header_t *) data; + if (header->data.tag == 0xff) { + fprintf (stderr, "%d: got an invalid tag\n"); + while (true) {} + } mca_btl_active_message_callback_t *reg = mca_btl_base_active_message_trigger + header->data.tag; mca_btl_base_segment_t seg = {.seg_addr = {.pval = (void *) ((intptr_t) data + sizeof(*header))}, @@ -337,17 +424,16 @@ ucs_status_t mca_btl_uct_am_handler(void 
*arg, void *data, size_t length, unsign tl_context->in_am_callback = true; reg->cbfunc(&uct_btl->super, &desc); tl_context->in_am_callback = false; + header->data.tag = 0xff; return UCS_OK; } #if UCT_API >= UCT_VERSION(1, 7) static int mca_btl_uct_component_process_uct_md(uct_component_h component, - uct_md_resource_desc_t *md_desc, - char **allowed_ifaces) + uct_md_resource_desc_t *md_desc) #else -static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, - char **allowed_ifaces) +static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) #endif { mca_rcache_base_resources_t rcache_resources; @@ -356,29 +442,35 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, uct_md_config_t *uct_config; uct_md_attr_t md_attr; mca_btl_uct_md_t *md; - bool found = false; + int list_rank; unsigned num_tls; char *tmp; ucs_status_t ucs_status; + int connection_list_rank = -1; + bool consider_for_connection_module = false; + + BTL_VERBOSE(("processing memory domain %s", md_desc->md_name)); if (MCA_BTL_UCT_MAX_MODULES == mca_btl_uct_component.module_count) { BTL_VERBOSE(("created the maximum number of allowable modules")); return OPAL_ERR_NOT_AVAILABLE; } - BTL_VERBOSE(("processing memory domain %s", md_desc->md_name)); + BTL_VERBOSE(("checking if %s should be used for communication", md_desc->md_name)); + list_rank = mca_btl_uct_include_list_rank (md_desc->md_name, &mca_btl_uct_component.memory_domain_list); - for (int j = 0; allowed_ifaces[j]; ++j) { - if (0 == strncmp(allowed_ifaces[j], md_desc->md_name, strlen(md_desc->md_name)) - || 0 == strcmp(allowed_ifaces[j], "all")) { - found = true; - break; + if (list_rank < 0) { + BTL_VERBOSE(("checking if %s should be used for connections", md_desc->md_name)); + connection_list_rank = mca_btl_uct_include_list_rank (md_desc->md_name, &mca_btl_uct_component.connection_domain_list); + + if (connection_list_rank < 0) { + /* nothing to do */ + BTL_VERBOSE(("not 
continuing with memory domain %s", md_desc->md_name)); + return OPAL_SUCCESS; } - } - if (!found) { - /* nothing to do */ - return OPAL_SUCCESS; + BTL_VERBOSE(("will be considering domain %s for connections only", md_desc->md_name)); + consider_for_connection_module = true; } md = OBJ_NEW(mca_btl_uct_md_t); @@ -425,7 +517,9 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, return OPAL_ERR_OUT_OF_RESOURCE; } - (void) mca_btl_uct_query_tls(module, md, tl_desc, num_tls); + /* if this module is not to be used for communication check if it has a transport suitable + * for forming connections. */ + (void) mca_btl_uct_query_tls(module, md, tl_desc, num_tls, consider_for_connection_module); uct_release_tl_resource_list(tl_desc); @@ -433,7 +527,7 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, * remain open until those modules are finalized. */ OBJ_RELEASE(md); - if (NULL == module->am_tl && NULL == module->rdma_tl) { + if (NULL == module->am_tl && NULL == module->rdma_tl && (NULL == module->conn_tl || !consider_for_connection_module)) { BTL_VERBOSE(("uct memory domain %s does not have any appropriate tls", md_desc->md_name)); mca_btl_uct_finalize(&module->super); return OPAL_ERR_NOT_AVAILABLE; @@ -443,35 +537,43 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, module->uct_component = component; #endif - mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; - - /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable - * performance benefits to using rcache/grdma instead of assuming UCT will do the right - * thing. 
*/ - (void) opal_asprintf(&tmp, "uct.%s", module->md_name); - - rcache_resources.cache_name = tmp; - rcache_resources.reg_data = (void *) module; - rcache_resources.sizeof_reg = sizeof(mca_btl_uct_reg_t) - + module->super.btl_registration_handle_size; - rcache_resources.register_mem = mca_btl_uct_reg_mem; - rcache_resources.deregister_mem = mca_btl_uct_dereg_mem; - - module->rcache = mca_rcache_base_module_create("grdma", module, &rcache_resources); - free(tmp); - if (NULL == module->rcache) { - /* something when horribly wrong */ - BTL_VERBOSE(("could not allocate a registration cache for this btl module")); - mca_btl_uct_finalize(&module->super); - return OPAL_ERROR; + if (!consider_for_connection_module) { + mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; + + /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable + * performance benefits to using rcache/grdma instead of assuming UCT will do the right + * thing. */ + (void) opal_asprintf(&tmp, "uct.%s", module->md_name); + + rcache_resources.cache_name = tmp; + rcache_resources.reg_data = (void *) module; + rcache_resources.sizeof_reg = sizeof(mca_btl_uct_reg_t) + + module->super.btl_registration_handle_size; + rcache_resources.register_mem = mca_btl_uct_reg_mem; + rcache_resources.deregister_mem = mca_btl_uct_dereg_mem; + + module->rcache = mca_rcache_base_module_create("grdma", module, &rcache_resources); + free(tmp); + if (NULL == module->rcache) { + /* something went horribly wrong */ + BTL_VERBOSE(("could not allocate a registration cache for this btl module")); + mca_btl_uct_finalize(&module->super); + return OPAL_ERROR; + } + } else { + if (NULL == mca_btl_uct_component.conn_module) { + BTL_VERBOSE(("memory domain %s may be used for connections", md_desc->md_name)); + mca_btl_uct_component.conn_module = module; + } else { + mca_btl_uct_finalize(&module->super); + } } return OPAL_SUCCESS; } #if UCT_API >= UCT_VERSION(1, 7) -static int 
mca_btl_uct_component_process_uct_component(uct_component_h component, - char **allowed_ifaces) +static int mca_btl_uct_component_process_uct_component(uct_component_h component) { uct_component_attr_t attr = {.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME | UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT}; @@ -493,7 +595,7 @@ static int mca_btl_uct_component_process_uct_component(uct_component_h component } for (unsigned i = 0; i < attr.md_resource_count; ++i) { - rc = mca_btl_uct_component_process_uct_md(component, attr.md_resources + i, allowed_ifaces); + rc = mca_btl_uct_component_process_uct_md(component, attr.md_resources + i); if (OPAL_SUCCESS != rc) { break; } @@ -505,6 +607,63 @@ static int mca_btl_uct_component_process_uct_component(uct_component_h component } #endif /* UCT_API >= UCT_VERSION(1, 7) */ +static void mca_btl_uct_component_validate_modules(void) { + if (mca_btl_uct_component.conn_module != NULL) { + /* verify that a connection-only module is required. this might be the case in some systems + * where rc verbs is available but ud is not. 
*/ + bool need_conn_module = false; + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; + if (module->conn_tl != NULL) { + continue; + } + if ((module->rdma_tl && mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)) || + (module->am_tl && mca_btl_uct_tl_requires_connection_tl(module->am_tl))) { + need_conn_module = true; + break; + } + } + + if (!need_conn_module) { + mca_btl_uct_finalize (&mca_btl_uct_component.conn_module->super); + mca_btl_uct_component.conn_module = NULL; + } + } else { + int usable_module_count = mca_btl_uct_component.module_count; + + /* check that all modules can be used */ + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; + if (NULL != module->conn_tl) { + /* module has its own connection transport */ + continue; + } + + if (((module->rdma_tl && mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)) || + (module->am_tl && mca_btl_uct_tl_requires_connection_tl(module->am_tl))) + && NULL == module->conn_tl) { + /* module can not be used */ + BTL_VERBOSE(("module for memory domain %s can not be used due to missing connection transport", + module->md_name)); + mca_btl_uct_finalize (&mca_btl_uct_component.modules[i]->super); + mca_btl_uct_component.modules[i] = NULL; + } + } + + /* remove holes in the module array */ + if (usable_module_count < mca_btl_uct_component.module_count) { + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + if (mca_btl_uct_component.modules[i] == NULL) { + for (int j = i ; j < mca_btl_uct_component.module_count ; ++j) { + mca_btl_uct_component.modules[i++] = mca_btl_uct_component.modules[j]; + } + } + } + mca_btl_uct_component.module_count = usable_module_count; + } + } +} + /* * UCT component initialization: * (1) read interface list from kernel and compare against component parameters @@ -521,7 +680,6 @@ static mca_btl_base_module_t 
**mca_btl_uct_component_init(int *num_btl_modules, */ struct mca_btl_base_module_t **base_modules; ucs_status_t ucs_status; - char **allowed_ifaces; int rc; BTL_VERBOSE(("initializing uct btl")); @@ -533,10 +691,12 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, return NULL; } - allowed_ifaces = opal_argv_split(mca_btl_uct_component.memory_domains, ','); - if (NULL == allowed_ifaces) { - return NULL; - } + mca_btl_uct_component_parse_include_list(mca_btl_uct_component.memory_domains, + &mca_btl_uct_component.memory_domain_list); + mca_btl_uct_component_parse_include_list(mca_btl_uct_component.allowed_transports, + &mca_btl_uct_component.allowed_transport_list); + mca_btl_uct_component_parse_include_list(mca_btl_uct_component.connection_domains, + &mca_btl_uct_component.connection_domain_list); mca_btl_uct_component.module_count = 0; @@ -552,7 +712,7 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, /* generate all suitable btl modules */ for (unsigned i = 0; i < num_components; ++i) { - rc = mca_btl_uct_component_process_uct_component(components[i], allowed_ifaces); + rc = mca_btl_uct_component_process_uct_component(components[i]); if (OPAL_SUCCESS != rc) { break; } @@ -568,7 +728,7 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, /* generate all suitable btl modules */ for (unsigned i = 0; i < resource_count; ++i) { - rc = mca_btl_uct_component_process_uct_md(resources + i, allowed_ifaces); + rc = mca_btl_uct_component_process_uct_md(resources + i); if (OPAL_SUCCESS != rc) { break; } @@ -578,7 +738,9 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, #endif /* UCT_API >= UCT_VERSION(1, 7) */ - opal_argv_free(allowed_ifaces); + /* filter out unusable modules before sending the modex */ + mca_btl_uct_component_validate_modules(); + mca_btl_uct_modex_send(); /* pass module array back to caller */ @@ -644,6 +806,38 @@ static int 
mca_btl_uct_component_progress_pending(mca_btl_uct_module_t *uct_btl) return completed; } +static int mca_btl_uct_component_progress_connections (mca_btl_uct_module_t *module) { + mca_btl_uct_pending_connection_request_t *request; + int ret; + + if (module->conn_tl == NULL) { + return 0; + } + + ret = mca_btl_uct_tl_progress(module->conn_tl, 0); + + while (NULL + != (request = (mca_btl_uct_pending_connection_request_t *) opal_fifo_pop_atomic( + &module->pending_connection_reqs))) { + mca_btl_uct_conn_req_t *conn_req = (mca_btl_uct_conn_req_t *) request->request_data; + BTL_VERBOSE(("processing connection request....")); + for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { + if (0 == strncmp(mca_btl_uct_component.modules[i]->md_name, conn_req->module_name, sizeof(conn_req->module_name) - 1)) { + module = mca_btl_uct_component.modules[i]; + break; + } + } + int rc = mca_btl_uct_process_connection_request(module, conn_req); + if (rc != OPAL_SUCCESS) { + opal_fifo_push_atomic(&module->pending_connection_reqs, &request->super); + break; + } + OBJ_RELEASE(request); + } + + return ret; +} + /** * @brief UCT BTL progress function * @@ -665,27 +859,17 @@ static int mca_btl_uct_component_progress(void) ret += mca_btl_uct_tl_progress(module->am_tl, starting_index); } - if (module->conn_tl) { - mca_btl_uct_pending_connection_request_t *request; - - if (module->conn_tl != module->am_tl && module->conn_tl != module->rdma_tl) { - ret += mca_btl_uct_tl_progress(module->conn_tl, 0); - } - - while (NULL - != (request = (mca_btl_uct_pending_connection_request_t *) opal_fifo_pop_atomic( - &module->pending_connection_reqs))) { - mca_btl_uct_process_connection_request(module, (mca_btl_uct_conn_req_t *) - request->request_data); - OBJ_RELEASE(request); - } - } - + mca_btl_uct_component_progress_connections (module); + if (0 != opal_list_get_size(&module->pending_frags)) { mca_btl_uct_component_progress_pending(module); } } + if (NULL != mca_btl_uct_component.conn_module) { + 
ret += mca_btl_uct_component_progress_connections (mca_btl_uct_component.conn_module); + } + return (int) ret; } diff --git a/opal/mca/btl/uct/btl_uct_endpoint.c b/opal/mca/btl/uct/btl_uct_endpoint.c index 695fd754aa2..42a0cd2684a 100644 --- a/opal/mca/btl/uct/btl_uct_endpoint.c +++ b/opal/mca/btl/uct/btl_uct_endpoint.c @@ -256,50 +256,57 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl, return OPAL_SUCCESS; } -static int mca_btl_uct_endpoint_send_connection_data( - mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, - mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, - uint8_t *conn_tl_data, int request_type) +static int mca_btl_uct_endpoint_get_helper_endpoint(mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_tl_t *conn_tl, uint8_t *conn_tl_data) { - mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl; - mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; - uct_device_addr_t *device_addr = NULL; - uct_iface_addr_t *iface_addr; - ucs_status_t ucs_status; - - assert(NULL != conn_tl); - - BTL_VERBOSE(("connecting endpoint to remote endpoint")); + if (NULL != endpoint->conn_ep) { + BTL_VERBOSE(("re-using existing connection endpoint")); + OBJ_RETAIN(endpoint->conn_ep); + return OPAL_SUCCESS; + } - if (NULL == endpoint->conn_ep) { - BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p", - opal_process_name_print(endpoint->ep_proc->proc_name))); + BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p", + opal_process_name_print(endpoint->ep_proc->proc_name))); - iface_addr = (uct_iface_addr_t *) conn_tl_data; - device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data - + MCA_BTL_UCT_TL_ATTR(conn_tl, 0).iface_addr_len); + uct_iface_addr_t *iface_addr = (uct_iface_addr_t *) conn_tl_data; + uct_device_addr_t *device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data + + 
MCA_BTL_UCT_TL_ATTR(conn_tl, 0).iface_addr_len); - endpoint->conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t); - if (OPAL_UNLIKELY(NULL == endpoint->conn_ep)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } + endpoint->conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t); + if (OPAL_UNLIKELY(NULL == endpoint->conn_ep)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } - /* create a temporary endpoint for setting up the rdma endpoint */ - MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { + ucs_status_t ucs_status; + mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; + /* create a temporary endpoint for setting up the rdma endpoint */ + MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { ucs_status = mca_btl_uct_ep_create_connected_compat(conn_tl_context->uct_iface, device_addr, iface_addr, &endpoint->conn_ep->uct_ep); }); - if (UCS_OK != ucs_status) { - BTL_VERBOSE( - ("could not create an endpoint for forming connection to remote peer. code = %d", - ucs_status)); - return OPAL_ERROR; - } - } else { - OBJ_RETAIN(endpoint->conn_ep); + if (UCS_OK != ucs_status) { + BTL_VERBOSE( + ("could not create an endpoint for forming connection to remote peer. 
code = %d", + ucs_status)); + return OPAL_ERROR; } + return OPAL_SUCCESS; +} + +static int mca_btl_uct_endpoint_send_connection_data( + mca_btl_uct_module_t *uct_btl, mca_btl_uct_tl_t *conn_tl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, + mca_btl_uct_tl_endpoint_t *tl_endpoint, int request_type) +{ + mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; + ucs_status_t ucs_status; + + assert(NULL != conn_tl); + + BTL_VERBOSE(("connecting endpoint to remote endpoint")); + size_t request_length = sizeof(mca_btl_uct_conn_req_t) + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len; mca_btl_uct_conn_req_t *request = alloca(request_length); @@ -309,6 +316,7 @@ static int mca_btl_uct_endpoint_send_connection_data( request->context_id = tl_context->context_id; request->tl_index = tl->tl_index; request->type = request_type; + strncpy(request->module_name, uct_btl->md_name, sizeof(request->module_name) - 1); /* fill in connection request */ ucs_status = uct_ep_get_address(tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr); @@ -337,9 +345,9 @@ static int mca_btl_uct_endpoint_send_connection_data( } static int mca_btl_uct_endpoint_connect_endpoint( - mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, - mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, - uint8_t *tl_data, uint8_t *conn_tl_data, void *ep_addr) + mca_btl_uct_module_t *uct_btl, mca_btl_uct_tl_t *conn_tl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, + mca_btl_uct_tl_endpoint_t *tl_endpoint, uint8_t *tl_data, void *ep_addr) { ucs_status_t ucs_status; @@ -378,11 +386,44 @@ static int mca_btl_uct_endpoint_connect_endpoint( : OPAL_ERR_OUT_OF_RESOURCE; } - int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, endpoint, tl, tl_context, tl_endpoint, - conn_tl_data, /*request_type=*/!!ep_addr); + int 
rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, conn_tl, endpoint, tl, tl_context, tl_endpoint, + /*request_type=*/!!ep_addr); return (OPAL_SUCCESS == rc) ? OPAL_ERR_OUT_OF_RESOURCE : rc; } +static int mca_btl_uct_find_modex(mca_btl_uct_module_t *uct_btl, mca_btl_uct_modex_t *modex, + uint8_t **rdma_tl_data, uint8_t **am_tl_data, uint8_t **conn_tl_data) { + uint8_t *modex_data = modex->data; + + /* look for matching transport in the modex */ + for (int i = 0; i < modex->module_count; ++i) { + uint32_t modex_size = *((uint32_t *) modex_data); + + BTL_VERBOSE(("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md_name)); + + modex_data += 4; + + if (0 != strcmp((char *) modex_data, uct_btl->md_name)) { + /* modex belongs to a different module, skip it and continue */ + modex_data += modex_size - 4; + continue; + } + + modex_data += strlen((char *) modex_data) + 1; + + mca_btl_uct_process_modex(uct_btl, modex_data, rdma_tl_data, am_tl_data, conn_tl_data); + + BTL_VERBOSE(("finished processing modex for %s", uct_btl->md_name)); + + return OPAL_SUCCESS; + } + + BTL_ERROR(("could not find modex for %s", uct_btl->md_name)); + + return OPAL_ERR_NOT_FOUND; +} + + int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint, int context_id, void *ep_addr, int tl_index) { @@ -394,7 +435,6 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp = mca_btl_uct_module_get_tl_context_specific(uct_btl, tl, context_id); uint8_t *rdma_tl_data = NULL, *conn_tl_data = NULL, *am_tl_data = NULL, *tl_data; mca_btl_uct_modex_t *modex; - uint8_t *modex_data; size_t msg_size; int rc; @@ -410,19 +450,20 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp !!(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags))); opal_mutex_lock(&endpoint->ep_lock); - if (MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags) { - opal_mutex_unlock(&endpoint->ep_lock); - /* nothing 
more to do. someone else completed the connection */ - return OPAL_SUCCESS; - } - - /* dumpicate connection request. nothing to do until the endpoint data is received */ - if (NULL != tl_endpoint->uct_ep && NULL == ep_addr) { - opal_mutex_unlock(&endpoint->ep_lock); - return OPAL_ERR_OUT_OF_RESOURCE; - } do { + if (MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags) { + /* nothing more to do. someone else completed the connection */ + rc = OPAL_SUCCESS; + break; + } + + /* duplicate connection request. nothing to do until the endpoint data is received */ + if (NULL != tl_endpoint->uct_ep && NULL == ep_addr) { + rc = OPAL_ERR_OUT_OF_RESOURCE; + break; + } + /* read the modex. this is done both to start the connection and to process endpoint data */ OPAL_MODEX_RECV(rc, &mca_btl_uct_component.super.btl_version, &endpoint->ep_proc->proc_name, (void **) &modex, &msg_size); @@ -434,45 +475,50 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp BTL_VERBOSE(("received modex of size %lu for proc %s. module count %d", (unsigned long) msg_size, OPAL_NAME_PRINT(endpoint->ep_proc->proc_name), modex->module_count)); - modex_data = modex->data; - - /* look for matching transport in the modex */ - for (int i = 0; i < modex->module_count; ++i) { - uint32_t modex_size = *((uint32_t *) modex_data); - - BTL_VERBOSE( - ("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md_name)); - - modex_data += 4; - - if (0 != strcmp((char *) modex_data, uct_btl->md_name)) { - /* modex belongs to a different module, skip it and continue */ - modex_data += modex_size - 4; - continue; - } - modex_data += strlen((char *) modex_data) + 1; - - mca_btl_uct_process_modex(uct_btl, modex_data, &rdma_tl_data, &am_tl_data, - &conn_tl_data); + rc = mca_btl_uct_find_modex (uct_btl, modex, &rdma_tl_data, &am_tl_data, &conn_tl_data); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { break; } tl_data = (tl == uct_btl->rdma_tl) ? 
rdma_tl_data : am_tl_data; - if (NULL == tl_data) { - opal_mutex_unlock(&endpoint->ep_lock); - return OPAL_ERR_UNREACH; + if (OPAL_UNLIKELY(NULL == tl_data)) { + BTL_ERROR(("could not find modex data for this transport")); + rc = OPAL_ERR_UNREACH; + break; } /* connect the endpoint */ - if (!mca_btl_uct_tl_requires_connection_tl(tl)) { - rc = mca_btl_uct_endpoint_connect_iface(uct_btl, tl, tl_context, tl_endpoint, tl_data); + if (mca_btl_uct_tl_requires_connection_tl(tl)) { + mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl; + if (NULL == conn_tl) { + rc = mca_btl_uct_find_modex (mca_btl_uct_component.conn_module, modex, + /*rdma_tl_data=*/NULL, /*am_tl_data=*/NULL, + &conn_tl_data); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + BTL_ERROR(("could not find modex for connection module")); + break; + } + + BTL_VERBOSE(("using separate connection module for tl")); + conn_tl = mca_btl_uct_component.conn_module->conn_tl; + } + + if (NULL == tl_endpoint->uct_ep) { + /* allocate or retain a connection endpoint */ + rc = mca_btl_uct_endpoint_get_helper_endpoint(uct_btl, endpoint, conn_tl, + conn_tl_data); + if (OPAL_SUCCESS != rc) { + break; + } + } + + rc = mca_btl_uct_endpoint_connect_endpoint(uct_btl, conn_tl, endpoint, tl, + tl_context, tl_endpoint, tl_data, ep_addr); } else { - rc = mca_btl_uct_endpoint_connect_endpoint(uct_btl, endpoint, tl, tl_context, - tl_endpoint, tl_data, conn_tl_data, ep_addr); + rc = mca_btl_uct_endpoint_connect_iface(uct_btl, tl, tl_context, tl_endpoint, tl_data); } - } while (0); opal_mutex_unlock(&endpoint->ep_lock); diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index c1ef4c6d727..fd8061fa81c 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -553,57 +553,27 @@ static int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_ } int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, - uct_tl_resource_desc_t *tl_descs, unsigned tl_count) + 
uct_tl_resource_desc_t *tl_descs, unsigned tl_count, + bool evaluate_for_conn_only) { - bool include = true, any = false; mca_btl_uct_tl_t *tl; opal_list_t tl_list; - char **tl_filter; - int any_priority = 0; OBJ_CONSTRUCT(&tl_list, opal_list_t); - tl_filter = opal_argv_split(mca_btl_uct_component.allowed_transports, ','); - - if ('^' == tl_filter[0][0]) { - /* user has negated the include list */ - char *tmp = strdup(tl_filter[0] + 1); - - free(tl_filter[0]); - tl_filter[0] = tmp; - include = false; - } - - /* check for the any keyword */ - for (unsigned j = 0; tl_filter[j]; ++j) { - if (0 == strcmp(tl_filter[j], "any")) { - any_priority = j; - any = true; - break; - } - } - - if (any && !include) { - opal_argv_free(tl_filter); - return OPAL_ERR_NOT_AVAILABLE; - } - for (unsigned i = 0; i < tl_count; ++i) { - bool try_tl = any; - int priority = any_priority; - - for (unsigned j = 0; tl_filter[j]; ++j) { - if (0 == strcmp(tl_filter[j], tl_descs[i].tl_name)) { - try_tl = include; - priority = j; - break; + int priority = 0; + BTL_VERBOSE(("processing tl %s, evaluate_for_conn_only=%d", tl_descs[i].tl_name, evaluate_for_conn_only)); + + if (!evaluate_for_conn_only) { + priority = mca_btl_uct_include_list_rank (tl_descs[i].tl_name, &mca_btl_uct_component.allowed_transport_list); + BTL_VERBOSE(("tl filter: tl_name = %s, priority = %d", tl_descs[i].tl_name, + priority)); + if (priority < 0) { + continue; } - } - - BTL_VERBOSE(("tl filter: tl_name = %s, use = %d, priority = %d", tl_descs[i].tl_name, - try_tl, priority)); - - if (!try_tl) { + } else if (tl_descs[i].dev_type != UCT_DEVICE_TYPE_NET) { + /* only network types are suitable for forming connections */ continue; } @@ -616,12 +586,23 @@ int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, tl = mca_btl_uct_create_tl(module, md, tl_descs + i, priority); if (tl) { - opal_list_append(&tl_list, &tl->super); + if (mca_btl_uct_tl_supports_conn(tl) && evaluate_for_conn_only) { + 
BTL_VERBOSE(("evaluating tl %s for forming connections", tl_descs[i].tl_name)); + int rc = mca_btl_uct_set_tl_conn(module, tl); + OBJ_RELEASE(tl); + + if (OPAL_SUCCESS == rc) { + mca_btl_uct_context_enable_progress(tl->uct_dev_contexts[0]); + return OPAL_SUCCESS; + } + + BTL_VERBOSE(("tl %s cannot be used for forming connections", tl_descs[i].tl_name)); + } else { + opal_list_append(&tl_list, &tl->super); + } } } - opal_argv_free(tl_filter); - if (0 == opal_list_get_size(&tl_list)) { BTL_VERBOSE(("no suitable tls match filter: %s", mca_btl_uct_component.allowed_transports)); OBJ_DESTRUCT(&tl_list); @@ -670,10 +651,6 @@ int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, /* no connection tl needed for selected transports */ OBJ_RELEASE(module->conn_tl); module->conn_tl = NULL; - } else if (NULL == module->conn_tl) { - BTL_VERBOSE(("a connection tl is required but no tls match the filter %s", - mca_btl_uct_component.allowed_transports)); - return OPAL_ERROR; } return OPAL_SUCCESS; diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index b2bac61be61..7b54b02d640 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -90,6 +90,8 @@ struct mca_btl_uct_conn_req_t { /** transport index that should be connected */ int tl_index; + char module_name[16]; + /** endpoint address data */ uint8_t ep_addr[]; }; @@ -347,4 +349,17 @@ struct mca_btl_uct_pending_connection_request_t { typedef struct mca_btl_uct_pending_connection_request_t mca_btl_uct_pending_connection_request_t; OBJ_CLASS_DECLARATION(mca_btl_uct_pending_connection_request_t); +/** + * @brief parsed include/exclude list + * + */ +struct mca_btl_uct_include_list_t { + /** argv-style (NULL terminated) array of strings */ + char **list; + /** is an inclusive list (vs exclusive) */ + bool include; +}; +typedef struct mca_btl_uct_include_list_t mca_btl_uct_include_list_t; + + #endif /* !defined(BTL_UCT_TYPES_H) */