From 8b3e7b5e3eb56b5167aff978cfe0184c63aadc93 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 27 Feb 2025 13:59:40 -0700 Subject: [PATCH] btl/uct: allow connections to be formed using a separate memory domain It is possible that the current memory domain does not have an adequate transport for forming endpoint to endpoint connections. When this is the case the btl will fail to function. To support these situations this CL adds support for using an alternate transport (usually tcp) which can be used to make the endpoint connections. Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/btl_uct.h | 23 +- opal/mca/btl/uct/btl_uct_am.c | 1 - opal/mca/btl/uct/btl_uct_component.c | 358 ++++++++++++++++++++------- opal/mca/btl/uct/btl_uct_endpoint.c | 202 +++++++++------ opal/mca/btl/uct/btl_uct_tl.c | 77 ++---- opal/mca/btl/uct/btl_uct_types.h | 15 ++ 6 files changed, 459 insertions(+), 217 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h index 65bc69fddb2..b07a6efc9a6 100644 --- a/opal/mca/btl/uct/btl_uct.h +++ b/opal/mca/btl/uct/btl_uct.h @@ -141,9 +141,15 @@ struct mca_btl_uct_component_t { /** allowed UCT memory domains */ char *memory_domains; + mca_btl_uct_include_list_t memory_domain_list; /** allowed transports */ char *allowed_transports; + mca_btl_uct_include_list_t allowed_transport_list; + + /** transports to consider for forming connections */ + char *connection_domains; + mca_btl_uct_include_list_t connection_domain_list; /** number of worker contexts to create */ int num_contexts_per_module; @@ -158,6 +164,10 @@ struct mca_btl_uct_component_t { /** connection retry timeout */ unsigned int connection_retry_timeout; + + /** alternate connection-only module that can be used if no suitable + * connection tl is found. this is usually a tcp tl. 
*/ + mca_btl_uct_module_t *conn_module; }; typedef struct mca_btl_uct_component_t mca_btl_uct_component_t; @@ -294,7 +304,8 @@ struct mca_btl_base_endpoint_t *mca_btl_uct_get_ep(struct mca_btl_base_module_t opal_proc_t *proc); int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, - uct_tl_resource_desc_t *tl_descs, unsigned tl_count); + uct_tl_resource_desc_t *tl_descs, unsigned tl_count, + bool evaluate_for_conn_only); int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, mca_btl_uct_conn_req_t *req); @@ -341,5 +352,15 @@ static inline bool mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_tl_t *tl) return !(MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE); } +/** + * @brief Find the rank of `name` in the include list `list`. + * + * @param[in] name name to find + * @param[in] list list to search + * + * A negative result means the name is rejected: it is missing from an include list or it matches an entry of a negated list. + */ +int mca_btl_uct_include_list_rank (const char *name, const mca_btl_uct_include_list_t *list); + END_C_DECLS #endif diff --git a/opal/mca/btl/uct/btl_uct_am.c b/opal/mca/btl/uct/btl_uct_am.c index 1aae456842c..85d89d2d734 100644 --- a/opal/mca/btl/uct/btl_uct_am.c +++ b/opal/mca/btl/uct/btl_uct_am.c @@ -55,7 +55,6 @@ static inline void _mca_btl_uct_send_pack(void *data, void *header, size_t heade { uint32_t iov_count = 1; struct iovec iov; - size_t length; if (header_size > 0) { assert(NULL != header); diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index 5eec97ec487..5f4ed3776c6 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -27,6 +27,8 @@ * $HEADER$ */ +#include <regex.h> + #include "opal_config.h" #include "opal/mca/btl/base/base.h" @@ -67,6 +69,15 @@ static int mca_btl_uct_component_register(void) MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, 
&mca_btl_uct_component.allowed_transports); + mca_btl_uct_component.connection_domains = "tcp"; + (void) mca_base_component_var_register( + &mca_btl_uct_component.super.btl_version, "connection_domains", + "Comma-delimited list of connection-only domains to use sorted by increasing " + "priority. The list of transports available can be queried using ucx_info. Special" + "values: any (any available) (default: tcp)", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.connection_domains); + mca_btl_uct_component.num_contexts_per_module = 0; (void) mca_base_component_var_register( &mca_btl_uct_component.super.btl_version, "num_contexts_per_module", @@ -124,6 +135,54 @@ static void mca_btl_uct_mem_release_cb(void *buf, size_t length, void *cbdata, b ucm_vm_munmap(buf, length); } +static void mca_btl_uct_component_parse_include_list (const char *value, mca_btl_uct_include_list_t *list) { + list->list = NULL; + list->include = true; + + if (value == NULL) { + return; + } + + if (value[0] == '^') { + list->include = false; + value++; + } + + list->list = opal_argv_split(value, ','); +} + +static void mca_btl_uct_include_list_free (mca_btl_uct_include_list_t *list) { + opal_argv_free (list->list); + list->list = NULL; +} + +int mca_btl_uct_include_list_rank (const char *name, const mca_btl_uct_include_list_t *list) { + if (list->list == NULL) { + return -1; + } + + for (int i = 0; list->list[i]; ++i) { + regex_t preg; + + BTL_VERBOSE(("evaluating %s vs %s-list item %s", name, list->include ? 
"include" : "exclude", list->list[i])); + int rc = regcomp(&preg, list->list[i], REG_ICASE); + if (0 != rc) { + char errbuf[256]; + regerror(rc, &preg, errbuf, sizeof(errbuf)); + BTL_ERROR(("when matching name, could not parse regular expression: %s, error: %s", list->list[i], errbuf)); + continue; + } + + int result = regexec(&preg, name, /*nmatch=*/0, /*pmatch=*/NULL, /*eflags=*/0); + regfree(&preg); + if (0 == result) { + return list->include ? i + 1 : -(i + 1); + } + } + + return list->include ? -1 : 1; +} + static int mca_btl_uct_component_open(void) { if (0 == mca_btl_uct_component.num_contexts_per_module) { @@ -167,10 +226,19 @@ static int mca_btl_uct_component_open(void) */ static int mca_btl_uct_component_close(void) { + if (NULL != mca_btl_uct_component.conn_module) { + mca_btl_uct_finalize (&mca_btl_uct_component.conn_module->super); + mca_btl_uct_component.conn_module = NULL; + } + if (mca_btl_uct_component.disable_ucx_memory_hooks) { opal_mem_hooks_unregister_release(mca_btl_uct_mem_release_cb); } + mca_btl_uct_include_list_free (&mca_btl_uct_component.memory_domain_list); + mca_btl_uct_include_list_free (&mca_btl_uct_component.allowed_transport_list); + mca_btl_uct_include_list_free (&mca_btl_uct_component.connection_domain_list); + return OPAL_SUCCESS; } @@ -235,6 +303,34 @@ static size_t mca_btl_uct_tl_modex_pack(mca_btl_uct_tl_t *tl, uint8_t *modex_dat return modex_size; } +static uint8_t *mca_btl_uct_modex_pack(mca_btl_uct_module_t *module, uint8_t *modex_data) +{ + size_t name_len = strlen(module->md_name); + + /* pack the size */ + *((uint32_t *) modex_data) = (uint32_t) mca_btl_uct_module_modex_size(module); + + modex_data += 4; + + strcpy((char *) modex_data, module->md_name); + modex_data += name_len + 1; + + if (module->rdma_tl) { + modex_data += mca_btl_uct_tl_modex_pack(module->rdma_tl, modex_data); + } + + if (module->am_tl && module->am_tl != module->rdma_tl) { + modex_data += mca_btl_uct_tl_modex_pack(module->am_tl, modex_data); + } + + 
if (module->conn_tl && module->conn_tl != module->rdma_tl + && module->conn_tl != module->am_tl) { + modex_data += mca_btl_uct_tl_modex_pack(module->conn_tl, modex_data); + } + + return modex_data; +} + static int mca_btl_uct_modex_send(void) { size_t modex_size = sizeof(mca_btl_uct_modex_t); @@ -246,35 +342,22 @@ static int mca_btl_uct_modex_send(void) modex_size += mca_btl_uct_module_modex_size(mca_btl_uct_component.modules[i]); } + if (mca_btl_uct_component.conn_module != NULL) { + modex_size += mca_btl_uct_module_modex_size(mca_btl_uct_component.conn_module); + } + modex = alloca(modex_size); modex_data = modex->data; modex->module_count = mca_btl_uct_component.module_count; for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { - mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; - size_t name_len = strlen(module->md_name); - - /* pack the size */ - *((uint32_t *) modex_data) = (uint32_t) mca_btl_uct_module_modex_size(module); - - modex_data += 4; - - strcpy((char *) modex_data, module->md_name); - modex_data += name_len + 1; - - if (module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->rdma_tl, modex_data); - } - - if (module->am_tl && module->am_tl != module->rdma_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->am_tl, modex_data); - } + modex_data = mca_btl_uct_modex_pack (mca_btl_uct_component.modules[i], modex_data); + } - if (module->conn_tl && module->conn_tl != module->rdma_tl - && module->conn_tl != module->am_tl) { - modex_data += mca_btl_uct_tl_modex_pack(module->conn_tl, modex_data); - } + if (mca_btl_uct_component.conn_module != NULL) { + ++modex->module_count; + modex_data = mca_btl_uct_modex_pack (mca_btl_uct_component.conn_module, modex_data); } OPAL_MODEX_SEND(rc, PMIX_GLOBAL, &mca_btl_uct_component.super.btl_version, modex, modex_size); @@ -323,6 +406,10 @@ ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsign mca_btl_uct_device_context_t *tl_context = 
(mca_btl_uct_device_context_t *) arg; mca_btl_uct_module_t *uct_btl = tl_context->uct_btl; mca_btl_uct_am_header_t *header = (mca_btl_uct_am_header_t *) data; + if (header->data.tag == 0xff) { + fprintf (stderr, "%d: got an invalid tag\n"); + while (true) {} + } mca_btl_active_message_callback_t *reg = mca_btl_base_active_message_trigger + header->data.tag; mca_btl_base_segment_t seg = {.seg_addr = {.pval = (void *) ((intptr_t) data + sizeof(*header))}, @@ -337,17 +424,16 @@ ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsign tl_context->in_am_callback = true; reg->cbfunc(&uct_btl->super, &desc); tl_context->in_am_callback = false; + header->data.tag = 0xff; return UCS_OK; } #if UCT_API >= UCT_VERSION(1, 7) static int mca_btl_uct_component_process_uct_md(uct_component_h component, - uct_md_resource_desc_t *md_desc, - char **allowed_ifaces) + uct_md_resource_desc_t *md_desc) #else -static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, - char **allowed_ifaces) +static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc) #endif { mca_rcache_base_resources_t rcache_resources; @@ -356,29 +442,35 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, uct_md_config_t *uct_config; uct_md_attr_t md_attr; mca_btl_uct_md_t *md; - bool found = false; + int list_rank; unsigned num_tls; char *tmp; ucs_status_t ucs_status; + int connection_list_rank = -1; + bool consider_for_connection_module = false; + + BTL_VERBOSE(("processing memory domain %s", md_desc->md_name)); if (MCA_BTL_UCT_MAX_MODULES == mca_btl_uct_component.module_count) { BTL_VERBOSE(("created the maximum number of allowable modules")); return OPAL_ERR_NOT_AVAILABLE; } - BTL_VERBOSE(("processing memory domain %s", md_desc->md_name)); + BTL_VERBOSE(("checking if %s should be used for communication", md_desc->md_name)); + list_rank = mca_btl_uct_include_list_rank (md_desc->md_name, 
&mca_btl_uct_component.memory_domain_list); - for (int j = 0; allowed_ifaces[j]; ++j) { - if (0 == strncmp(allowed_ifaces[j], md_desc->md_name, strlen(md_desc->md_name)) - || 0 == strcmp(allowed_ifaces[j], "all")) { - found = true; - break; + if (list_rank < 0) { + BTL_VERBOSE(("checking if %s should be used for connections", md_desc->md_name)); + connection_list_rank = mca_btl_uct_include_list_rank (md_desc->md_name, &mca_btl_uct_component.connection_domain_list); + + if (connection_list_rank < 0) { + /* nothing to do */ + BTL_VERBOSE(("not continuing with memory domain %s", md_desc->md_name)); + return OPAL_SUCCESS; } - } - if (!found) { - /* nothing to do */ - return OPAL_SUCCESS; + BTL_VERBOSE(("will be considering domain %s for connections only", md_desc->md_name)); + consider_for_connection_module = true; } md = OBJ_NEW(mca_btl_uct_md_t); @@ -425,7 +517,9 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, return OPAL_ERR_OUT_OF_RESOURCE; } - (void) mca_btl_uct_query_tls(module, md, tl_desc, num_tls); + /* if this module is not to be used for communication check if it has a transport suitable + * for forming connections. */ + (void) mca_btl_uct_query_tls(module, md, tl_desc, num_tls, consider_for_connection_module); uct_release_tl_resource_list(tl_desc); @@ -433,7 +527,7 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, * remain open until those modules are finalized. 
*/ OBJ_RELEASE(md); - if (NULL == module->am_tl && NULL == module->rdma_tl) { + if (NULL == module->am_tl && NULL == module->rdma_tl && (NULL == module->conn_tl || !consider_for_connection_module)) { BTL_VERBOSE(("uct memory domain %s does not have any appropriate tls", md_desc->md_name)); mca_btl_uct_finalize(&module->super); return OPAL_ERR_NOT_AVAILABLE; @@ -443,35 +537,43 @@ static int mca_btl_uct_component_process_uct_md(uct_md_resource_desc_t *md_desc, module->uct_component = component; #endif - mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; - - /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable - * performance benefits to using rcache/grdma instead of assuming UCT will do the right - * thing. */ - (void) opal_asprintf(&tmp, "uct.%s", module->md_name); - - rcache_resources.cache_name = tmp; - rcache_resources.reg_data = (void *) module; - rcache_resources.sizeof_reg = sizeof(mca_btl_uct_reg_t) - + module->super.btl_registration_handle_size; - rcache_resources.register_mem = mca_btl_uct_reg_mem; - rcache_resources.deregister_mem = mca_btl_uct_dereg_mem; - - module->rcache = mca_rcache_base_module_create("grdma", module, &rcache_resources); - free(tmp); - if (NULL == module->rcache) { - /* something when horribly wrong */ - BTL_VERBOSE(("could not allocate a registration cache for this btl module")); - mca_btl_uct_finalize(&module->super); - return OPAL_ERROR; + if (!consider_for_connection_module) { + mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; + + /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable + * performance benefits to using rcache/grdma instead of assuming UCT will do the right + * thing. 
*/ + (void) opal_asprintf(&tmp, "uct.%s", module->md_name); + + rcache_resources.cache_name = tmp; + rcache_resources.reg_data = (void *) module; + rcache_resources.sizeof_reg = sizeof(mca_btl_uct_reg_t) + + module->super.btl_registration_handle_size; + rcache_resources.register_mem = mca_btl_uct_reg_mem; + rcache_resources.deregister_mem = mca_btl_uct_dereg_mem; + + module->rcache = mca_rcache_base_module_create("grdma", module, &rcache_resources); + free(tmp); + if (NULL == module->rcache) { + /* something went horribly wrong */ + BTL_VERBOSE(("could not allocate a registration cache for this btl module")); + mca_btl_uct_finalize(&module->super); + return OPAL_ERROR; + } + } else { + if (NULL == mca_btl_uct_component.conn_module) { + BTL_VERBOSE(("memory domain %s may be used for connections", md_desc->md_name)); + mca_btl_uct_component.conn_module = module; + } else { + mca_btl_uct_finalize(&module->super); + } } return OPAL_SUCCESS; } #if UCT_API >= UCT_VERSION(1, 7) -static int mca_btl_uct_component_process_uct_component(uct_component_h component, - char **allowed_ifaces) +static int mca_btl_uct_component_process_uct_component(uct_component_h component) { uct_component_attr_t attr = {.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME | UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT}; @@ -493,7 +595,7 @@ static int mca_btl_uct_component_process_uct_component(uct_component_h component } for (unsigned i = 0; i < attr.md_resource_count; ++i) { - rc = mca_btl_uct_component_process_uct_md(component, attr.md_resources + i, allowed_ifaces); + rc = mca_btl_uct_component_process_uct_md(component, attr.md_resources + i); if (OPAL_SUCCESS != rc) { break; } @@ -505,6 +607,63 @@ static int mca_btl_uct_component_process_uct_component(uct_component_h component } #endif /* UCT_API >= UCT_VERSION(1, 7) */ +static void mca_btl_uct_component_validate_modules(void) { + if (mca_btl_uct_component.conn_module != NULL) { + /* verify that a connection-only module is required. 
this might be the case in some systems + * where rc verbs is available but ud is not. */ + bool need_conn_module = false; + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; + if (module->conn_tl != NULL) { + continue; + } + if ((module->rdma_tl && mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)) || + (module->am_tl && mca_btl_uct_tl_requires_connection_tl(module->am_tl))) { + need_conn_module = true; + break; + } + } + + if (!need_conn_module) { + mca_btl_uct_finalize (&mca_btl_uct_component.conn_module->super); + mca_btl_uct_component.conn_module = NULL; + } + } else { + int usable_module_count = mca_btl_uct_component.module_count; + + /* check that all modules can be used */ + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; + if (NULL != module->conn_tl) { + /* module has its own connection transport */ + continue; + } + + if (((module->rdma_tl && mca_btl_uct_tl_requires_connection_tl(module->rdma_tl)) || + (module->am_tl && mca_btl_uct_tl_requires_connection_tl(module->am_tl))) + && NULL == module->conn_tl) { + /* module can not be used */ + BTL_VERBOSE(("module for memory domain %s can not be used due to missing connection transport", + module->md_name)); + mca_btl_uct_finalize (&mca_btl_uct_component.modules[i]->super); + mca_btl_uct_component.modules[i] = NULL; + } + } + + /* remove holes in the module array */ + if (usable_module_count < mca_btl_uct_component.module_count) { + for (int i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + if (mca_btl_uct_component.modules[i] == NULL) { + for (int j = i ; j < mca_btl_uct_component.module_count ; ++j) { + mca_btl_uct_component.modules[i++] = mca_btl_uct_component.modules[j]; + } + } + } + mca_btl_uct_component.module_count = usable_module_count; + } + } +} + /* * UCT component initialization: * (1) read interface list from kernel and 
compare against component parameters @@ -521,7 +680,6 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, */ struct mca_btl_base_module_t **base_modules; ucs_status_t ucs_status; - char **allowed_ifaces; int rc; BTL_VERBOSE(("initializing uct btl")); @@ -533,10 +691,12 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, return NULL; } - allowed_ifaces = opal_argv_split(mca_btl_uct_component.memory_domains, ','); - if (NULL == allowed_ifaces) { - return NULL; - } + mca_btl_uct_component_parse_include_list(mca_btl_uct_component.memory_domains, + &mca_btl_uct_component.memory_domain_list); + mca_btl_uct_component_parse_include_list(mca_btl_uct_component.allowed_transports, + &mca_btl_uct_component.allowed_transport_list); + mca_btl_uct_component_parse_include_list(mca_btl_uct_component.connection_domains, + &mca_btl_uct_component.connection_domain_list); mca_btl_uct_component.module_count = 0; @@ -552,7 +712,7 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, /* generate all suitable btl modules */ for (unsigned i = 0; i < num_components; ++i) { - rc = mca_btl_uct_component_process_uct_component(components[i], allowed_ifaces); + rc = mca_btl_uct_component_process_uct_component(components[i]); if (OPAL_SUCCESS != rc) { break; } @@ -568,7 +728,7 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, /* generate all suitable btl modules */ for (unsigned i = 0; i < resource_count; ++i) { - rc = mca_btl_uct_component_process_uct_md(resources + i, allowed_ifaces); + rc = mca_btl_uct_component_process_uct_md(resources + i); if (OPAL_SUCCESS != rc) { break; } @@ -578,7 +738,9 @@ static mca_btl_base_module_t **mca_btl_uct_component_init(int *num_btl_modules, #endif /* UCT_API >= UCT_VERSION(1, 7) */ - opal_argv_free(allowed_ifaces); + /* filter out unusable modules before sending the modex */ + mca_btl_uct_component_validate_modules(); + 
mca_btl_uct_modex_send(); /* pass module array back to caller */ @@ -644,6 +806,38 @@ static int mca_btl_uct_component_progress_pending(mca_btl_uct_module_t *uct_btl) return completed; } +static int mca_btl_uct_component_progress_connections (mca_btl_uct_module_t *module) { + mca_btl_uct_pending_connection_request_t *request; + int ret; + + if (module->conn_tl == NULL) { + return 0; + } + + ret = mca_btl_uct_tl_progress(module->conn_tl, 0); + + while (NULL + != (request = (mca_btl_uct_pending_connection_request_t *) opal_fifo_pop_atomic( + &module->pending_connection_reqs))) { + mca_btl_uct_conn_req_t *conn_req = (mca_btl_uct_conn_req_t *) request->request_data; + BTL_VERBOSE(("processing connection request....")); + for (int i = 0; i < mca_btl_uct_component.module_count; ++i) { + if (0 == strncmp(mca_btl_uct_component.modules[i]->md_name, conn_req->module_name, sizeof(conn_req->module_name) - 1)) { + module = mca_btl_uct_component.modules[i]; + break; + } + } + int rc = mca_btl_uct_process_connection_request(module, conn_req); + if (rc != OPAL_SUCCESS) { + opal_fifo_push_atomic(&module->pending_connection_reqs, &request->super); + break; + } + OBJ_RELEASE(request); + } + + return ret; +} + /** * @brief UCT BTL progress function * @@ -665,27 +859,17 @@ static int mca_btl_uct_component_progress(void) ret += mca_btl_uct_tl_progress(module->am_tl, starting_index); } - if (module->conn_tl) { - mca_btl_uct_pending_connection_request_t *request; - - if (module->conn_tl != module->am_tl && module->conn_tl != module->rdma_tl) { - ret += mca_btl_uct_tl_progress(module->conn_tl, 0); - } - - while (NULL - != (request = (mca_btl_uct_pending_connection_request_t *) opal_fifo_pop_atomic( - &module->pending_connection_reqs))) { - mca_btl_uct_process_connection_request(module, (mca_btl_uct_conn_req_t *) - request->request_data); - OBJ_RELEASE(request); - } - } - + mca_btl_uct_component_progress_connections (module); + if (0 != opal_list_get_size(&module->pending_frags)) { 
mca_btl_uct_component_progress_pending(module); } } + if (NULL != mca_btl_uct_component.conn_module) { + ret += mca_btl_uct_component_progress_connections (mca_btl_uct_component.conn_module); + } + return (int) ret; } diff --git a/opal/mca/btl/uct/btl_uct_endpoint.c b/opal/mca/btl/uct/btl_uct_endpoint.c index 695fd754aa2..42a0cd2684a 100644 --- a/opal/mca/btl/uct/btl_uct_endpoint.c +++ b/opal/mca/btl/uct/btl_uct_endpoint.c @@ -256,50 +256,57 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl, return OPAL_SUCCESS; } -static int mca_btl_uct_endpoint_send_connection_data( - mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, - mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, - uint8_t *conn_tl_data, int request_type) +static int mca_btl_uct_endpoint_get_helper_endpoint(mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_tl_t *conn_tl, uint8_t *conn_tl_data) { - mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl; - mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; - uct_device_addr_t *device_addr = NULL; - uct_iface_addr_t *iface_addr; - ucs_status_t ucs_status; - - assert(NULL != conn_tl); - - BTL_VERBOSE(("connecting endpoint to remote endpoint")); + if (NULL != endpoint->conn_ep) { + BTL_VERBOSE(("re-using existing connection endpoint")); + OBJ_RETAIN(endpoint->conn_ep); + return OPAL_SUCCESS; + } - if (NULL == endpoint->conn_ep) { - BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p", - opal_process_name_print(endpoint->ep_proc->proc_name))); + BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p", + opal_process_name_print(endpoint->ep_proc->proc_name))); - iface_addr = (uct_iface_addr_t *) conn_tl_data; - device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data - + MCA_BTL_UCT_TL_ATTR(conn_tl, 0).iface_addr_len); + uct_iface_addr_t *iface_addr = (uct_iface_addr_t *) 
conn_tl_data; + uct_device_addr_t *device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data + + MCA_BTL_UCT_TL_ATTR(conn_tl, 0).iface_addr_len); - endpoint->conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t); - if (OPAL_UNLIKELY(NULL == endpoint->conn_ep)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } + endpoint->conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t); + if (OPAL_UNLIKELY(NULL == endpoint->conn_ep)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } - /* create a temporary endpoint for setting up the rdma endpoint */ - MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { + ucs_status_t ucs_status; + mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; + /* create a temporary endpoint for setting up the rdma endpoint */ + MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { ucs_status = mca_btl_uct_ep_create_connected_compat(conn_tl_context->uct_iface, device_addr, iface_addr, &endpoint->conn_ep->uct_ep); }); - if (UCS_OK != ucs_status) { - BTL_VERBOSE( - ("could not create an endpoint for forming connection to remote peer. code = %d", - ucs_status)); - return OPAL_ERROR; - } - } else { - OBJ_RETAIN(endpoint->conn_ep); + if (UCS_OK != ucs_status) { + BTL_VERBOSE( + ("could not create an endpoint for forming connection to remote peer. 
code = %d", + ucs_status)); + return OPAL_ERROR; } + return OPAL_SUCCESS; +} + +static int mca_btl_uct_endpoint_send_connection_data( + mca_btl_uct_module_t *uct_btl, mca_btl_uct_tl_t *conn_tl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, + mca_btl_uct_tl_endpoint_t *tl_endpoint, int request_type) +{ + mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; + ucs_status_t ucs_status; + + assert(NULL != conn_tl); + + BTL_VERBOSE(("connecting endpoint to remote endpoint")); + size_t request_length = sizeof(mca_btl_uct_conn_req_t) + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len; mca_btl_uct_conn_req_t *request = alloca(request_length); @@ -309,6 +316,7 @@ static int mca_btl_uct_endpoint_send_connection_data( request->context_id = tl_context->context_id; request->tl_index = tl->tl_index; request->type = request_type; + strncpy(request->module_name, uct_btl->md_name, sizeof(request->module_name) - 1); /* fill in connection request */ ucs_status = uct_ep_get_address(tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr); @@ -337,9 +345,9 @@ static int mca_btl_uct_endpoint_send_connection_data( } static int mca_btl_uct_endpoint_connect_endpoint( - mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, - mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, - uint8_t *tl_data, uint8_t *conn_tl_data, void *ep_addr) + mca_btl_uct_module_t *uct_btl, mca_btl_uct_tl_t *conn_tl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, + mca_btl_uct_tl_endpoint_t *tl_endpoint, uint8_t *tl_data, void *ep_addr) { ucs_status_t ucs_status; @@ -378,11 +386,44 @@ static int mca_btl_uct_endpoint_connect_endpoint( : OPAL_ERR_OUT_OF_RESOURCE; } - int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, endpoint, tl, tl_context, tl_endpoint, - conn_tl_data, /*request_type=*/!!ep_addr); + int 
rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, conn_tl, endpoint, tl, tl_context, tl_endpoint, + /*request_type=*/!!ep_addr); return (OPAL_SUCCESS == rc) ? OPAL_ERR_OUT_OF_RESOURCE : rc; } +static int mca_btl_uct_find_modex(mca_btl_uct_module_t *uct_btl, mca_btl_uct_modex_t *modex, + uint8_t **rdma_tl_data, uint8_t **am_tl_data, uint8_t **conn_tl_data) { + uint8_t *modex_data = modex->data; + + /* look for matching transport in the modex */ + for (int i = 0; i < modex->module_count; ++i) { + uint32_t modex_size = *((uint32_t *) modex_data); + + BTL_VERBOSE(("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md_name)); + + modex_data += 4; + + if (0 != strcmp((char *) modex_data, uct_btl->md_name)) { + /* modex belongs to a different module, skip it and continue */ + modex_data += modex_size - 4; + continue; + } + + modex_data += strlen((char *) modex_data) + 1; + + mca_btl_uct_process_modex(uct_btl, modex_data, rdma_tl_data, am_tl_data, conn_tl_data); + + BTL_VERBOSE(("finished processing modex for %s", uct_btl->md_name)); + + return OPAL_SUCCESS; + } + + BTL_ERROR(("could not find modex for %s", uct_btl->md_name)); + + return OPAL_ERR_NOT_FOUND; +} + + int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint, int context_id, void *ep_addr, int tl_index) { @@ -394,7 +435,6 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp = mca_btl_uct_module_get_tl_context_specific(uct_btl, tl, context_id); uint8_t *rdma_tl_data = NULL, *conn_tl_data = NULL, *am_tl_data = NULL, *tl_data; mca_btl_uct_modex_t *modex; - uint8_t *modex_data; size_t msg_size; int rc; @@ -410,19 +450,20 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp !!(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags))); opal_mutex_lock(&endpoint->ep_lock); - if (MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags) { - opal_mutex_unlock(&endpoint->ep_lock); - /* nothing 
more to do. someone else completed the connection */ - return OPAL_SUCCESS; - } - - /* dumpicate connection request. nothing to do until the endpoint data is received */ - if (NULL != tl_endpoint->uct_ep && NULL == ep_addr) { - opal_mutex_unlock(&endpoint->ep_lock); - return OPAL_ERR_OUT_OF_RESOURCE; - } do { + if (MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags) { + /* nothing more to do. someone else completed the connection */ + rc = OPAL_SUCCESS; + break; + } + + /* duplicate connection request. nothing to do until the endpoint data is received */ + if (NULL != tl_endpoint->uct_ep && NULL == ep_addr) { + rc = OPAL_ERR_OUT_OF_RESOURCE; + break; + } + /* read the modex. this is done both to start the connection and to process endpoint data */ OPAL_MODEX_RECV(rc, &mca_btl_uct_component.super.btl_version, &endpoint->ep_proc->proc_name, (void **) &modex, &msg_size); @@ -434,45 +475,50 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp BTL_VERBOSE(("received modex of size %lu for proc %s. module count %d", (unsigned long) msg_size, OPAL_NAME_PRINT(endpoint->ep_proc->proc_name), modex->module_count)); - modex_data = modex->data; - - /* look for matching transport in the modex */ - for (int i = 0; i < modex->module_count; ++i) { - uint32_t modex_size = *((uint32_t *) modex_data); - - BTL_VERBOSE( - ("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md_name)); - - modex_data += 4; - - if (0 != strcmp((char *) modex_data, uct_btl->md_name)) { - /* modex belongs to a different module, skip it and continue */ - modex_data += modex_size - 4; - continue; - } - modex_data += strlen((char *) modex_data) + 1; - - mca_btl_uct_process_modex(uct_btl, modex_data, &rdma_tl_data, &am_tl_data, - &conn_tl_data); + rc = mca_btl_uct_find_modex (uct_btl, modex, &rdma_tl_data, &am_tl_data, &conn_tl_data); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { break; } tl_data = (tl == uct_btl->rdma_tl) ? 
rdma_tl_data : am_tl_data; - if (NULL == tl_data) { - opal_mutex_unlock(&endpoint->ep_lock); - return OPAL_ERR_UNREACH; + if (OPAL_UNLIKELY(NULL == tl_data)) { + BTL_ERROR(("could not find modex data for this transport")); + rc = OPAL_ERR_UNREACH; + break; } /* connect the endpoint */ - if (!mca_btl_uct_tl_requires_connection_tl(tl)) { - rc = mca_btl_uct_endpoint_connect_iface(uct_btl, tl, tl_context, tl_endpoint, tl_data); + if (mca_btl_uct_tl_requires_connection_tl(tl)) { + mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl; + if (NULL == conn_tl) { + rc = mca_btl_uct_find_modex (mca_btl_uct_component.conn_module, modex, + /*rdma_tl_data=*/NULL, /*am_tl_data=*/NULL, + &conn_tl_data); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + BTL_ERROR(("could not find modex for connection module")); + break; + } + + BTL_VERBOSE(("using separate connection module for tl")); + conn_tl = mca_btl_uct_component.conn_module->conn_tl; + } + + if (NULL == tl_endpoint->uct_ep) { + /* allocate or retain a connection endpoint */ + rc = mca_btl_uct_endpoint_get_helper_endpoint(uct_btl, endpoint, conn_tl, + conn_tl_data); + if (OPAL_SUCCESS != rc) { + break; + } + } + + rc = mca_btl_uct_endpoint_connect_endpoint(uct_btl, conn_tl, endpoint, tl, + tl_context, tl_endpoint, tl_data, ep_addr); } else { - rc = mca_btl_uct_endpoint_connect_endpoint(uct_btl, endpoint, tl, tl_context, - tl_endpoint, tl_data, conn_tl_data, ep_addr); + rc = mca_btl_uct_endpoint_connect_iface(uct_btl, tl, tl_context, tl_endpoint, tl_data); } - } while (0); opal_mutex_unlock(&endpoint->ep_lock); diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index c1ef4c6d727..fd8061fa81c 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -553,57 +553,27 @@ static int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_ } int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, - uct_tl_resource_desc_t *tl_descs, unsigned tl_count) + 
uct_tl_resource_desc_t *tl_descs, unsigned tl_count, + bool evaluate_for_conn_only) { - bool include = true, any = false; mca_btl_uct_tl_t *tl; opal_list_t tl_list; - char **tl_filter; - int any_priority = 0; OBJ_CONSTRUCT(&tl_list, opal_list_t); - tl_filter = opal_argv_split(mca_btl_uct_component.allowed_transports, ','); - - if ('^' == tl_filter[0][0]) { - /* user has negated the include list */ - char *tmp = strdup(tl_filter[0] + 1); - - free(tl_filter[0]); - tl_filter[0] = tmp; - include = false; - } - - /* check for the any keyword */ - for (unsigned j = 0; tl_filter[j]; ++j) { - if (0 == strcmp(tl_filter[j], "any")) { - any_priority = j; - any = true; - break; - } - } - - if (any && !include) { - opal_argv_free(tl_filter); - return OPAL_ERR_NOT_AVAILABLE; - } - for (unsigned i = 0; i < tl_count; ++i) { - bool try_tl = any; - int priority = any_priority; - - for (unsigned j = 0; tl_filter[j]; ++j) { - if (0 == strcmp(tl_filter[j], tl_descs[i].tl_name)) { - try_tl = include; - priority = j; - break; + int priority = 0; + BTL_VERBOSE(("processing tl %s, evaluate_for_conn_only=%d", tl_descs[i].tl_name, evaluate_for_conn_only)); + + if (!evaluate_for_conn_only) { + priority = mca_btl_uct_include_list_rank (tl_descs[i].tl_name, &mca_btl_uct_component.allowed_transport_list); + BTL_VERBOSE(("tl filter: tl_name = %s, priority = %d", tl_descs[i].tl_name, + priority)); + if (priority < 0) { + continue; } - } - - BTL_VERBOSE(("tl filter: tl_name = %s, use = %d, priority = %d", tl_descs[i].tl_name, - try_tl, priority)); - - if (!try_tl) { + } else if (tl_descs[i].dev_type != UCT_DEVICE_TYPE_NET) { + /* only network types are suitable for forming connections */ continue; } @@ -616,12 +586,23 @@ int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, tl = mca_btl_uct_create_tl(module, md, tl_descs + i, priority); if (tl) { - opal_list_append(&tl_list, &tl->super); + if (mca_btl_uct_tl_supports_conn(tl) && evaluate_for_conn_only) { + 
BTL_VERBOSE(("evaluating tl %s for forming connections", tl_descs[i].tl_name)); + int rc = mca_btl_uct_set_tl_conn(module, tl); + OBJ_RELEASE(tl); + + if (OPAL_SUCCESS == rc) { + mca_btl_uct_context_enable_progress(tl->uct_dev_contexts[0]); + return OPAL_SUCCESS; + } + + BTL_VERBOSE(("tl %s cannot be used for forming connections", tl_descs[i].tl_name)); + } else { + opal_list_append(&tl_list, &tl->super); + } } } - opal_argv_free(tl_filter); - if (0 == opal_list_get_size(&tl_list)) { BTL_VERBOSE(("no suitable tls match filter: %s", mca_btl_uct_component.allowed_transports)); OBJ_DESTRUCT(&tl_list); @@ -670,10 +651,6 @@ int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, /* no connection tl needed for selected transports */ OBJ_RELEASE(module->conn_tl); module->conn_tl = NULL; - } else if (NULL == module->conn_tl) { - BTL_VERBOSE(("a connection tl is required but no tls match the filter %s", - mca_btl_uct_component.allowed_transports)); - return OPAL_ERROR; } return OPAL_SUCCESS; diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index b2bac61be61..7b54b02d640 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -90,6 +90,8 @@ struct mca_btl_uct_conn_req_t { /** transport index that should be connected */ int tl_index; + char module_name[16]; + /** endpoint address data */ uint8_t ep_addr[]; }; @@ -347,4 +349,17 @@ struct mca_btl_uct_pending_connection_request_t { typedef struct mca_btl_uct_pending_connection_request_t mca_btl_uct_pending_connection_request_t; OBJ_CLASS_DECLARATION(mca_btl_uct_pending_connection_request_t); +/** + * @brief parsed include/exclude list + * + */ +struct mca_btl_uct_include_list_t { + /** argv-style (NULL terminated) array of strings */ + char **list; + /** is an inclusive list (vs exclusive) */ + bool include; +}; +typedef struct mca_btl_uct_include_list_t mca_btl_uct_include_list_t; + + #endif /* !defined(BTL_UCT_TYPES_H) */