From acfc908e48961ba592b99f523e14c50188629fc1 Mon Sep 17 00:00:00 2001 From: Sergey Lebedev Date: Fri, 24 Jan 2025 14:00:37 +0000 Subject: [PATCH] TL/UCP: add radix selection to kn radix --- .../tl/ucp/allgather/allgather_knomial.c | 17 +++++++++-- .../tl/ucp/allreduce/allreduce_sra_knomial.c | 7 +++-- .../tl/ucp/bcast/bcast_sag_knomial.c | 8 +++-- src/components/tl/ucp/tl_ucp.c | 4 +-- src/components/tl/ucp/tl_ucp.h | 5 ++-- src/components/tl/ucp/tl_ucp_lib.c | 1 - src/components/tl/ucp/tl_ucp_team.c | 30 +++++++++++++------ 7 files changed, 49 insertions(+), 23 deletions(-) diff --git a/src/components/tl/ucp/allgather/allgather_knomial.c b/src/components/tl/ucp/allgather/allgather_knomial.c index 1fbcf773cc..a9f9afadbf 100644 --- a/src/components/tl/ucp/allgather/allgather_knomial.c +++ b/src/components/tl/ucp/allgather/allgather_knomial.c @@ -314,9 +314,20 @@ ucc_status_t ucc_tl_ucp_allgather_knomial_init(ucc_base_coll_args_t *coll_args, ucc_coll_task_t **task_h) { ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); - ucc_rank_t size = UCC_TL_TEAM_SIZE(tl_team); - ucc_kn_radix_t radix; + ucc_mrange_uint_t *p = &tl_team->cfg.allgather_kn_radix; + ucc_rank_t tsize = UCC_TL_TEAM_SIZE(tl_team); + size_t count = coll_args->args.dst.info.count / tsize; + ucc_datatype_t dtype = coll_args->args.dst.info.datatype; + ucc_memory_type_t mtype = coll_args->args.dst.info.mem_type; + ucc_kn_radix_t radix, cfg_radix, opt_radix; + + opt_radix = (mtype == UCC_MEMORY_TYPE_HOST) ? tl_team->opt_radix_host : + tl_team->opt_radix; + + cfg_radix = ucc_tl_ucp_get_radix_from_range(tl_team, + count * ucc_dt_size(dtype), + mtype, p, opt_radix); + radix = ucc_min(cfg_radix, tsize); - radix = ucc_min(UCC_TL_UCP_TEAM_LIB(tl_team)->cfg.allgather_kn_radix, size); return ucc_tl_ucp_allgather_knomial_init_r(coll_args, team, task_h, radix); } diff --git a/src/components/tl/ucp/allreduce/allreduce_sra_knomial.c b/src/components/tl/ucp/allreduce/allreduce_sra_knomial.c index d24eca786c..c20918bd28 100644 --- a/src/components/tl/ucp/allreduce/allreduce_sra_knomial.c +++ b/src/components/tl/ucp/allreduce/allreduce_sra_knomial.c @@ -97,7 +97,7 @@ ucc_tl_ucp_allreduce_sra_knomial_frag_init(ucc_base_coll_args_t *coll_args, ucc_schedule_t *schedule; ucc_coll_task_t *task, *rs_task; ucc_status_t status; - ucc_kn_radix_t radix, cfg_radix; + ucc_kn_radix_t radix, cfg_radix, opt_radix; size_t count; status = ucc_tl_ucp_get_schedule(tl_team, coll_args, @@ -112,10 +112,11 @@ ucc_tl_ucp_allreduce_sra_knomial_frag_init(ucc_base_coll_args_t *coll_args, count = coll_args->args.dst.info.count; } + opt_radix = (mem_type == UCC_MEMORY_TYPE_HOST) ? tl_team->opt_radix_host : + tl_team->opt_radix; cfg_radix = ucc_tl_ucp_get_radix_from_range(tl_team, count * ucc_dt_size(dtype), - mem_type, p, - tl_team->opt_radix); + mem_type, p, opt_radix); radix = ucc_knomial_pattern_get_min_radix(cfg_radix, UCC_TL_TEAM_SIZE(tl_team), count); diff --git a/src/components/tl/ucp/bcast/bcast_sag_knomial.c b/src/components/tl/ucp/bcast/bcast_sag_knomial.c index 3f4a6919f6..8301bda82e 100644 --- a/src/components/tl/ucp/bcast/bcast_sag_knomial.c +++ b/src/components/tl/ucp/bcast/bcast_sag_knomial.c @@ -82,17 +82,19 @@ ucc_tl_ucp_bcast_sag_knomial_init(ucc_base_coll_args_t *coll_args, ucc_schedule_t *schedule; ucc_coll_task_t *task, *rs_task; ucc_status_t status; - ucc_kn_radix_t radix, cfg_radix; + ucc_kn_radix_t radix, cfg_radix, opt_radix; if (UCC_COLL_ARGS_ACTIVE_SET(&coll_args->args)) { /* ActiveSets currently are only supported with KN alg */ return ucc_tl_ucp_bcast_knomial_init(coll_args, team, task_h); } + opt_radix = (mem_type == UCC_MEMORY_TYPE_HOST) ? tl_team->opt_radix_host : + tl_team->opt_radix; + cfg_radix = ucc_tl_ucp_get_radix_from_range(tl_team, count * ucc_dt_size(dtype), - mem_type, p, - tl_team->opt_radix); + mem_type, p, opt_radix); radix = ucc_knomial_pattern_get_min_radix(cfg_radix, UCC_TL_TEAM_SIZE(tl_team), count); diff --git a/src/components/tl/ucp/tl_ucp.c b/src/components/tl/ucp/tl_ucp.c index 7db99bdaf2..09af260aa7 100644 --- a/src/components/tl/ucp/tl_ucp.c +++ b/src/components/tl/ucp/tl_ucp.c @@ -136,9 +136,9 @@ ucc_config_field_t ucc_tl_ucp_lib_config_table[] = { ucc_offsetof(ucc_tl_ucp_lib_config_t, reduce_scatter_kn_radix), UCC_CONFIG_TYPE_UINT}, - {"ALLGATHER_KN_RADIX", "4", "Radix of the knomial allgather algorithm", + {"ALLGATHER_KN_RADIX", "auto", "Radix of the knomial allgather algorithm", ucc_offsetof(ucc_tl_ucp_lib_config_t, allgather_kn_radix), - UCC_CONFIG_TYPE_UINT}, + UCC_CONFIG_TYPE_UINT_RANGED}, {"BCAST_KN_RADIX", "4", "Radix of the recursive-knomial bcast algorithm", ucc_offsetof(ucc_tl_ucp_lib_config_t, bcast_kn_radix), diff --git a/src/components/tl/ucp/tl_ucp.h b/src/components/tl/ucp/tl_ucp.h index 3c439f4ae5..897df2c0b9 100644 --- a/src/components/tl/ucp/tl_ucp.h +++ b/src/components/tl/ucp/tl_ucp.h @@ -54,7 +54,7 @@ typedef struct ucc_tl_ucp_lib_config { ucc_mrange_uint_t allreduce_kn_radix; ucc_mrange_uint_t allreduce_sra_kn_radix; uint32_t reduce_scatter_kn_radix; - uint32_t allgather_kn_radix; + ucc_mrange_uint_t allgather_kn_radix; uint32_t bcast_kn_radix; ucc_mrange_uint_t bcast_sag_kn_radix; uint32_t reduce_kn_radix; @@ -145,7 +145,8 @@ typedef struct ucc_tl_ucp_team { const char * tuning_str; ucc_topo_t *topo; ucc_ep_map_t ctx_map; - ucc_rank_t opt_radix; + ucc_rank_t opt_radix; /* generic opt radix */ + ucc_rank_t opt_radix_host; /* host specific opt radix */ } ucc_tl_ucp_team_t; UCC_CLASS_DECLARE(ucc_tl_ucp_team_t, ucc_base_context_t *, const ucc_base_team_params_t *); diff --git a/src/components/tl/ucp/tl_ucp_lib.c b/src/components/tl/ucp/tl_ucp_lib.c index 9a724abff9..cec14a767b 100644 --- a/src/components/tl/ucp/tl_ucp_lib.c +++ b/src/components/tl/ucp/tl_ucp_lib.c @@ -30,7 +30,6 @@ UCC_CLASS_INIT_FUNC(ucc_tl_ucp_lib_t, const ucc_base_lib_params_t *params, if (tl_ucp_config->kn_radix > 0) { self->cfg.barrier_kn_radix = tl_ucp_config->kn_radix; self->cfg.reduce_scatter_kn_radix = tl_ucp_config->kn_radix; - self->cfg.allgather_kn_radix = tl_ucp_config->kn_radix; self->cfg.bcast_kn_radix = tl_ucp_config->kn_radix; self->cfg.reduce_kn_radix = tl_ucp_config->kn_radix; self->cfg.scatter_kn_radix = tl_ucp_config->kn_radix; diff --git a/src/components/tl/ucp/tl_ucp_team.c b/src/components/tl/ucp/tl_ucp_team.c index b6db1654e4..93206a1fe4 100644 --- a/src/components/tl/ucp/tl_ucp_team.c +++ b/src/components/tl/ucp/tl_ucp_team.c @@ -44,10 +44,10 @@ static inline ucc_status_t ucc_tl_ucp_get_topo(ucc_tl_ucp_team_t *team) UCC_CLASS_INIT_FUNC(ucc_tl_ucp_team_t, ucc_base_context_t *tl_context, const ucc_base_team_params_t *params) { - ucc_tl_ucp_context_t *ctx = - ucc_derived_of(tl_context, ucc_tl_ucp_context_t); + ucc_tl_ucp_context_t *ctx = ucc_derived_of(tl_context, + ucc_tl_ucp_context_t); ucc_kn_radix_t max_radix, min_radix; - ucc_rank_t tsize; + ucc_rank_t tsize, max_ppn; ucc_status_t status; UCC_CLASS_CALL_SUPER_INIT(ucc_tl_team_t, &ctx->super, params); @@ -59,6 +59,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_ucp_team_t, ucc_base_context_t *tl_context, self->tuning_str = ""; self->topo = NULL; self->opt_radix = UCC_UUNITS_AUTO_RADIX; + self->opt_radix_host = UCC_UUNITS_AUTO_RADIX; status = ucc_config_clone_table(&UCC_TL_UCP_TEAM_LIB(self)->cfg, &self->cfg, ucc_tl_ucp_lib_config_table); @@ -91,14 +92,25 @@ UCC_CLASS_INIT_FUNC(ucc_tl_ucp_team_t, ucc_base_context_t *tl_context, self->cfg.use_reordering = 0; } - if (self->topo && !UCC_TL_IS_SERVICE_TEAM(self) && - self->topo->topo->sock_bound) { + if (self->topo && !UCC_TL_IS_SERVICE_TEAM(self)) { tsize = UCC_TL_TEAM_SIZE(self); - max_radix = (ucc_topo_max_ppn(self->topo) == 1) ? tsize : - ucc_min(tsize, ucc_topo_min_socket_size(self->topo)); - min_radix = ucc_min(tsize, ucc_topo_max_ppn(self->topo) == 1 ? 3: 2); + max_ppn = ucc_topo_max_ppn(self->topo); + + min_radix = ucc_min(tsize, 3); + max_radix = tsize; self->opt_radix = ucc_kn_get_opt_radix(tsize, min_radix, max_radix); - tl_debug(tl_context->lib, "opt knomial radix: %d", self->opt_radix); + if (max_ppn == 1) { + self->opt_radix_host = self->opt_radix; + } else { + if (self->topo->topo->sock_bound) { + min_radix = 2; + max_radix = ucc_min(tsize, ucc_topo_min_socket_size(self->topo)); + self->opt_radix_host = ucc_kn_get_opt_radix(tsize, min_radix, + max_radix); + } + } + tl_debug(tl_context->lib, "opt knomial radix: general %d host %d", + self->opt_radix, self->opt_radix_host); } tl_debug(tl_context->lib, "posted tl team: %p", self);