From d32cce0ec041431cb610c5622f475b4382cae4f6 Mon Sep 17 00:00:00 2001 From: Anatoly Vildemanov Date: Wed, 19 May 2021 12:51:50 +0300 Subject: [PATCH 01/19] AZP/CI: Auto-rebase on master branch + Fix name of branch + Trigger for push and pr + Clean workspace before use --- buildlib/azure-pipelines-int4.yml | 47 +++++++++++++++++++++++++++++++ buildlib/azure-pipelines-pr.yml | 9 ++++-- 2 files changed, 53 insertions(+), 3 deletions(-) create mode 100644 buildlib/azure-pipelines-int4.yml diff --git a/buildlib/azure-pipelines-int4.yml b/buildlib/azure-pipelines-int4.yml new file mode 100644 index 00000000000..c17b4985d9b --- /dev/null +++ b/buildlib/azure-pipelines-int4.yml @@ -0,0 +1,47 @@ +# See https://aka.ms/yaml + +trigger: none +pr: none + +resources: + pipelines: + - pipeline: myTest + source: 'UCX snapshot' + trigger: true + +stages: + - stage: Rebase + jobs: + - job: rebase + pool: + name: MLNX + demands: + - ucx_docker -equals yes + displayName: rebase on openucx/ucx@master + steps: + - checkout: self + clean: true + fetchDepth: 200 + - bash: | + set -eEx + source buildlib/az-helpers.sh + # Checkout integration4 branch from Mellanox/ucx + git remote set-url origin git@github.com:Mellanox/ucx.git + git fetch origin integration4 + git checkout integration4 + # Checkout master branch from openucx/ucx + git remote add upstream https://github.com/openucx/ucx.git + git fetch upstream master + git log --oneline -10 upstream/master + # Rebase integration4 branch on master branch + if ! git rebase upstream/master + then + # Automatic rebase failed - show merge conflicts + git status + git diff + head=$(git rev-parse --short HEAD) + azure_log_issue "Rebase on ${head} failed, see https://github.com/Mellanox/ucx/wiki/Manual-rebase-of-integration4-branch for details" + else + # Automatic rebase was successful - update the branch + git push origin HEAD --force + fi diff --git a/buildlib/azure-pipelines-pr.yml b/buildlib/azure-pipelines-pr.yml index 51dd8d7e82f..4d1fefa37ab 100644 --- a/buildlib/azure-pipelines-pr.yml +++ b/buildlib/azure-pipelines-pr.yml @@ -1,12 +1,15 @@ # See https://aka.ms/yaml # This pipeline to be run on PRs -trigger: none +trigger: + batch: true + branches: + include: + - integration4 pr: branches: include: - - master - - v*.*.x + - integration4 paths: exclude: - .gitignore From c98b2c0bbb64e8e5508bd78d44c6835f51232744 Mon Sep 17 00:00:00 2001 From: Evgeny Leksikov Date: Tue, 10 Aug 2021 16:52:51 +0300 Subject: [PATCH 02/19] UCT/MLX5: disable HW TM --- src/uct/ib/rc/accel/rc_mlx5_iface.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/uct/ib/rc/accel/rc_mlx5_iface.c b/src/uct/ib/rc/accel/rc_mlx5_iface.c index a6e11e87b3e..bc02aaf2215 100644 --- a/src/uct/ib/rc/accel/rc_mlx5_iface.c +++ b/src/uct/ib/rc/accel/rc_mlx5_iface.c @@ -469,8 +469,9 @@ static ucs_status_t uct_rc_mlx5_iface_preinit(uct_rc_mlx5_iface_common_t *iface, UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_CB | UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_CB); + /* NOTE: always disable for wire compatibility */ iface->tm.enabled = mlx5_config->tm.enable && tm_params && - (init_attr->flags & UCT_IB_TM_SUPPORTED); + (init_attr->flags & UCT_IB_TM_SUPPORTED) && 0; if (!iface->tm.enabled) { goto out_tm_disabled; } From ac9d2d39406d1d4e9bc616f691531d5d32aea061 Mon Sep 17 00:00:00 2001 From: Evgeny Leksikov Date: Tue, 3 Aug 2021 16:37:06 +0300 Subject: [PATCH 03/19] UCP/TAG: add eager compatibility --- src/ucp/core/ucp_ep.c | 113 ++++++++++++++++++++++++++++++++++++- src/ucp/core/ucp_request.h | 1 + 
src/ucp/core/ucp_worker.c | 5 +- src/ucp/proto/proto_am.inl | 12 ---- src/ucp/rndv/rndv.h | 3 + src/ucp/rndv/rndv.inl | 3 +- src/ucp/tag/eager.h | 37 ++++++++++++ src/ucp/tag/eager_multi.c | 2 + src/ucp/tag/eager_rcv.c | 26 +++++++-- src/ucp/tag/eager_single.c | 19 ++++--- src/ucp/tag/eager_snd.c | 22 ++++++-- src/ucp/tag/tag_match.c | 3 +- src/ucp/tag/tag_match.h | 2 +- src/ucp/tag/tag_match.inl | 7 ++- src/ucp/tag/tag_recv.c | 8 ++- src/ucp/tag/tag_rndv.c | 72 ++++++++++++++++++++++- src/ucp/tag/tag_send.c | 5 +- 17 files changed, 294 insertions(+), 46 deletions(-) diff --git a/src/ucp/core/ucp_ep.c b/src/ucp/core/ucp_ep.c index 2e42acc1b98..f36bbc48f75 100644 --- a/src/ucp/core/ucp_ep.c +++ b/src/ucp/core/ucp_ep.c @@ -21,8 +21,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -333,6 +335,9 @@ ucs_status_t ucp_ep_create_base(ucp_worker_h worker, unsigned ep_init_flags, goto err_ep_deallocate; } + ucs_debug("ep %p: local id %"PRIuPTR, ep, + ucp_ep_ext_control(ep)->local_ep_id); + ucp_ep_flush_state_reset(ep); /* Create endpoint VFS node on demand to avoid memory bloat */ @@ -494,6 +499,9 @@ void ucp_ep_release_id(ucp_ep_h ep) { ucs_status_t status; + ucs_debug("ep %p: released id %"PRIuPTR, ep, + ucp_ep_ext_control(ep)->local_ep_id); + /* Don't use ucp_ep_local_id() function here to avoid assertion failure, * because local_ep_id can be set to @ref UCS_PTR_MAP_KEY_INVALID */ status = UCS_PTR_MAP_DEL(ep, &ep->worker->ep_map, @@ -1232,6 +1240,7 @@ static void ucp_ep_set_lanes_failed(ucp_ep_h ep, uct_ep_h *uct_eps) uct_ep_h uct_ep; ucp_ep_check_lanes(ep); + /* release id on failed EP to drop data by invalid ID */ ucp_ep_release_id(ep); ucp_ep_update_flags(ep, UCP_EP_FLAG_FAILED, UCP_EP_FLAG_LOCAL_CONNECTED); @@ -1341,11 +1350,93 @@ static void ucp_ep_discard_lanes(ucp_ep_h ep, ucs_status_t discard_status) ucp_ep_discard_lanes_callback(NULL, UCS_OK, discard_arg); } +static void ucp_tm_ep_cleanup(ucp_ep_h ep, ucs_ptr_map_key_t ep_id, + ucs_status_t status) +{ + ucp_tag_match_t *tm = &ep->worker->tm; + const ucp_rndv_rts_hdr_t *rndv_rts_hdr; + const ucp_eager_middle_hdr_t *eager_mid_hdr; + const ucp_eager_hdr_t *eager_hdr; + ucp_recv_desc_t *rdesc, *tmp; + ucp_tag_frag_match_t *matchq; + ucp_request_t *rreq; + uint64_t msg_id; + khiter_t iter; + ucs_debug("cleanup ep %p", ep); + + if (!(ep_id & UCS_PTR_MAP_KEY_INDIRECT_FLAG)) { + return; + } + + if (ep_id == UCS_PTR_MAP_KEY_INVALID) { + ucs_assert(ep->flags & UCP_EP_FLAG_FAILED); + return; + } + + /* remove from unexpected queue */ + ucs_list_for_each_safe(rdesc, tmp, &tm->unexpected.all, + tag_list[UCP_RDESC_ALL_LIST]) { + if (rdesc->flags & UCP_RECV_DESC_FLAG_RNDV) { + rndv_rts_hdr = (const void*)(rdesc + 1); + if (rndv_rts_hdr->sreq.ep_id != ep_id) { + /* rndv not matched */ + continue; + } + } else { + eager_hdr = (const void*)(rdesc + 1); + if (eager_hdr->ep_id != ep_id) { + /* eager not matched */ + continue; + } + } + + ucs_debug("ep %p: ep_id %"PRIuPTR" releasing unexpected rdesc %p", ep, + ep_id, rdesc); + ucp_tag_unexp_remove(rdesc); + ucp_recv_desc_release(rdesc); + } + + /* remove from fragments hash */ + kh_foreach_key(&tm->frag_hash, msg_id, { + iter = kh_get(ucp_tag_frag_hash, &tm->frag_hash, msg_id); + matchq = &kh_val(&tm->frag_hash, iter); + if (!ucp_tag_frag_match_is_unexp(matchq)) { + /* remove receive request from expected hash */ + rreq = matchq->exp_req; + if (rreq->recv.tag.ep_id != ep_id) { + continue; + } + + ucs_debug("ep %p: ep_id %"PRIuPTR" completing expected 
receive request %p with status %s", + ep, ep_id, rreq, ucs_status_string(status)); + ucp_request_complete_tag_recv(rreq, status); + } else { + /* remove receive fragments from unexpected matchq */ + rdesc = ucs_queue_head_elem_non_empty(&matchq->unexp_q, ucp_recv_desc_t, + tag_frag_queue); + ucs_assert(!(rdesc->flags & UCP_RECV_DESC_FLAG_RNDV)); + eager_mid_hdr = (void*)(rdesc + 1); + if (eager_mid_hdr->ep_id != ep_id) { + continue; + } + + ucs_queue_for_each_extract(rdesc, &matchq->unexp_q, tag_frag_queue, 1) { + ucs_debug("ep %p: ep_id %"PRIuPTR" releasing unexpected rdesc %p", + ep, ep_id, rdesc); + ucp_recv_desc_release(rdesc); + } + } + + kh_del(ucp_tag_frag_hash, &tm->frag_hash, iter); + }); +} + ucs_status_t ucp_ep_set_failed(ucp_ep_h ucp_ep, ucp_lane_index_t lane, ucs_status_t status) { UCS_STRING_BUFFER_ONSTACK(lane_info_strb, 64); ucp_ep_ext_control_t *ep_ext_control = ucp_ep_ext_control(ucp_ep); + ucs_ptr_map_key_t ep_id; ucp_err_handling_mode_t err_mode; ucs_log_level_t log_level; ucp_request_t *close_req; @@ -1370,9 +1461,14 @@ ucp_ep_set_failed(ucp_ep_h ucp_ep, ucp_lane_index_t lane, ucs_status_t status) ++ucp_ep->worker->counters.ep_failures; + /* Store local_ep_id because @ref ucp_ep_discard_lanes invalidates it, + * invalidated local ID is used to drop data on failed EP */ + ep_id = ucp_ep_local_id(ucp_ep); + /* The EP can be closed from last completion callback */ ucp_ep_discard_lanes(ucp_ep, status); ucp_stream_ep_cleanup(ucp_ep, status); + ucp_tm_ep_cleanup(ucp_ep, ep_id, status); if (ucp_ep->flags & UCP_EP_FLAG_USED) { if (ucp_ep->flags & UCP_EP_FLAG_CLOSED) { @@ -1484,6 +1580,8 @@ void ucp_ep_disconnected(ucp_ep_h ep, int force) ucp_stream_ep_cleanup(ep, UCS_ERR_CANCELED); ucp_am_ep_cleanup(ep); + ucp_tm_ep_cleanup(ep, ucp_ep_ext_control(ep)->local_ep_id, + UCS_ERR_CANCELED); ucp_ep_update_flags(ep, 0, UCP_EP_FLAG_USED); @@ -1616,6 +1714,8 @@ ucs_status_ptr_t ucp_ep_close_nbx(ucp_ep_h ep, const ucp_request_param_t *param) ucp_ep_update_flags(ep, UCP_EP_FLAG_CLOSED, 0); if (ucp_request_param_flags(param) & UCP_EP_CLOSE_FLAG_FORCE) { + ucp_tm_ep_cleanup(ep, ucp_ep_ext_control(ep)->local_ep_id, + UCS_ERR_CANCELED); ucp_ep_discard_lanes(ep, UCS_ERR_CANCELED); ucp_ep_disconnected(ep, 1); } else { @@ -2332,7 +2432,7 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, size_t max_rndv_thresh, max_am_rndv_thresh; size_t min_rndv_thresh, min_am_rndv_thresh; size_t rma_zcopy_thresh; - size_t am_max_eager_short; + ssize_t am_max_eager_short; double get_zcopy_max_bw[UCS_MEMORY_TYPE_LAST]; double put_zcopy_max_bw[UCS_MEMORY_TYPE_LAST]; ucs_status_t status; @@ -2563,6 +2663,9 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, ucp_ep_config_set_memtype_thresh(&config->tag.offload.max_eager_short, config->tag.eager.max_short, context->num_mem_type_detect_mds); + /* TAG offload is disabled for compatibility reasons */ + ucs_assert(config->tag.offload.max_eager_short.memtype_on < 0); + ucs_assert(config->tag.offload.max_eager_short.memtype_off < 0); } } @@ -2635,7 +2738,13 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, /* TODO: set threshold level based on all available lanes */ config->tag.eager = config->am; - config->tag.eager.max_short = am_max_eager_short; + /* short_iov which is used for compatibility path does not have + * header, payload can be 8 bytes larger but we add ep_id to header, + * so 8 - 16 = -8 bytes*/ + config->tag.eager.max_short = (am_max_eager_short < + 
(ssize_t)sizeof(uint64_t)) ? -1 : + (am_max_eager_short - + sizeof(uint64_t)); config->tag.lane = lane; config->tag.rndv.am_thresh = config->rndv.am_thresh; config->tag.rndv.rma_thresh = config->rndv.rma_thresh; diff --git a/src/ucp/core/ucp_request.h b/src/ucp/core/ucp_request.h index 7d779d9feb8..34fab9dbca8 100644 --- a/src/ucp/core/ucp_request.h +++ b/src/ucp/core/ucp_request.h @@ -407,6 +407,7 @@ struct ucp_request { ucp_tag_t tag; /* Expected tag */ ucp_tag_t tag_mask; /* Expected tag mask */ uint64_t sn; /* Tag match sequence */ + ucs_ptr_map_key_t ep_id; /* Endpoint local id */ ucp_tag_recv_nbx_callback_t cb; /* Completion callback */ ucp_tag_recv_info_t info; /* Completion info to fill */ diff --git a/src/ucp/core/ucp_worker.c b/src/ucp/core/ucp_worker.c index 8943fbc5be1..3258084d578 100644 --- a/src/ucp/core/ucp_worker.c +++ b/src/ucp/core/ucp_worker.c @@ -1229,8 +1229,11 @@ ucs_status_t ucp_worker_iface_open(ucp_worker_h worker, ucp_rsc_index_t tl_id, */ UCS_STATIC_ASSERT(UCP_WORKER_HEADROOM_PRIV_SIZE >= sizeof(ucp_eager_sync_hdr_t)); + + /* HW TM is disabled due to compatibility reason */ UCS_STATIC_ASSERT(UCP_WORKER_HEADROOM_PRIV_SIZE >= - sizeof(ucp_offload_first_desc_t)); + (sizeof(ucp_offload_first_desc_t) - + sizeof(uint64_t) /* ep_id */)); /* Fill rest of uct_iface params (caller should fill specific mode fields) */ iface_params->field_mask |= UCT_IFACE_PARAM_FIELD_STATS_ROOT | diff --git a/src/ucp/proto/proto_am.inl b/src/ucp/proto/proto_am.inl index 74be6e9d6e4..26ed42f13d3 100644 --- a/src/ucp/proto/proto_am.inl +++ b/src/ucp/proto/proto_am.inl @@ -24,18 +24,6 @@ typedef void (*ucp_req_complete_func_t)(ucp_request_t *req, ucs_status_t status); -static UCS_F_ALWAYS_INLINE void -ucp_add_uct_iov_elem(uct_iov_t *iov, void *buffer, size_t length, - uct_mem_h memh, size_t *iov_cnt) -{ - iov[*iov_cnt].buffer = buffer; - iov[*iov_cnt].length = length; - iov[*iov_cnt].count = 1; - iov[*iov_cnt].stride = 0; - iov[*iov_cnt].memh = memh; - ++(*iov_cnt); -} - static UCS_F_ALWAYS_INLINE ucs_status_t ucp_do_am_bcopy_single(uct_pending_req_t *self, uint8_t am_id, uct_pack_callback_t pack_cb) diff --git a/src/ucp/rndv/rndv.h b/src/ucp/rndv/rndv.h index ccc55ee0ea9..b44a63a7090 100644 --- a/src/ucp/rndv/rndv.h +++ b/src/ucp/rndv/rndv.h @@ -16,6 +16,9 @@ typedef enum { /* RNDV TAG operation with status UCS_OK (kept for wire compatibility with * the previous UCP versions) */ UCP_RNDV_RTS_TAG_OK = UCS_OK, + /* RNDV TAG operation with status UCS_ERR_CANCELED (kept for wire + * compatibility with the previous UCP versions) */ + UCP_RNDV_RTS_TAG_CANCELED = (uint8_t)UCS_ERR_CANCELED, /* RNDV AM operation */ UCP_RNDV_RTS_AM = 1 } UCS_S_PACKED ucp_rndv_rts_opcode_t; diff --git a/src/ucp/rndv/rndv.inl b/src/ucp/rndv/rndv.inl index e3f4f6e198f..1588e96e658 100644 --- a/src/ucp/rndv/rndv.inl +++ b/src/ucp/rndv/rndv.inl @@ -20,7 +20,8 @@ ucp_rndv_rts_is_am(const ucp_rndv_rts_hdr_t *rts_hdr) static UCS_F_ALWAYS_INLINE int ucp_rndv_rts_is_tag(const ucp_rndv_rts_hdr_t *rts_hdr) { - return rts_hdr->opcode == UCP_RNDV_RTS_TAG_OK; + return (rts_hdr->opcode == UCP_RNDV_RTS_TAG_OK) || + (rts_hdr->opcode == UCP_RNDV_RTS_TAG_CANCELED); } static UCS_F_ALWAYS_INLINE void diff --git a/src/ucp/tag/eager.h b/src/ucp/tag/eager.h index 61267073149..9c38ea7f0ac 100644 --- a/src/ucp/tag/eager.h +++ b/src/ucp/tag/eager.h @@ -27,6 +27,7 @@ */ typedef struct { ucp_tag_hdr_t super; + uint64_t ep_id; } UCS_S_PACKED ucp_eager_hdr_t; @@ -45,6 +46,7 @@ typedef struct { */ typedef struct { uint64_t msg_id; + uint64_t ep_id; 
size_t offset; } UCS_S_PACKED ucp_eager_middle_hdr_t; @@ -95,4 +97,39 @@ ucp_proto_eager_check_op_id(const ucp_proto_init_params_t *init_params, ucp_ep_config_key_has_tag_lane(init_params->ep_config_key)); } +static UCS_F_ALWAYS_INLINE void +ucp_add_uct_iov_elem(uct_iov_t *iov, void *buffer, size_t length, + uct_mem_h memh, size_t *iov_cnt) +{ + iov[*iov_cnt].buffer = buffer; + iov[*iov_cnt].length = length; + iov[*iov_cnt].count = 1; + iov[*iov_cnt].stride = 0; + iov[*iov_cnt].memh = memh; + ++(*iov_cnt); +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_tag_send_am_short_iov(uct_ep_h ep, ucs_ptr_map_key_t remote_id, + const void *buffer, size_t length, ucp_tag_t tag) +{ + size_t iov_cnt = 0ul; + ucp_eager_hdr_t hdr = { .super.tag = tag, + .ep_id = remote_id }; + uct_iov_t iov[2]; + + ucp_add_uct_iov_elem(iov, &hdr, sizeof(hdr), UCT_MEM_HANDLE_NULL, &iov_cnt); + ucp_add_uct_iov_elem(iov, (void*)buffer, length, UCT_MEM_HANDLE_NULL, + &iov_cnt); + return uct_ep_am_short_iov(ep, UCP_AM_ID_EAGER_ONLY, iov, iov_cnt); +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_ep_tag_send_am_short_iov(ucp_ep_h ep, const void *buffer, size_t length, + ucp_tag_t tag) +{ + return ucp_tag_send_am_short_iov(ucp_ep_get_am_uct_ep(ep), + ucp_ep_remote_id(ep), buffer, length, tag); +} + #endif diff --git a/src/ucp/tag/eager_multi.c b/src/ucp/tag/eager_multi.c index 44088cb4a68..99506d52f9e 100644 --- a/src/ucp/tag/eager_multi.c +++ b/src/ucp/tag/eager_multi.c @@ -18,6 +18,7 @@ static UCS_F_ALWAYS_INLINE void ucp_proto_eager_set_first_hdr(ucp_request_t *req, ucp_eager_first_hdr_t *hdr) { hdr->super.super.tag = req->send.msg_proto.tag; + hdr->super.ep_id = ucp_send_request_get_ep_remote_id(req); hdr->total_len = req->send.state.dt_iter.length; hdr->msg_id = req->send.msg_proto.message_id; } @@ -26,6 +27,7 @@ static UCS_F_ALWAYS_INLINE void ucp_proto_eager_set_middle_hdr(ucp_request_t *req, ucp_eager_middle_hdr_t *hdr) { hdr->msg_id = req->send.msg_proto.message_id; + hdr->ep_id = ucp_send_request_get_ep_remote_id(req); hdr->offset = req->send.state.dt_iter.offset; } diff --git a/src/ucp/tag/eager_rcv.c b/src/ucp/tag/eager_rcv.c index 99b1e148bac..468db9b4339 100644 --- a/src/ucp/tag/eager_rcv.c +++ b/src/ucp/tag/eager_rcv.c @@ -65,7 +65,8 @@ ucp_eager_offload_handler(void *arg, void *data, size_t length, if (!UCS_STATUS_IS_ERR(status)) { rdesc_hdr = (ucp_tag_t*)(rdesc + 1); *rdesc_hdr = recv_tag; - ucp_tag_unexp_recv(&worker->tm, rdesc, recv_tag); + ucp_tag_unexp_recv(&worker->tm, rdesc, recv_tag, + UCS_PTR_MAP_KEY_INVALID); } } @@ -89,6 +90,8 @@ ucp_eager_tagged_handler(void *arg, void *data, size_t length, unsigned am_flags ucp_recv_desc_t *rdesc; ucp_request_t *req; ucs_status_t status; + ucp_ep_h ep UCS_V_UNUSED; + req = ucp_tag_exp_search(&worker->tm, recv_tag); if (req != NULL) { @@ -117,17 +120,24 @@ ucp_eager_tagged_handler(void *arg, void *data, size_t length, unsigned am_flags 0); if (status == UCS_INPROGRESS) { ucp_tag_frag_list_process_queue( - &worker->tm, req, eagerf_hdr->msg_id + &worker->tm, req, eagerf_hdr->msg_id, + eagerf_hdr->super.ep_id UCS_STATS_ARG(UCP_WORKER_STAT_TAG_RX_EAGER_CHUNK_EXP)); } } status = UCS_OK; } else { + /* check UCS_PTR_MAP_KEY_INVALID to pass CI */ + if (ucs_likely(eager_hdr->ep_id != UCS_PTR_MAP_KEY_INVALID)) { + UCP_WORKER_GET_EP_BY_ID(&ep, worker, eager_hdr->ep_id, return UCS_OK, + "eager"); + } + status = ucp_recv_desc_init(worker, data, length, 0, am_flags, hdr_len, flags, priv_length, 1, name, &rdesc); if (!UCS_STATUS_IS_ERR(status)) { - 
ucp_tag_unexp_recv(&worker->tm, rdesc, eager_hdr->super.tag); + ucp_tag_unexp_recv(&worker->tm, rdesc, recv_tag, eager_hdr->ep_id); } } @@ -163,6 +173,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_eager_middle_handler, ucp_worker_h worker = arg; ucp_eager_middle_hdr_t *hdr = data; ucp_recv_desc_t *rdesc = NULL; + ucp_ep_h ep UCS_V_UNUSED; ucp_tag_frag_match_t *matchq; ucp_request_t *req; ucs_status_t status; @@ -170,6 +181,12 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_eager_middle_handler, khiter_t iter; int ret; + /* check UCS_PTR_MAP_KEY_INVALID to pass CI */ + if (ucs_likely(hdr->ep_id != UCS_PTR_MAP_KEY_INVALID)) { + UCP_WORKER_GET_VALID_EP_BY_ID(&ep, worker, hdr->ep_id, return UCS_OK, + "eager_middle"); + } + iter = kh_put(ucp_tag_frag_hash, &worker->tm.frag_hash, hdr->msg_id, &ret); ucs_assert(ret >= 0); matchq = &kh_value(&worker->tm.frag_hash, iter); @@ -347,7 +364,7 @@ ucp_tag_offload_eager_first_handler(ucp_worker_h worker, void *data, ucp_request_recv_offload_data(req, data, length, flags); } else { ucp_tag_frag_match_init_unexp(matchq); - ucp_tag_unexp_recv(&worker->tm, rdesc, stag); + ucp_tag_unexp_recv(&worker->tm, rdesc, stag, UCS_PTR_MAP_KEY_INVALID); } return status; @@ -476,6 +493,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_tag_offload_unexp_eager, priv->req.req_id = UCS_PTR_MAP_KEY_INVALID; priv->req.ep_id = imm; priv->super.super.tag = stag; + priv->super.ep_id = UCS_PTR_MAP_KEY_INVALID; return ucp_eager_tagged_handler(worker, priv, length + priv_len, tl_flags, flags, priv_len, priv_len, "tag_offload_unexp_eager_sync"); diff --git a/src/ucp/tag/eager_single.c b/src/ucp/tag/eager_single.c index a64dacf9117..569be8a0aea 100644 --- a/src/ucp/tag/eager_single.c +++ b/src/ucp/tag/eager_single.c @@ -26,10 +26,11 @@ static ucs_status_t ucp_eager_short_progress(uct_pending_req_t *self) const ucp_proto_single_priv_t *spriv = req->send.proto_config->priv; ucs_status_t status; - status = uct_ep_am_short(req->send.ep->uct_eps[spriv->super.lane], - UCP_AM_ID_EAGER_ONLY, req->send.msg_proto.tag, - req->send.state.dt_iter.type.contig.buffer, - req->send.state.dt_iter.length); + status = ucp_tag_send_am_short_iov(req->send.ep->uct_eps[spriv->super.lane], + ucp_send_request_get_ep_remote_id(req), + req->send.state.dt_iter.type.contig.buffer, + req->send.state.dt_iter.length, + req->send.msg_proto.tag); if (ucs_unlikely(status == UCS_ERR_NO_RESOURCE)) { req->send.lane = spriv->super.lane; /* for pending add */ return status; @@ -59,7 +60,7 @@ ucp_proto_eager_short_init(const ucp_proto_init_params_t *init_params) .super.min_frag_offs = UCP_PROTO_COMMON_OFFSET_INVALID, .super.max_frag_offs = ucs_offsetof(uct_iface_attr_t, cap.am.max_short), .super.max_iov_offs = UCP_PROTO_COMMON_OFFSET_INVALID, - .super.hdr_size = sizeof(ucp_tag_hdr_t), + .super.hdr_size = sizeof(ucp_eager_hdr_t), .super.send_op = UCT_EP_OP_AM_SHORT, .super.memtype_op = UCT_EP_OP_LAST, .super.flags = UCP_PROTO_COMMON_INIT_FLAG_SINGLE_FRAG, @@ -95,6 +96,7 @@ static size_t ucp_eager_single_pack(void *dest, void *arg) ucs_assert(req->send.state.dt_iter.offset == 0); hdr->super.tag = req->send.msg_proto.tag; + hdr->ep_id = ucp_send_request_get_ep_remote_id(req); packed_size = ucp_datatype_iter_next_pack(&req->send.state.dt_iter, req->send.ep->worker, SIZE_MAX, &next_iter, hdr + 1); @@ -128,7 +130,7 @@ ucp_proto_eager_bcopy_single_init(const ucp_proto_init_params_t *init_params) .super.min_frag_offs = UCP_PROTO_COMMON_OFFSET_INVALID, .super.max_frag_offs = ucs_offsetof(uct_iface_attr_t, cap.am.max_bcopy), .super.max_iov_offs = 
UCP_PROTO_COMMON_OFFSET_INVALID, - .super.hdr_size = sizeof(ucp_tag_hdr_t), + .super.hdr_size = sizeof(ucp_eager_hdr_t), .super.send_op = UCT_EP_OP_AM_BCOPY, .super.memtype_op = UCT_EP_OP_GET_SHORT, .super.flags = UCP_PROTO_COMMON_INIT_FLAG_SINGLE_FRAG, @@ -170,7 +172,7 @@ ucp_proto_eager_zcopy_single_init(const ucp_proto_init_params_t *init_params) .super.min_frag_offs = ucs_offsetof(uct_iface_attr_t, cap.am.min_zcopy), .super.max_frag_offs = ucs_offsetof(uct_iface_attr_t, cap.am.max_zcopy), .super.max_iov_offs = ucs_offsetof(uct_iface_attr_t, cap.am.max_iov), - .super.hdr_size = sizeof(ucp_tag_hdr_t), + .super.hdr_size = sizeof(ucp_eager_hdr_t), .super.send_op = UCT_EP_OP_AM_ZCOPY, .super.memtype_op = UCT_EP_OP_LAST, .super.flags = UCP_PROTO_COMMON_INIT_FLAG_SEND_ZCOPY | @@ -194,7 +196,8 @@ ucp_proto_eager_zcopy_send_func(ucp_request_t *req, uct_iov_t *iov) { ucp_eager_hdr_t hdr = { - .super.tag = req->send.msg_proto.tag + .super.tag = req->send.msg_proto.tag, + .ep_id = ucp_send_request_get_ep_remote_id(req) }; return uct_ep_am_zcopy(req->send.ep->uct_eps[spriv->super.lane], diff --git a/src/ucp/tag/eager_snd.c b/src/ucp/tag/eager_snd.c index 2f0c26c90d5..28ed5ca57a6 100644 --- a/src/ucp/tag/eager_snd.c +++ b/src/ucp/tag/eager_snd.c @@ -40,6 +40,7 @@ static size_t ucp_tag_pack_eager_only_dt(void *dest, void *arg) ucp_request_t *req = arg; hdr->super.tag = req->send.msg_proto.tag; + hdr->ep_id = ucp_send_request_get_ep_remote_id(req); return ucp_tag_pack_eager_common(req, hdr + 1, req->send.length, sizeof(*hdr), 1); @@ -51,6 +52,7 @@ static size_t ucp_tag_pack_eager_sync_only_dt(void *dest, void *arg) ucp_request_t *req = arg; hdr->super.super.tag = req->send.msg_proto.tag; + hdr->super.ep_id = hdr->req.ep_id = ucp_send_request_get_ep_remote_id(req); hdr->req.req_id = ucp_send_request_get_id(req); @@ -70,6 +72,7 @@ static size_t ucp_tag_pack_eager_first_dt(void *dest, void *arg) sizeof(*hdr); length = ucs_min(length, req->send.length); hdr->super.super.tag = req->send.msg_proto.tag; + hdr->super.ep_id = ucp_send_request_get_ep_remote_id(req); hdr->total_len = req->send.length; hdr->msg_id = req->send.msg_proto.message_id; @@ -80,6 +83,7 @@ static size_t ucp_tag_pack_eager_sync_first_dt(void *dest, void *arg) { ucp_eager_sync_first_hdr_t *hdr = dest; ucp_request_t *req = arg; + ucs_ptr_map_key_t ep_id = ucp_send_request_get_ep_remote_id(req); size_t length; ucs_assert(req->send.lane == ucp_ep_get_am_lane(req->send.ep)); @@ -89,8 +93,9 @@ static size_t ucp_tag_pack_eager_sync_first_dt(void *dest, void *arg) sizeof(*hdr); length = ucs_min(length, req->send.length); hdr->super.super.super.tag = req->send.msg_proto.tag; + hdr->super.super.ep_id = ep_id; hdr->super.total_len = req->send.length; - hdr->req.ep_id = ucp_send_request_get_ep_remote_id(req); + hdr->req.ep_id = ep_id; hdr->super.msg_id = req->send.msg_proto.message_id; hdr->req.req_id = ucp_send_request_get_id(req); @@ -107,6 +112,7 @@ static size_t ucp_tag_pack_eager_middle_dt(void *dest, void *arg) sizeof(*hdr), req->send.length - req->send.state.dt.offset); hdr->msg_id = req->send.msg_proto.message_id; + hdr->ep_id = ucp_send_request_get_ep_remote_id(req); hdr->offset = req->send.state.dt.offset; return ucp_tag_pack_eager_common(req, hdr + 1, length, sizeof(*hdr), 0); @@ -121,10 +127,9 @@ static ucs_status_t ucp_tag_eager_contig_short(uct_pending_req_t *self) ucs_status_t status; req->send.lane = ucp_ep_get_am_lane(ep); - status = uct_ep_am_short(ep->uct_eps[req->send.lane], - UCP_AM_ID_EAGER_ONLY, - req->send.msg_proto.tag, 
req->send.buffer, - req->send.length); + status = ucp_ep_tag_send_am_short_iov(ep, req->send.buffer, + req->send.length, + req->send.msg_proto.tag); return ucp_am_short_handle_status_from_pending(req, status); } @@ -153,6 +158,7 @@ static ucs_status_t ucp_tag_eager_zcopy_single(uct_pending_req_t *self) ucp_eager_hdr_t hdr; hdr.super.tag = req->send.msg_proto.tag; + hdr.ep_id = ucp_send_request_get_ep_remote_id(req); return ucp_do_am_zcopy_single(self, UCP_AM_ID_EAGER_ONLY, &hdr, sizeof(hdr), NULL, 0ul, ucp_proto_am_zcopy_req_complete); } @@ -160,13 +166,16 @@ static ucs_status_t ucp_tag_eager_zcopy_single(uct_pending_req_t *self) static ucs_status_t ucp_tag_eager_zcopy_multi(uct_pending_req_t *self) { ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + uint64_t ep_id = ucp_send_request_get_ep_remote_id(req); ucp_eager_first_hdr_t first_hdr; ucp_eager_middle_hdr_t middle_hdr; first_hdr.super.super.tag = req->send.msg_proto.tag; + first_hdr.super.ep_id = ep_id; first_hdr.total_len = req->send.length; first_hdr.msg_id = req->send.msg_proto.message_id; middle_hdr.msg_id = req->send.msg_proto.message_id; + middle_hdr.ep_id = ep_id; middle_hdr.offset = req->send.state.dt.offset; return ucp_do_am_zcopy_multi(self, UCP_AM_ID_EAGER_FIRST, @@ -253,6 +262,7 @@ static ucs_status_t ucp_tag_eager_sync_zcopy_single(uct_pending_req_t *self) ucp_eager_sync_hdr_t hdr; hdr.super.super.tag = req->send.msg_proto.tag; + hdr.super.ep_id = hdr.req.ep_id = ucp_send_request_get_ep_remote_id(req); hdr.req.req_id = ucp_send_request_get_id(req); @@ -269,6 +279,7 @@ static ucs_status_t ucp_tag_eager_sync_zcopy_multi(uct_pending_req_t *self) if (req->send.state.dt.offset != 0) { middle_hdr.msg_id = req->send.msg_proto.message_id; + middle_hdr.ep_id = ucp_send_request_get_ep_remote_id(req); middle_hdr.offset = req->send.state.dt.offset; return ucp_do_am_zcopy_multi(self, UCP_AM_ID_LAST, @@ -279,6 +290,7 @@ static ucs_status_t ucp_tag_eager_sync_zcopy_multi(uct_pending_req_t *self) } first_hdr.super.super.super.tag = req->send.msg_proto.tag; + first_hdr.super.super.ep_id = ucp_send_request_get_ep_remote_id(req); first_hdr.super.total_len = req->send.length; first_hdr.req.ep_id = ucp_send_request_get_ep_remote_id(req); first_hdr.req.req_id = ucp_send_request_get_id(req); diff --git a/src/ucp/tag/tag_match.c b/src/ucp/tag/tag_match.c index d7009203453..0ee05ef9ba8 100644 --- a/src/ucp/tag/tag_match.c +++ b/src/ucp/tag/tag_match.c @@ -153,7 +153,7 @@ ucp_tag_exp_search_all(ucp_tag_match_t *tm, ucp_request_queue_t *req_queue, * offload flow. 
*/ void ucp_tag_frag_list_process_queue(ucp_tag_match_t *tm, ucp_request_t *req, - uint64_t msg_id + uint64_t msg_id, uint64_t ep_id UCS_STATS_ARG(int counter_idx)) { ucp_tag_frag_match_t *matchq; @@ -174,5 +174,6 @@ void ucp_tag_frag_list_process_queue(ucp_tag_match_t *tm, ucp_request_t *req, } /* request not completed, put it on the hash */ + req->recv.tag.ep_id = ep_id; ucp_tag_frag_hash_init_exp(matchq, req); } diff --git a/src/ucp/tag/tag_match.h b/src/ucp/tag/tag_match.h index b653658cbf8..00f767d4c8f 100644 --- a/src/ucp/tag/tag_match.h +++ b/src/ucp/tag/tag_match.h @@ -114,7 +114,7 @@ ucp_tag_exp_search_all(ucp_tag_match_t *tm, ucp_request_queue_t *req_queue, ucp_tag_t tag); void ucp_tag_frag_list_process_queue(ucp_tag_match_t *tm, ucp_request_t *req, - uint64_t msg_id + uint64_t msg_id, uint64_t ep_id UCS_STATS_ARG(int counter_idx)); #endif diff --git a/src/ucp/tag/tag_match.inl b/src/ucp/tag/tag_match.inl index 8cc0e3f6f70..0ba705635e7 100644 --- a/src/ucp/tag/tag_match.inl +++ b/src/ucp/tag/tag_match.inl @@ -139,7 +139,8 @@ ucp_tag_unexp_remove(ucp_recv_desc_t *rdesc) } static UCS_F_ALWAYS_INLINE void -ucp_tag_unexp_recv(ucp_tag_match_t *tm, ucp_recv_desc_t *rdesc, ucp_tag_t tag) +ucp_tag_unexp_recv(ucp_tag_match_t *tm, ucp_recv_desc_t *rdesc, ucp_tag_t tag, + uint64_t ep_id) { ucs_list_link_t *hash_list; @@ -147,8 +148,8 @@ ucp_tag_unexp_recv(ucp_tag_match_t *tm, ucp_recv_desc_t *rdesc, ucp_tag_t tag) ucs_list_add_tail(hash_list, &rdesc->tag_list[UCP_RDESC_HASH_LIST]); ucs_list_add_tail(&tm->unexpected.all, &rdesc->tag_list[UCP_RDESC_ALL_LIST]); - ucs_trace_req("unexp "UCP_RECV_DESC_FMT" tag %"PRIx64, - UCP_RECV_DESC_ARG(rdesc), tag); + ucs_trace_req("unexp "UCP_RECV_DESC_FMT" tag %"PRIx64" ep_id %"PRIx64, + UCP_RECV_DESC_ARG(rdesc), tag, ep_id); } static UCS_F_ALWAYS_INLINE ucp_recv_desc_t* diff --git a/src/ucp/tag/tag_recv.c b/src/ucp/tag/tag_recv.c index fd933054d92..bc1c18dbfaa 100644 --- a/src/ucp/tag/tag_recv.c +++ b/src/ucp/tag/tag_recv.c @@ -24,6 +24,7 @@ static void ucp_tag_recv_eager_multi(ucp_worker_h worker, ucp_request_t *req, { ucp_eager_first_hdr_t *first_hdr; uint64_t msg_id; + uint64_t ep_id; ucs_status_t status; UCP_WORKER_STAT_EAGER_MSG(worker, rdesc->flags); @@ -41,6 +42,7 @@ static void ucp_tag_recv_eager_multi(ucp_worker_h worker, ucp_request_t *req, first_hdr = (ucp_eager_first_hdr_t*)(rdesc + 1); req->recv.remaining = req->recv.tag.info.length = first_hdr->total_len; msg_id = first_hdr->msg_id; + ep_id = first_hdr->super.ep_id; if (ucs_unlikely(rdesc->flags & UCP_RECV_DESC_FLAG_EAGER_SYNC)) { ucp_tag_eager_sync_send_ack(worker, rdesc + 1, rdesc->flags); @@ -48,9 +50,8 @@ static void ucp_tag_recv_eager_multi(ucp_worker_h worker, ucp_request_t *req, status = ucp_tag_recv_request_process_rdesc(req, rdesc, 0, 0); if (status == UCS_INPROGRESS) { - ucp_tag_frag_list_process_queue( - &worker->tm, req, msg_id - UCS_STATS_ARG(UCP_WORKER_STAT_TAG_RX_EAGER_CHUNK_UNEXP)); + ucp_tag_frag_list_process_queue(&worker->tm, req, msg_id, ep_id + UCS_STATS_ARG(UCP_WORKER_STAT_TAG_RX_EAGER_CHUNK_UNEXP)); } } } @@ -135,6 +136,7 @@ ucp_tag_recv_common(ucp_worker_h worker, void *buffer, size_t count, req->recv.tag.tag = tag; req->recv.tag.tag_mask = tag_mask; + req->recv.tag.ep_id = UCS_PTR_MAP_KEY_INVALID; if (param->op_attr_mask & UCP_OP_ATTR_FIELD_CALLBACK) { req->recv.tag.cb = param->cb.recv; diff --git a/src/ucp/tag/tag_rndv.c b/src/ucp/tag/tag_rndv.c index 094db7dec93..1e87bc6de06 100644 --- a/src/ucp/tag/tag_rndv.c +++ b/src/ucp/tag/tag_rndv.c @@ -28,6 +28,66 @@ void 
ucp_tag_rndv_matched(ucp_worker_h worker, ucp_request_t *rreq, hdr_length - sizeof(*rts_hdr)); } +static void ucp_rndv_send_cancel_ack(ucp_worker_h worker, + ucp_rndv_rts_hdr_t *rndv_rts_hdr) +{ + ucp_ep_h ep = NULL; + ucp_request_t *req; + + UCP_WORKER_GET_EP_BY_ID(&ep, worker, rndv_rts_hdr->sreq.ep_id, return, + "ats_cancel"); + + req = ucp_request_get(worker); + if (req == NULL) { + return; + } + + req->send.ep = ep; + req->flags = 0; + req->send.rndv.mdesc = NULL; + req->send.pending_lane = UCP_NULL_LANE; + + ucp_rndv_req_send_ack(req, sizeof(*rndv_rts_hdr), rndv_rts_hdr->sreq.req_id, + UCS_ERR_CANCELED, UCP_AM_ID_RNDV_ATS, + "send_ats_cancel"); +} + +static void ucp_rndv_unexp_cancel(ucp_worker_h worker, + ucp_rndv_rts_hdr_t *rndv_rts_hdr) +{ + ucp_tag_hdr_t* tag_hdr = ucp_tag_hdr_from_rts(rndv_rts_hdr); + ucp_ep_h UCS_V_UNUSED ep = NULL; + const ucp_rndv_rts_hdr_t *rdesc_rts_hdr; + ucp_recv_desc_t *rdesc; + ucs_list_link_t *list; + + UCP_WORKER_GET_EP_BY_ID(&ep, worker, rndv_rts_hdr->sreq.ep_id, + ep = NULL, "unexp_cancel"); + + list = ucp_tag_unexp_get_list_for_tag(&worker->tm, tag_hdr->tag); + ucs_list_for_each(rdesc, list, tag_list[UCP_RDESC_HASH_LIST]) { + rdesc_rts_hdr = (const void*)(rdesc + 1); + if ((rdesc->flags & UCP_RECV_DESC_FLAG_RNDV) && + (ucp_rdesc_get_tag(rdesc) == tag_hdr->tag) && + (rdesc_rts_hdr->sreq.ep_id == rndv_rts_hdr->sreq.ep_id) && + (rdesc_rts_hdr->sreq.req_id == rndv_rts_hdr->sreq.req_id)) + { + ucs_debug("ep %p, canceling unexp rdesc " UCP_RECV_DESC_FMT " with " + "tag %"PRIx64" ep_id %"PRIx64, ep, + UCP_RECV_DESC_ARG(rdesc), ucp_rdesc_get_tag(rdesc), + rdesc_rts_hdr->sreq.ep_id); + ucp_tag_unexp_remove(rdesc); + ucp_rndv_send_cancel_ack(worker, rndv_rts_hdr); + ucp_recv_desc_release(rdesc); + return; + } + } + + ucs_debug("ep %p, unexp rdesc for RTS tag %"PRIx64" ep_id %"PRIx64 + " req_id %"PRIx64" is not found", ep, tag_hdr->tag, + rndv_rts_hdr->sreq.ep_id, rndv_rts_hdr->sreq.req_id); +} + ucs_status_t ucp_tag_rndv_process_rts(ucp_worker_h worker, ucp_rndv_rts_hdr_t *rts_hdr, size_t length, unsigned tl_flags) @@ -35,9 +95,15 @@ ucs_status_t ucp_tag_rndv_process_rts(ucp_worker_h worker, ucp_recv_desc_t *rdesc; ucp_request_t *rreq; ucs_status_t status; + ucp_ep_h ep UCS_V_UNUSED; ucs_assert(ucp_rndv_rts_is_tag(rts_hdr)); + if (ucs_unlikely(rts_hdr->opcode == UCP_RNDV_RTS_TAG_CANCELED)) { + ucp_rndv_unexp_cancel(worker, rts_hdr); + return UCS_OK; + } + rreq = ucp_tag_exp_search(&worker->tm, ucp_tag_hdr_from_rts(rts_hdr)->tag); if (rreq != NULL) { /* Cancel req in transport if it was offloaded, because it arrived @@ -49,6 +115,9 @@ ucs_status_t ucp_tag_rndv_process_rts(ucp_worker_h worker, return UCS_OK; } + UCP_WORKER_GET_EP_BY_ID(&ep, worker, rts_hdr->sreq.ep_id, return UCS_OK, + "rts"); + ucs_assert(length >= sizeof(*rts_hdr)); status = ucp_recv_desc_init(worker, rts_hdr, length, 0, tl_flags, @@ -58,7 +127,8 @@ ucs_status_t ucp_tag_rndv_process_rts(ucp_worker_h worker, ucs_assert(ucp_rdesc_get_tag(rdesc) == ucp_tag_hdr_from_rts(rts_hdr)->tag); ucp_tag_unexp_recv(&worker->tm, rdesc, - ucp_tag_hdr_from_rts(rts_hdr)->tag); + ucp_tag_hdr_from_rts(rts_hdr)->tag, + rts_hdr->sreq.ep_id); } return status; diff --git a/src/ucp/tag/tag_send.c b/src/ucp/tag/tag_send.c index d60c96b3063..349dae234d4 100644 --- a/src/ucp/tag/tag_send.c +++ b/src/ucp/tag/tag_send.c @@ -153,10 +153,7 @@ ucp_tag_send_inline(ucp_ep_h ep, const void *buffer, size_t length, ucp_tag_t ta ucs_status_t status; if (ucp_proto_is_inline(ep, &ucp_ep_config(ep)->tag.max_eager_short, length)) { - 
UCS_STATIC_ASSERT(sizeof(ucp_tag_t) == sizeof(ucp_eager_hdr_t)); - UCS_STATIC_ASSERT(sizeof(ucp_tag_t) == sizeof(uint64_t)); - status = uct_ep_am_short(ucp_ep_get_am_uct_ep(ep), UCP_AM_ID_EAGER_ONLY, - tag, buffer, length); + status = ucp_ep_tag_send_am_short_iov(ep, buffer, length, tag); } else if (ucp_proto_is_inline(ep, &ucp_ep_config(ep)->tag.offload.max_eager_short, length)) { From 7a34b393b97a531e98516e477516a6e0946eb6c8 Mon Sep 17 00:00:00 2001 From: Sergey Oblomov Date: Wed, 22 Sep 2021 11:10:30 +0300 Subject: [PATCH 04/19] UCP/TAG/SEND: added eager/rndv flags to tag_send op - added op-flags support to tag_send_[sync]_nbx functions - added flags eager/rndv flags to force used protocol --- src/ucp/api/ucp.h | 21 +++++++++++++++++- src/ucp/tag/tag_send.c | 38 +++++++++++++++++++++++++++------ test/apps/iodemo/io_demo.cc | 14 ++++++++++-- test/apps/iodemo/ucx_wrapper.cc | 9 +++++++- test/apps/iodemo/ucx_wrapper.h | 10 ++++++++- 5 files changed, 81 insertions(+), 11 deletions(-) diff --git a/src/ucp/api/ucp.h b/src/ucp/api/ucp.h index 801bd6052e1..9bfb965aa03 100644 --- a/src/ucp/api/ucp.h +++ b/src/ucp/api/ucp.h @@ -721,6 +721,21 @@ typedef enum { } ucp_op_attr_t; +/** + * @ingroup UCP_COMM + * @brief UCP tag send operation flags + * + * Flags dictate the behavior of @ref ucp_tag_send_nbx and + * @ref ucp_tag_send_sync_nbx routines. + */ +typedef enum { + UCP_EP_TAG_SEND_FLAG_EAGER = UCS_BIT(0), /**< force use eager protocol + to transfer data */ + UCP_EP_TAG_SEND_FLAG_RNDV = UCS_BIT(1) /**< force use rndv protocol + to transfer data */ +} ucp_ep_tag_send_flags_t; + + /** * @ingroup UCP_COMM * @brief UCP request query attributes @@ -3521,7 +3536,11 @@ ucs_status_ptr_t ucp_tag_send_sync_nb(ucp_ep_h ep, const void *buffer, size_t co * @param [in] buffer Pointer to the message buffer (payload). * @param [in] count Number of elements to send * @param [in] tag Message tag. - * @param [in] param Operation parameters, see @ref ucp_request_param_t + * @param [in] param Operation parameters, see @ref ucp_request_param_t. + * This operation supports specific flags, which can be + * passed in @a param by @ref ucp_request_param_t.flags. + * The exact set of flags is defined + * by @ref ucp_ep_tag_send_flags_t. * * @return UCS_OK - The send operation was completed immediately. * @return UCS_PTR_IS_ERR(_ptr) - The send operation failed. 
diff --git a/src/ucp/tag/tag_send.c b/src/ucp/tag/tag_send.c index 349dae234d4..59fdfdad3a0 100644 --- a/src/ucp/tag/tag_send.c +++ b/src/ucp/tag/tag_send.c @@ -24,8 +24,18 @@ static UCS_F_ALWAYS_INLINE size_t ucp_tag_get_rndv_threshold(const ucp_request_t *req, size_t count, size_t max_iov, size_t rndv_rma_thresh, - size_t rndv_am_thresh) + size_t rndv_am_thresh, uint32_t flags) { + /* Eager protocol requested - set rndv threshold to max */ + if (flags & UCP_EP_TAG_SEND_FLAG_EAGER) { + return SIZE_MAX; + } + + /* RNDV protocol requested - set rndv threshold to 0 */ + if (flags & UCP_EP_TAG_SEND_FLAG_RNDV) { + return 0; + } + switch (req->send.datatype & UCP_DATATYPE_CLASS_MASK) { case UCP_DATATYPE_IOV: if ((count > max_iov) && @@ -54,6 +64,7 @@ ucp_tag_send_req(ucp_request_t *req, size_t dt_count, { ssize_t max_short = ucp_proto_get_short_max(req, msg_config); ucp_ep_config_t *ep_config = ucp_ep_config(req->send.ep); + uint32_t flags = ucp_request_param_flags(param); ucs_status_t status; size_t zcopy_thresh; size_t rndv_thresh; @@ -65,7 +76,8 @@ ucp_tag_send_req(ucp_request_t *req, size_t dt_count, &rndv_rma_thresh, &rndv_am_thresh); rndv_thresh = ucp_tag_get_rndv_threshold(req, dt_count, msg_config->max_iov, - rndv_rma_thresh, rndv_am_thresh); + rndv_rma_thresh, rndv_am_thresh, + flags); if (!(param->op_attr_mask & UCP_OP_ATTR_FLAG_FAST_CMPL) || ucs_unlikely(!UCP_MEM_IS_HOST(req->send.mem_type))) { @@ -228,7 +240,8 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_nbx, ucp_ep_h ep, const void *buffer, size_t count, ucp_tag_t tag, const ucp_request_param_t *param) { - size_t contig_length = 0; + size_t contig_length = 0; + uint32_t UCS_V_UNUSED flags = ucp_request_param_flags(param); ucs_status_t status; ucp_request_t *req; ucs_status_ptr_t ret; @@ -240,6 +253,12 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_nbx, return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); UCP_REQUEST_CHECK_PARAM(param); + if (ENABLE_PARAMS_CHECK && + ucs_test_all_flags(flags, UCP_EP_TAG_SEND_FLAG_EAGER | + UCP_EP_TAG_SEND_FLAG_RNDV)) { + return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); + } + UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); ucs_trace_req("send_nbx buffer %p count %zu tag %"PRIx64" to %s", @@ -305,9 +324,10 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_sync_nbx, ucp_ep_h ep, const void *buffer, size_t count, ucp_tag_t tag, const ucp_request_param_t *param) { - ucp_worker_h worker = ep->worker; - size_t contig_length = 0; - uintptr_t datatype = ucp_request_param_datatype(param); + ucp_worker_h worker = ep->worker; + size_t contig_length = 0; + uintptr_t datatype = ucp_request_param_datatype(param); + uint32_t UCS_V_UNUSED flags = ucp_request_param_flags(param); ucs_status_t status; ucp_request_t *req; ucs_status_ptr_t ret; @@ -317,6 +337,12 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_sync_nbx, UCS_ERR_INVALID_PARAM)); UCP_REQUEST_CHECK_PARAM(param); + if (ENABLE_PARAMS_CHECK && + ucs_test_all_flags(flags, UCP_EP_TAG_SEND_FLAG_EAGER | + UCP_EP_TAG_SEND_FLAG_RNDV)) { + return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); + } + UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); ucs_trace_req("send_sync_nbx buffer %p count %zu tag %"PRIx64" to %s", diff --git a/test/apps/iodemo/io_demo.cc b/test/apps/iodemo/io_demo.cc index a4cb9a679ea..3631e45f3f2 100644 --- a/test/apps/iodemo/io_demo.cc +++ b/test/apps/iodemo/io_demo.cc @@ -70,6 +70,7 @@ typedef struct { double retry_interval; double client_runtime_limit; double print_interval; + size_t rndv_thresh; size_t iomsg_size; size_t min_data_size; size_t 
max_data_size; @@ -875,7 +876,8 @@ class P2pDemoCommon : public UcxContext { P2pDemoCommon(const options_t &test_opts, uint32_t iov_buf_filler) : UcxContext(test_opts.iomsg_size, test_opts.connect_timeout, - test_opts.use_am, test_opts.use_epoll), + test_opts.use_am, test_opts.rndv_thresh, + test_opts.use_epoll), _test_opts(test_opts), _io_msg_pool(test_opts.iomsg_size, "io messages"), _send_callback_pool(0, "send callbacks"), @@ -2659,6 +2661,7 @@ static int parse_args(int argc, char **argv, options_t *test_opts) test_opts->retry_interval = 5.0; test_opts->client_runtime_limit = std::numeric_limits::max(); test_opts->print_interval = 1.0; + test_opts->rndv_thresh = UcxContext::rndv_thresh_auto; test_opts->min_data_size = 4096; test_opts->max_data_size = 4096; test_opts->chunk_size = std::numeric_limits::max(); @@ -2679,7 +2682,7 @@ static int parse_args(int argc, char **argv, options_t *test_opts) test_opts->per_conn_info = false; while ((c = getopt(argc, argv, - "p:c:r:d:b:i:w:a:k:o:t:n:l:s:y:vqeADHP:m:L:I:zV")) != -1) { + "p:c:r:d:b:i:w:a:k:o:t:n:l:s:y:vqeADHP:R:m:L:I:zV")) != -1) { switch (c) { case 'p': test_opts->port_num = atoi(optarg); @@ -2809,6 +2812,9 @@ static int parse_args(int argc, char **argv, options_t *test_opts) case 'P': test_opts->print_interval = atof(optarg); break; + case 'R': + test_opts->rndv_thresh = strtol(optarg, NULL, 0); + break; case 'm': if (!strcmp(optarg, "host")) { test_opts->memory_type = UCS_MEMORY_TYPE_HOST; @@ -2866,6 +2872,10 @@ static int parse_args(int argc, char **argv, options_t *test_opts) std::cout << " -D Enable debugging mode for IO operation timeouts" << std::endl; std::cout << " -H Use human-readable timestamps" << std::endl; std::cout << " -P Set report printing interval" << std::endl; + std::cout << " -R Always use rendezvous protocol for messages starting" << std::endl; + std::cout << " from this size, and eager protocol for" << std::endl; + std::cout << " messages lower than this size. If not set," << std::endl; + std::cout << " the threshold is selected automatically by UCX" << std::endl; std::cout << "" << std::endl; std::cout << " -m Memory type to use. Possible values: host" #ifdef HAVE_CUDA diff --git a/test/apps/iodemo/ucx_wrapper.cc b/test/apps/iodemo/ucx_wrapper.cc index 8304b2bedef..143442a0314 100644 --- a/test/apps/iodemo/ucx_wrapper.cc +++ b/test/apps/iodemo/ucx_wrapper.cc @@ -111,7 +111,7 @@ void UcxContext::UcxDisconnectCallback::operator()(ucs_status_t status) } UcxContext::UcxContext(size_t iomsg_size, double connect_timeout, bool use_am, - bool use_epoll) : + size_t rndv_thresh, bool use_epoll) : _context(NULL), _worker(NULL), _listener(NULL), _iomsg_recv_request(NULL), _iomsg_buffer(iomsg_size), _connect_timeout(connect_timeout), _use_am(use_am), _worker_fd(-1), _epoll_fd(-1) @@ -1228,6 +1228,13 @@ bool UcxConnection::send_common(const void *buffer, size_t length, param.datatype = 0; // make coverity happy param.cb.send = (ucp_send_nbx_callback_t)common_request_callback; + if (_context.rndv_thresh() != UcxContext::rndv_thresh_auto) { + param.op_attr_mask |= UCP_OP_ATTR_FIELD_FLAGS; + param.flags = (length >= _context.rndv_thresh()) ? 
+ UCP_EP_TAG_SEND_FLAG_RNDV : + UCP_EP_TAG_SEND_FLAG_EAGER; + } + if (memh) { param.op_attr_mask |= UCP_OP_ATTR_FIELD_MEMH; param.memh = memh; diff --git a/test/apps/iodemo/ucx_wrapper.h b/test/apps/iodemo/ucx_wrapper.h index c906d0b1c72..155b02cfede 100644 --- a/test/apps/iodemo/ucx_wrapper.h +++ b/test/apps/iodemo/ucx_wrapper.h @@ -112,8 +112,10 @@ class UcxContext { public: typedef std::vector iomsg_buffer_t; + static const size_t rndv_thresh_auto = (size_t)-2; + UcxContext(size_t iomsg_size, double connect_timeout, bool use_am, - bool use_epoll = false); + size_t rndv_thresh, bool use_epoll = false); virtual ~UcxContext(); @@ -256,6 +258,11 @@ class UcxContext { void destroy_worker(); + size_t rndv_thresh() const + { + return _rndv_thresh; + } + void set_am_handler(ucp_am_recv_callback_t cb, void *arg); ucp_context_h _context; @@ -272,6 +279,7 @@ class UcxContext { bool _use_am; int _worker_fd; int _epoll_fd; + size_t _rndv_thresh; }; From ba9a5b76bfceac0209540b31099c9b8496afed3d Mon Sep 17 00:00:00 2001 From: Evgeny Leksikov Date: Thu, 26 Aug 2021 18:24:44 +0300 Subject: [PATCH 05/19] GTEST/UCP/TAG: test unexp queue cleanup --- test/gtest/ucp/test_ucp_sockaddr.cc | 61 ++++++++++++++++++----------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/test/gtest/ucp/test_ucp_sockaddr.cc b/test/gtest/ucp/test_ucp_sockaddr.cc index 50649f2d218..2494f0c3e98 100644 --- a/test/gtest/ucp/test_ucp_sockaddr.cc +++ b/test/gtest/ucp/test_ucp_sockaddr.cc @@ -54,7 +54,8 @@ class test_ucp_sockaddr : public ucp_test { enum { SEND_DIRECTION_C2S = UCS_BIT(0), /* send data from client to server */ SEND_DIRECTION_S2C = UCS_BIT(1), /* send data from server to client */ - SEND_DIRECTION_BIDI = SEND_DIRECTION_C2S | SEND_DIRECTION_S2C /* bidirectional send */ + SEND_DIRECTION_BIDI = SEND_DIRECTION_C2S | SEND_DIRECTION_S2C, /* bidirectional send */ + SEND_NO_RECV = UCS_BIT(2) /* do not recv data to test unexp Q cleanup */ }; typedef enum { @@ -531,30 +532,34 @@ class test_ucp_sockaddr : public ucp_test { void send_recv(entity& from, entity& to, send_recv_type_t send_recv_type, bool wakeup, ucp_test_base::entity::listen_cb_type_t cb_type, - size_t ep_index = 0) + bool no_recv = false, size_t ep_index = 0) { const uint64_t send_data = ucs_generate_uuid(0); uint64_t recv_data = 0; rx_am_msg_arg am_rx_arg(to, NULL, &recv_data); ucs_status_t send_status; - if (send_recv_type == SEND_RECV_AM) { + if (!no_recv && (send_recv_type == SEND_RECV_AM)) { set_am_data_handler(to, 0, rx_am_msg_cb, &am_rx_arg); } void *send_req = send(from, &send_data, sizeof(send_data), send_recv_type, scomplete_cbx, NULL, ep_index); - void *recv_req = NULL; // to suppress compiler warning - if (send_recv_type == SEND_RECV_TAG) { - recv_req = recv(to, &recv_data, sizeof(recv_data), - rtag_complete_cbx, NULL); - } else if (send_recv_type == SEND_RECV_STREAM) { - recv_req = recv(to, &recv_data, sizeof(recv_data), - rstream_complete_cbx, NULL); - } else if (send_recv_type != SEND_RECV_AM) { - UCS_TEST_ABORT("unsupported communication type " + - std::to_string(send_recv_type)); + void *recv_req = NULL; + if (!no_recv) { + if (send_recv_type == SEND_RECV_TAG) { + recv_req = recv(to, &recv_data, sizeof(recv_data), + rtag_complete_cbx, NULL); + } else if (send_recv_type == SEND_RECV_STREAM) { + recv_req = recv(to, &recv_data, sizeof(recv_data), + rstream_complete_cbx, NULL); + } else if (send_recv_type != SEND_RECV_AM) { + UCS_TEST_ABORT("unsupported communication type " + + std::to_string(send_recv_type)); + } + } else { + 
UCS_TEST_MESSAGE << "Do not recv"; } { @@ -566,14 +571,16 @@ class test_ucp_sockaddr : public ucp_test { } } - if (send_recv_type == SEND_RECV_AM) { - request_wait(am_rx_arg.rreq); - wait_for_flag(&am_rx_arg.received); - set_am_data_handler(to, 0, NULL, NULL); - } else { - request_wait(recv_req, 0, wakeup); + if (!no_recv) { + if (send_recv_type == SEND_RECV_AM) { + request_wait(am_rx_arg.rreq); + wait_for_flag(&am_rx_arg.received); + set_am_data_handler(to, 0, NULL, NULL); + } else { + request_wait(recv_req, 0, wakeup); + } + EXPECT_EQ(send_data, recv_data); } - EXPECT_EQ(send_data, recv_data); } bool wait_for_server_ep(bool wakeup) @@ -661,12 +668,12 @@ class test_ucp_sockaddr : public ucp_test { if (flags & SEND_DIRECTION_C2S) { send_recv(sender(), receiver(), send_recv_type(), wakeup, - cb_type()); + cb_type(), flags & SEND_NO_RECV); } if (flags & SEND_DIRECTION_S2C) { send_recv(receiver(), sender(), send_recv_type(), wakeup, - cb_type()); + cb_type(), flags & SEND_NO_RECV); } } @@ -1007,6 +1014,10 @@ UCS_TEST_P(test_ucp_sockaddr, listen_bidi) { listen_and_communicate(false, SEND_DIRECTION_BIDI); } +UCS_TEST_P(test_ucp_sockaddr, listen_bidi_no_recv) { + listen_and_communicate(false, SEND_DIRECTION_BIDI | SEND_NO_RECV); +} + UCS_TEST_P(test_ucp_sockaddr, ep_query) { listen_and_communicate(false, 0); ep_query(); @@ -2774,6 +2785,10 @@ class test_ucp_sockaddr_protocols_err : public test_ucp_sockaddr_protocols { void test_tag_send_recv(size_t size, bool is_exp, bool is_sync = false) { + if (!is_exp) { + UCS_TEST_SKIP_R("ucp_tag_probe_nb + err handling is not supported"); + } + /* warmup */ test_ucp_sockaddr_protocols::test_tag_send_recv(size, is_exp, is_sync); @@ -2905,7 +2920,7 @@ class test_ucp_sockaddr_protocols_err_sender /* Warmup */ send_recv(sender(), receiver(), send_recv_type(), false, cb_type(), - sender_idx); + false, sender_idx); for (size_t i = 0; i < num_sends; ++i) { void *sreq = send(sender(), send_buf.ptr(), size, From 33ffac9f9d3e0eec04739d616dce0e3909e353b5 Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Wed, 9 Dec 2020 15:28:05 +0200 Subject: [PATCH 06/19] UCT/IB: Non-blocking IB device close --- src/ucs/arch/aarch64/cpu.h | 4 ++ src/ucs/arch/ppc64/cpu.h | 4 ++ src/ucs/arch/x86_64/cpu.c | 17 +++++ src/ucs/arch/x86_64/cpu.h | 3 + src/uct/ib/base/ib_device.c | 119 ++++++++++++++++++++++++++++++++++- src/uct/ib/base/ib_device.h | 17 ++++- src/uct/ib/base/ib_md.c | 8 ++- src/uct/ib/base/ib_md.h | 1 + test/gtest/uct/ib/test_ib.cc | 10 +++ 9 files changed, 179 insertions(+), 4 deletions(-) diff --git a/src/ucs/arch/aarch64/cpu.h b/src/ucs/arch/aarch64/cpu.h index 06d43b644a9..eda59830a8f 100644 --- a/src/ucs/arch/aarch64/cpu.h +++ b/src/ucs/arch/aarch64/cpu.h @@ -70,6 +70,10 @@ BEGIN_C_DECLS #define ucs_memory_cpu_wc_fence() ucs_aarch64_dmb(oshst) +/* revert to glibc syscall */ +#define ucs_syscall_raw syscall + + /* * ARM processor ID (ARM ISA - Main ID Register, EL1) */ diff --git a/src/ucs/arch/ppc64/cpu.h b/src/ucs/arch/ppc64/cpu.h index c163c2968de..85f4f23c8aa 100644 --- a/src/ucs/arch/ppc64/cpu.h +++ b/src/ucs/arch/ppc64/cpu.h @@ -39,6 +39,10 @@ BEGIN_C_DECLS #define ucs_memory_cpu_wc_fence() ucs_memory_bus_fence() +/* revert to glibc syscall */ +#define ucs_syscall_raw syscall + + static inline uint64_t ucs_arch_read_hres_clock() { #if HAVE_DECL___PPC_GET_TIMEBASE diff --git a/src/ucs/arch/x86_64/cpu.c b/src/ucs/arch/x86_64/cpu.c index 81a6e0406a6..90d64870791 100644 --- a/src/ucs/arch/x86_64/cpu.c +++ b/src/ucs/arch/x86_64/cpu.c @@ -755,4 +755,21 @@ void 
ucs_x86_memcpy_sse_movntdqa(void *dst, const void *src, size_t len) #endif } +int ucs_syscall_raw(unsigned long num, unsigned long arg1, unsigned long arg2, + unsigned long arg3) +{ + int ret; + + asm volatile ( + "movq %1, %%rax\n\t" + "movq %2, %%rdi\n\t" + "movq %3, %%rsi\n\t" + "movq %4, %%rdx\n\t" + "syscall\n\t" + :"=a"(ret) + :"r"(num), "r"(arg1), "r"(arg2), "r"(arg3)); + + return ret; +} + #endif diff --git a/src/ucs/arch/x86_64/cpu.h b/src/ucs/arch/x86_64/cpu.h index a7a6f56a661..bdab8c22853 100644 --- a/src/ucs/arch/x86_64/cpu.h +++ b/src/ucs/arch/x86_64/cpu.h @@ -52,6 +52,9 @@ ucs_cpu_vendor_t ucs_arch_get_cpu_vendor(); void ucs_cpu_init(); ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes); void ucs_x86_memcpy_sse_movntdqa(void *dst, const void *src, size_t len); +int ucs_syscall_raw(unsigned long num, unsigned long arg1, unsigned long arg2, + unsigned long arg3); + static UCS_F_ALWAYS_INLINE int ucs_arch_x86_rdtsc_enabled() { diff --git a/src/uct/ib/base/ib_device.c b/src/uct/ib/base/ib_device.c index f97d33db2a1..3d8621c04b5 100644 --- a/src/uct/ib/base/ib_device.c +++ b/src/uct/ib/base/ib_device.c @@ -20,6 +20,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -804,12 +807,124 @@ ucs_status_t uct_ib_device_query(uct_ib_device_t *dev, return UCS_OK; } +static int uct_ib_device_cleanup_proc(void* arg) +{ + uct_ib_device_nb_close_ctx *ctx = arg; + static const char *process_name = "ucx_cleanup"; + char dummy; + int fd; + + /* Since TLS of this thread is uninitialized avoid using glibc */ + ucs_syscall_raw(SYS_prctl, PR_SET_NAME, (long)process_name, 0); + + for (fd = 0; fd < ctx->max_fds; fd++) { + if ((fd != ctx->cmd_fd) && (fd != ctx->pipefds[0])) { + ucs_syscall_raw(SYS_close, fd, 0, 0); + } + } + + /* Wait until pipe closed - either parent terminated or closing device */ + ucs_syscall_raw(SYS_read, ctx->pipefds[0], (long)&dummy, 1); + + return 0; +} + +static ucs_status_t +uct_ib_device_init_nb_close_ctx(int fd, uct_ib_device_nb_close_ctx **ctx_p) +{ + uct_ib_device_nb_close_ctx *ctx; + struct rlimit nofile; + ucs_status_t status; + int ret; + + ctx = ucs_calloc(1, sizeof(*ctx), "ibv cleanup ctx"); + if (ctx == NULL) { + ucs_error("cleanup context allocation failure"); + status = UCS_ERR_NO_MEMORY; + goto err; + } + + ret = getrlimit(RLIMIT_NOFILE, &nofile); + if (ret == 0) { + ctx->max_fds = nofile.rlim_cur; + } else { + ucs_warn("getrlimit(NOFILE) failed: %m"); + ctx->max_fds = 1024; + } + + ctx->buff_size = ucs_get_page_size() * 2; + ctx->cmd_fd = fd; + + status = ucs_mmap_alloc(&ctx->buff_size, &ctx->buff, 0, "ibv cleanup buff"); + if (status != UCS_OK) { + ucs_error("cleanup buffer allocation failed"); + goto err_alloc; + } + + ret = pipe(ctx->pipefds); + if (ret) { + ucs_error("cleanup pipe allocation failed: %m"); + status = UCS_ERR_IO_ERROR; + goto err_pipe; + } + + /* CLONE_VM - to keep pinned memory shared + * CLONE_SETTLS - to avoid corruption of parent's TLS + * SIGCHLD - will be sent to parent when child quit, required by waitpid + * buffer layout: tls goes up, stack goes down */ + ret = clone(uct_ib_device_cleanup_proc, + UCS_PTR_BYTE_OFFSET(ctx->buff, ctx->buff_size), + CLONE_VM|CLONE_SETTLS|SIGCHLD, ctx, NULL, ctx->buff, NULL); + + if (ret == -1) { + ucs_error("cleanup clone failed: %m"); + status = UCS_ERR_IO_ERROR; + goto err_clone; + } + + ctx->pid = ret; + *ctx_p = ctx; + close(ctx->pipefds[0]); + return UCS_OK; + +err_clone: + close(ctx->pipefds[0]); + close(ctx->pipefds[1]); +err_pipe: + 
ucs_mmap_free(ctx->buff, ctx->buff_size); +err_alloc: + ucs_free(ctx); +err: + return status; +} + +void uct_ib_device_free_nb_close_ctx(uct_ib_device_nb_close_ctx *ctx) +{ + if (ctx == NULL) { + return; + } + + close(ctx->pipefds[1]); + waitpid(ctx->pid, NULL, 0); + ucs_mmap_free(ctx->buff, ctx->buff_size); + ucs_free(ctx); +} + ucs_status_t uct_ib_device_init(uct_ib_device_t *dev, - struct ibv_device *ibv_device, int async_events + struct ibv_device *ibv_device, + int async_events, int nb_close UCS_STATS_ARG(ucs_stats_node_t *stats_parent)) { ucs_status_t status; + if (nb_close) { + status = uct_ib_device_init_nb_close_ctx(dev->ibv_context->cmd_fd, + &dev->nb_close_ctx); + if (status != UCS_OK) { + return status; + } + } + dev->async_events = async_events; uct_ib_device_get_locality(ibv_get_device_name(ibv_device), @@ -865,6 +980,8 @@ void uct_ib_device_cleanup(uct_ib_device_t *dev) { ucs_debug("destroying ib device %s", uct_ib_device_name(dev)); + uct_ib_device_free_nb_close_ctx(dev->nb_close_ctx); + if (kh_size(&dev->async_events_hash) != 0) { ucs_warn("async_events_hash not empty"); } diff --git a/src/uct/ib/base/ib_device.h b/src/uct/ib/base/ib_device.h index 8fcfcda1a77..e717c068d91 100644 --- a/src/uct/ib/base/ib_device.h +++ b/src/uct/ib/base/ib_device.h @@ -200,6 +200,19 @@ typedef struct { KHASH_TYPE(uct_ib_async_event, uct_ib_async_event_t, uct_ib_async_event_val_t); +/** + * Context for non-blocking device cleanup + */ +typedef struct { + int pipefds[2]; + int cmd_fd; + void *buff; + size_t buff_size; + pid_t pid; + int max_fds; +} uct_ib_device_nb_close_ctx; + + /** * IB device (corresponds to HCA) */ @@ -232,6 +245,7 @@ typedef struct uct_ib_device { /* Async event subscribers */ ucs_spinlock_t async_event_lock; khash_t(uct_ib_async_event) async_events_hash; + uct_ib_device_nb_close_ctx *nb_close_ctx; } uct_ib_device_t; @@ -280,7 +294,8 @@ ucs_status_t uct_ib_device_query(uct_ib_device_t *dev, struct ibv_device *ibv_device); ucs_status_t uct_ib_device_init(uct_ib_device_t *dev, - struct ibv_device *ibv_device, int async_events + struct ibv_device *ibv_device, int async_events, + int nb_close UCS_STATS_ARG(ucs_stats_node_t *stats_parent)); void uct_ib_device_cleanup(uct_ib_device_t *dev); diff --git a/src/uct/ib/base/ib_md.c b/src/uct/ib/base/ib_md.c index 1023f71e5e6..a88d13fb2c0 100644 --- a/src/uct/ib/base/ib_md.c +++ b/src/uct/ib/base/ib_md.c @@ -70,6 +70,10 @@ static ucs_config_field_t uct_ib_md_config_table[] = { "Initialize a fork-safe IB library with ibv_fork_init().", ucs_offsetof(uct_ib_md_config_t, fork_init), UCS_CONFIG_TYPE_TERNARY}, + {"CLEANUP_THREAD", "n", + "Cleanup device resources by a background process.", + ucs_offsetof(uct_ib_md_config_t, nb_close), UCS_CONFIG_TYPE_BOOL}, + {"ASYNC_EVENTS", "y", "Enable listening for async events on the device", ucs_offsetof(uct_ib_md_config_t, async_events), UCS_CONFIG_TYPE_BOOL}, @@ -1590,8 +1594,8 @@ ucs_status_t uct_ib_md_open_common(uct_ib_md_t *md, goto err; } - status = uct_ib_device_init(&md->dev, ib_device, md_config->async_events - UCS_STATS_ARG(md->stats)); + status = uct_ib_device_init(&md->dev, ib_device, md_config->async_events, + md_config->nb_close UCS_STATS_ARG(md->stats)); if (status != UCS_OK) { goto err_release_stats; } diff --git a/src/uct/ib/base/ib_md.h b/src/uct/ib/base/ib_md.h index 29f00a18af5..7b19e19b16c 100644 --- a/src/uct/ib/base/ib_md.h +++ b/src/uct/ib/base/ib_md.h @@ -153,6 +153,7 @@ typedef struct uct_ib_md_config { without using the cache */ unsigned fork_init; /**< Use ibv_fork_init() 
*/ int async_events; /**< Whether async events should be delivered */ + int nb_close; /**< Cleanup resources in background */ uct_ib_md_ext_config_t ext; /**< External configuration */ diff --git a/test/gtest/uct/ib/test_ib.cc b/test/gtest/uct/ib/test_ib.cc index 2928aa2acd0..813b350d23d 100644 --- a/test/gtest/uct/ib/test_ib.cc +++ b/test/gtest/uct/ib/test_ib.cc @@ -78,6 +78,16 @@ void test_uct_ib::send_recv_short() { size_t test_uct_ib::m_ib_am_handler_counter = 0; +#if defined(__x86_64__) +UCS_TEST_SKIP_COND_P(test_uct_ib, nb_close, RUNNING_ON_VALGRIND, + "CLEANUP_THREAD=y") +{ +} +#endif + +UCT_INSTANTIATE_IB_TEST_CASE(test_uct_ib); + + class test_uct_ib_addr : public test_uct_ib { public: uct_ib_iface_config_t *ib_config() { From 29bfad795edbf8f53977291907e3d029079c38d2 Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Fri, 17 Sep 2021 11:09:53 +0300 Subject: [PATCH 07/19] UCT/IB: Remove sighandlers in IB cleanup thread --- src/ucs/arch/aarch64/cpu.h | 1 + src/ucs/arch/ppc64/cpu.h | 1 + src/ucs/arch/x86_64/cpu.c | 18 ++++++++++++++++++ src/ucs/arch/x86_64/cpu.h | 2 ++ src/uct/ib/base/ib_device.c | 23 +++++++++++++++++++---- 5 files changed, 41 insertions(+), 4 deletions(-) diff --git a/src/ucs/arch/aarch64/cpu.h b/src/ucs/arch/aarch64/cpu.h index eda59830a8f..751438c7cc0 100644 --- a/src/ucs/arch/aarch64/cpu.h +++ b/src/ucs/arch/aarch64/cpu.h @@ -72,6 +72,7 @@ BEGIN_C_DECLS /* revert to glibc syscall */ #define ucs_syscall_raw syscall +#define ucs_syscall_raw4 syscall /* diff --git a/src/ucs/arch/ppc64/cpu.h b/src/ucs/arch/ppc64/cpu.h index 85f4f23c8aa..42769d7481f 100644 --- a/src/ucs/arch/ppc64/cpu.h +++ b/src/ucs/arch/ppc64/cpu.h @@ -41,6 +41,7 @@ BEGIN_C_DECLS /* revert to glibc syscall */ #define ucs_syscall_raw syscall +#define ucs_syscall_raw4 syscall static inline uint64_t ucs_arch_read_hres_clock() diff --git a/src/ucs/arch/x86_64/cpu.c b/src/ucs/arch/x86_64/cpu.c index 90d64870791..cd9a7cb7c18 100644 --- a/src/ucs/arch/x86_64/cpu.c +++ b/src/ucs/arch/x86_64/cpu.c @@ -772,4 +772,22 @@ int ucs_syscall_raw(unsigned long num, unsigned long arg1, unsigned long arg2, return ret; } +int ucs_syscall_raw4(unsigned long num, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4) +{ + int ret; + + asm volatile ( + "movq %1, %%rax\n\t" + "movq %2, %%rdi\n\t" + "movq %3, %%rsi\n\t" + "movq %4, %%rdx\n\t" + "movq %5, %%r10\n\t" + "syscall\n\t" + :"=a"(ret) + :"r"(num), "r"(arg1), "r"(arg2), "r"(arg3), "r"(arg4)); + + return ret; +} + #endif diff --git a/src/ucs/arch/x86_64/cpu.h b/src/ucs/arch/x86_64/cpu.h index bdab8c22853..e32150a1d92 100644 --- a/src/ucs/arch/x86_64/cpu.h +++ b/src/ucs/arch/x86_64/cpu.h @@ -54,6 +54,8 @@ ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes); void ucs_x86_memcpy_sse_movntdqa(void *dst, const void *src, size_t len); int ucs_syscall_raw(unsigned long num, unsigned long arg1, unsigned long arg2, unsigned long arg3); +int ucs_syscall_raw4(unsigned long num, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4); static UCS_F_ALWAYS_INLINE int ucs_arch_x86_rdtsc_enabled() diff --git a/src/uct/ib/base/ib_device.c b/src/uct/ib/base/ib_device.c index 3d8621c04b5..c2803c60e08 100644 --- a/src/uct/ib/base/ib_device.c +++ b/src/uct/ib/base/ib_device.c @@ -807,19 +807,34 @@ ucs_status_t uct_ib_device_query(uct_ib_device_t *dev, return UCS_OK; } +/* Based on include/linux/signal.h from kernel source */ +struct uct_ib_sigaction { + void *uapi_sa_handler; + uint64_t uapi_sa_flags; + void *uapi_sa_restorer; + 
uint64_t uapi_sa_mask; + uint64_t pad; /* glibc compatibility */ +}; + static int uct_ib_device_cleanup_proc(void* arg) { uct_ib_device_nb_close_ctx *ctx = arg; static const char *process_name = "ucx_cleanup"; + struct uct_ib_sigaction dfl = { SIG_DFL }; char dummy; - int fd; + int i; + + for (i = 1; i <= SIGUSR2; i++) { + ucs_syscall_raw4(SYS_rt_sigaction, i, (long)&dfl, 0, + sizeof(dfl.uapi_sa_mask)); + } /* Since TLS of this thread is uninitialized avoid using glibc */ ucs_syscall_raw(SYS_prctl, PR_SET_NAME, (long)process_name, 0); - for (fd = 0; fd < ctx->max_fds; fd++) { - if ((fd != ctx->cmd_fd) && (fd != ctx->pipefds[0])) { - ucs_syscall_raw(SYS_close, fd, 0, 0); + for (i = 0; i < ctx->max_fds; i++) { + if ((i != ctx->cmd_fd) && (i != ctx->pipefds[0])) { + ucs_syscall_raw(SYS_close, i, 0, 0); } } From 729030e0a0e8866672bf26543d8f194901ceba70 Mon Sep 17 00:00:00 2001 From: Evgeny Leksikov Date: Tue, 11 Jan 2022 11:44:02 +0200 Subject: [PATCH 08/19] UCT/IB: handle IBV_EVENT_DEVICE_FATAL --- src/uct/ib/base/ib_device.c | 43 ++++++++++++++++++++++++++--------- src/uct/ib/base/ib_device.h | 1 + src/uct/ib/ud/accel/ud_mlx5.c | 7 ++++++ 3 files changed, 40 insertions(+), 11 deletions(-) diff --git a/src/uct/ib/base/ib_device.c b/src/uct/ib/base/ib_device.c index c2803c60e08..01139f8ad2d 100644 --- a/src/uct/ib/base/ib_device.c +++ b/src/uct/ib/base/ib_device.c @@ -284,22 +284,42 @@ uct_ib_device_async_event_schedule_callback(uct_ib_device_t *dev, wait_ctx, 0); } +static void +uct_ib_device_async_event_dispatch_nolock(uct_ib_device_t *dev, + const uct_ib_async_event_t *event) +{ + khiter_t iter = kh_get(uct_ib_async_event, &dev->async_events_hash, *event); + uct_ib_async_event_val_t *entry; + + if (iter == kh_end(&dev->async_events_hash)) { + return; + } + + entry = &kh_value(&dev->async_events_hash, iter); + entry->fired = 1; + if (entry->wait_ctx != NULL) { + uct_ib_device_async_event_schedule_callback(dev, entry->wait_ctx); + } +} + static void uct_ib_device_async_event_dispatch(uct_ib_device_t *dev, const uct_ib_async_event_t *event) { - uct_ib_async_event_val_t *entry; - khiter_t iter; + ucs_spin_lock(&dev->async_event_lock); + uct_ib_device_async_event_dispatch_nolock(dev, event); + ucs_spin_unlock(&dev->async_event_lock); +} + +static void +uct_ib_device_async_event_dispatch_fatal(uct_ib_device_t *dev) +{ + uct_ib_async_event_t event; ucs_spin_lock(&dev->async_event_lock); - iter = kh_get(uct_ib_async_event, &dev->async_events_hash, *event); - if (iter != kh_end(&dev->async_events_hash)) { - entry = &kh_value(&dev->async_events_hash, iter); - entry->fired = 1; - if (entry->wait_ctx != NULL) { - uct_ib_device_async_event_schedule_callback(dev, entry->wait_ctx); - } - } + dev->flags |= UCT_IB_DEVICE_FAILED; + kh_foreach_key(&dev->async_events_hash, event, + uct_ib_device_async_event_dispatch_nolock(dev, &event)); ucs_spin_unlock(&dev->async_event_lock); } @@ -509,9 +529,10 @@ void uct_ib_handle_async_event(uct_ib_device_t *dev, uct_ib_async_event_t *event level = UCS_LOG_LEVEL_DEBUG; break; case IBV_EVENT_DEVICE_FATAL: + uct_ib_device_async_event_dispatch_fatal(dev); snprintf(event_info, sizeof(event_info), "%s on port %d", ibv_event_type_str(event->event_type), event->port_num); - level = UCS_LOG_LEVEL_ERROR; + level = UCS_LOG_LEVEL_DIAG; break; case IBV_EVENT_PORT_ACTIVE: case IBV_EVENT_PORT_ERR: diff --git a/src/uct/ib/base/ib_device.h b/src/uct/ib/base/ib_device.h index e717c068d91..88357c3fd2b 100644 --- a/src/uct/ib/base/ib_device.h +++ b/src/uct/ib/base/ib_device.h @@ -88,6 
+88,7 @@ enum { UCT_IB_DEVICE_FLAG_DC = UCT_IB_DEVICE_FLAG_DC_V1 | UCT_IB_DEVICE_FLAG_DC_V2, /* Device supports DC */ UCT_IB_DEVICE_FLAG_ODP_IMPLICIT = UCS_BIT(9), + UCT_IB_DEVICE_FAILED = UCS_BIT(10) /* Got fatal error */ }; diff --git a/src/uct/ib/ud/accel/ud_mlx5.c b/src/uct/ib/ud/accel/ud_mlx5.c index 823c383009a..9364eeb410a 100644 --- a/src/uct/ib/ud/accel/ud_mlx5.c +++ b/src/uct/ib/ud/accel/ud_mlx5.c @@ -697,6 +697,13 @@ static ucs_status_t uct_ud_mlx5_iface_arm_cq(uct_ib_iface_t *ib_iface, int solicited) { uct_ud_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_ud_mlx5_iface_t); + uct_ib_mlx5_md_t *ib_md = ucs_derived_of(ib_iface->super.md, + uct_ib_mlx5_md_t); + + if (ucs_unlikely(ib_md->super.dev.flags & UCT_IB_DEVICE_FAILED)) { + return UCS_OK; + } + #if HAVE_DECL_MLX5DV_INIT_OBJ return uct_ib_mlx5dv_arm_cq(&iface->cq[dir], solicited); #else From 71fb4c9578ec742ba2dac7e15d6aee419ba9b3c2 Mon Sep 17 00:00:00 2001 From: Anatoly Vildemanov Date: Mon, 11 Apr 2022 13:17:58 +0300 Subject: [PATCH 09/19] AZP: Create release pipeline --- buildlib/azure-pipelines-release.yml | 102 +++++++++------------------ 1 file changed, 34 insertions(+), 68 deletions(-) diff --git a/buildlib/azure-pipelines-release.yml b/buildlib/azure-pipelines-release.yml index cbe9d2ed195..58b4c64f8a0 100644 --- a/buildlib/azure-pipelines-release.yml +++ b/buildlib/azure-pipelines-release.yml @@ -5,52 +5,27 @@ trigger: tags: include: - v* -pr: - - master - - v*.*.x resources: containers: - - container: centos7_cuda11 - image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos7-mofed5-cuda11:2 - - container: centos8_cuda11 - image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos8-mofed5-cuda11:2 - - container: ubuntu16_cuda11 - image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu16.04-mofed5-cuda11:3 - - container: ubuntu18_cuda11 - image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu18.04-mofed5-cuda11:3 - - container: ubuntu20_cuda11 - image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu20.04-mofed5-cuda11:3 + - container: centos7 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos7:4 -stages: - - stage: Prepare - jobs: - - job: CheckRelease - pool: - name: MLNX - demands: - - ucx_docker -equals yes - steps: - - checkout: self - fetchDepth: 100 - clean: true - - - bash: | - set -eE - source ./buildlib/az-helpers.sh - set -x - check_release_build $(Build.Reason) $(Build.SourceVersion) "AZP/RELEASE: " - name: Result - displayName: Check build condition +stages: # Create an empty draft to avoid race condition in distro releases - - stage: GitHubDraft - condition: eq(dependencies.Prepare.outputs['CheckRelease.Result.Launch'], 'True') - dependsOn: Prepare + - stage: GitHubRelease + variables: + ${{ if eq(variables['Build.Reason'], 'ResourceTrigger') }}: + artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/heads/', '') }}-centos7-mofed5.tar.bz2 + ${{ if eq(variables['Build.Reason'], 'IndividualCI') }}: + artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-centos7-mofed5.tar.bz2 + ${{ if eq(variables['Build.Reason'], 'PullRequest') }}: + artifact_name: ucx-pr$(System.PullRequest.PullRequestNumber)-centos7-mofed5.tar.bz2 jobs: - - job: DraftRelease - displayName: Create draft release - container: centos7_cuda11 + - job: Release + displayName: Create release + container: centos7 pool: name: MLNX demands: @@ -61,44 +36,35 @@ stages: fetchDepth: 100 path: "we/need/to/go/deeper" - - bash: ./autogen.sh - displayName: Setup autotools + - bash: | + set -eEx + ./autogen.sh + mkdir pkg-build + cd pkg-build + 
../contrib/configure-release --with-java=no + displayName: Configure - bash: | - set -eE - gcc --version - ./contrib/configure-release --with-java=no - ./contrib/buildrpm.sh -s -t -b - displayName: Build tarball + set -eEx + cd pkg-build + ../contrib/buildrpm.sh -s -t -b --noclean + cd rpm-dist/`uname -m` + tar -cjf "../../../${AZ_ARTIFACT_NAME}" *.rpm + cd ../../.. + tar -tjf "${AZ_ARTIFACT_NAME}" + displayName: Build RPM package + env: + AZ_ARTIFACT_NAME: $(artifact_name) - task: GithubRelease@0 condition: eq(variables['Build.Reason'], 'IndividualCI') - displayName: Create/edit GitHub Draft Release + displayName: Upload artifacts inputs: githubConnection: release - repositoryName: openucx/ucx + repositoryName: mellanox/ucx action: edit tag: $(Build.SourceBranchName) isDraft: true addChangeLog: false - releaseNotesSource: file - releaseNotesFile: NEWS assetUploadMode: replace - assets: | - ./ucx-*.tar.gz - ./rpm-dist/ucx-*.src.rpm - - - stage: Build - displayName: Build binary packages - dependsOn: - - Prepare - - GitHubDraft - condition: eq(dependencies.Prepare.outputs['CheckRelease.Result.Launch'], 'True') - jobs: - - template: az-distro-release.yml - - template: jucx/jucx-publish.yml - parameters: - ${{ if eq(variables['Build.Reason'], 'IndividualCI') }}: - target: publish-release - ${{ if eq(variables['Build.Reason'], 'PullRequest') }}: - target: package + assets: "./$(artifact_name)" From a200b9a457803fce3c3639acfc889a558db04b59 Mon Sep 17 00:00:00 2001 From: Anatoly Vildemanov Date: Mon, 11 Apr 2022 14:18:54 +0300 Subject: [PATCH 10/19] AZP: Create release pipeline --- buildlib/azure-pipelines-release.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/buildlib/azure-pipelines-release.yml b/buildlib/azure-pipelines-release.yml index 58b4c64f8a0..a932e1f1396 100644 --- a/buildlib/azure-pipelines-release.yml +++ b/buildlib/azure-pipelines-release.yml @@ -20,8 +20,6 @@ stages: artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/heads/', '') }}-centos7-mofed5.tar.bz2 ${{ if eq(variables['Build.Reason'], 'IndividualCI') }}: artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-centos7-mofed5.tar.bz2 - ${{ if eq(variables['Build.Reason'], 'PullRequest') }}: - artifact_name: ucx-pr$(System.PullRequest.PullRequestNumber)-centos7-mofed5.tar.bz2 jobs: - job: Release displayName: Create release From 7c4d0f6429139390c0657d5958fa2b5d71859e04 Mon Sep 17 00:00:00 2001 From: Anatoly Vildemanov Date: Wed, 19 May 2021 12:51:50 +0300 Subject: [PATCH 11/19] AZP/CI: Auto-rebase on openucx/ucx v1.13.x branch + Fix name of branch + Trigger for push and pr + Clean workspace before use --- buildlib/azure-pipelines-int4.yml | 50 +++++++++++++++++++++++++++++++ buildlib/azure-pipelines-pr.yml | 9 ++++-- 2 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 buildlib/azure-pipelines-int4.yml diff --git a/buildlib/azure-pipelines-int4.yml b/buildlib/azure-pipelines-int4.yml new file mode 100644 index 00000000000..a146a50fbff --- /dev/null +++ b/buildlib/azure-pipelines-int4.yml @@ -0,0 +1,50 @@ +# See https://aka.ms/yaml + +trigger: none +pr: none + +resources: + pipelines: + - pipeline: myTest + source: 'UCX snapshot' + trigger: true + +stages: + - stage: Rebase + variables: + rebase_branch: v1.13.x + + jobs: + - job: rebase + pool: + name: MLNX + demands: + - ucx_docker -equals yes + displayName: rebase on openucx/ucx + steps: + - checkout: self + clean: true + fetchDepth: 200 + - bash: | + set -eEx + source buildlib/az-helpers.sh + # Checkout 
integration4 branch from Mellanox/ucx + git remote set-url origin git@github.com:Mellanox/ucx.git + git fetch origin integration4 + git checkout integration4 + # Checkout $(rebase_branch) branch from openucx/ucx + git remote add upstream https://github.com/openucx/ucx.git + git fetch upstream $(rebase_branch) + git log --oneline -10 upstream/$(rebase_branch) + # Rebase integration4 branch on $(rebase_branch) branch + if ! git rebase upstream/$(rebase_branch) + then + # Automatic rebase failed - show merge conflicts + git status + git diff + head=$(git rev-parse --short HEAD) + azure_log_issue "Rebase on ${head} failed, see https://github.com/Mellanox/ucx/wiki/Manual-rebase-of-integration4-branch for details" + else + # Automatic rebase was successful - update the branch + git push origin HEAD --force + fi diff --git a/buildlib/azure-pipelines-pr.yml b/buildlib/azure-pipelines-pr.yml index 51dd8d7e82f..4d1fefa37ab 100644 --- a/buildlib/azure-pipelines-pr.yml +++ b/buildlib/azure-pipelines-pr.yml @@ -1,12 +1,15 @@ # See https://aka.ms/yaml # This pipeline to be run on PRs -trigger: none +trigger: + batch: true + branches: + include: + - integration4 pr: branches: include: - - master - - v*.*.x + - integration4 paths: exclude: - .gitignore From ba3e65ca4582b66b3cdeefb6029a8ee45092f17d Mon Sep 17 00:00:00 2001 From: Evgeny Leksikov Date: Tue, 10 Aug 2021 16:52:51 +0300 Subject: [PATCH 12/19] UCT/MLX5: disable HW TM --- src/uct/ib/rc/accel/rc_mlx5_iface.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/uct/ib/rc/accel/rc_mlx5_iface.c b/src/uct/ib/rc/accel/rc_mlx5_iface.c index f166a805788..421a0a26db6 100644 --- a/src/uct/ib/rc/accel/rc_mlx5_iface.c +++ b/src/uct/ib/rc/accel/rc_mlx5_iface.c @@ -469,8 +469,9 @@ static ucs_status_t uct_rc_mlx5_iface_preinit(uct_rc_mlx5_iface_common_t *iface, UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_CB | UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_CB); + /* NOTE: always disable for wire compatibility */ iface->tm.enabled = mlx5_config->tm.enable && tm_params && - (init_attr->flags & UCT_IB_TM_SUPPORTED); + (init_attr->flags & UCT_IB_TM_SUPPORTED) && 0; if (!iface->tm.enabled) { goto out_tm_disabled; } From e12239f3c05257d1651eb8659b754f5eca382478 Mon Sep 17 00:00:00 2001 From: Evgeny Leksikov Date: Tue, 3 Aug 2021 16:37:06 +0300 Subject: [PATCH 13/19] UCP/TAG: add eager compatibility --- src/ucp/core/ucp_ep.c | 113 ++++++++++++++++++++++++++++++++++++- src/ucp/core/ucp_request.h | 1 + src/ucp/core/ucp_worker.c | 5 +- src/ucp/proto/proto_am.inl | 12 ---- src/ucp/rndv/rndv.h | 3 + src/ucp/rndv/rndv.inl | 3 +- src/ucp/tag/eager.h | 37 ++++++++++++ src/ucp/tag/eager_multi.c | 2 + src/ucp/tag/eager_rcv.c | 26 +++++++-- src/ucp/tag/eager_single.c | 19 ++++--- src/ucp/tag/eager_snd.c | 22 ++++++-- src/ucp/tag/tag_match.c | 3 +- src/ucp/tag/tag_match.h | 2 +- src/ucp/tag/tag_match.inl | 7 ++- src/ucp/tag/tag_recv.c | 8 ++- src/ucp/tag/tag_rndv.c | 72 ++++++++++++++++++++++- src/ucp/tag/tag_send.c | 5 +- 17 files changed, 294 insertions(+), 46 deletions(-) diff --git a/src/ucp/core/ucp_ep.c b/src/ucp/core/ucp_ep.c index 8513e55233b..abd86b37420 100644 --- a/src/ucp/core/ucp_ep.c +++ b/src/ucp/core/ucp_ep.c @@ -21,8 +21,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -333,6 +335,9 @@ ucs_status_t ucp_ep_create_base(ucp_worker_h worker, unsigned ep_init_flags, goto err_ep_deallocate; } + ucs_debug("ep %p: local id %"PRIuPTR, ep, + ucp_ep_ext_control(ep)->local_ep_id); + ucp_ep_flush_state_reset(ep); /* Create 
endpoint VFS node on demand to avoid memory bloat */ @@ -494,6 +499,9 @@ void ucp_ep_release_id(ucp_ep_h ep) { ucs_status_t status; + ucs_debug("ep %p: released id %"PRIuPTR, ep, + ucp_ep_ext_control(ep)->local_ep_id); + /* Don't use ucp_ep_local_id() function here to avoid assertion failure, * because local_ep_id can be set to @ref UCS_PTR_MAP_KEY_INVALID */ status = UCS_PTR_MAP_DEL(ep, &ep->worker->ep_map, @@ -1235,6 +1243,7 @@ static void ucp_ep_set_lanes_failed(ucp_ep_h ep, uct_ep_h *uct_eps) uct_ep_h uct_ep; ucp_ep_check_lanes(ep); + /* release id on failed EP to drop data by invalid ID */ ucp_ep_release_id(ep); ucp_ep_update_flags(ep, UCP_EP_FLAG_FAILED, UCP_EP_FLAG_LOCAL_CONNECTED); @@ -1344,11 +1353,93 @@ static void ucp_ep_discard_lanes(ucp_ep_h ep, ucs_status_t discard_status) ucp_ep_discard_lanes_callback(NULL, UCS_OK, discard_arg); } +static void ucp_tm_ep_cleanup(ucp_ep_h ep, ucs_ptr_map_key_t ep_id, + ucs_status_t status) +{ + ucp_tag_match_t *tm = &ep->worker->tm; + const ucp_rndv_rts_hdr_t *rndv_rts_hdr; + const ucp_eager_middle_hdr_t *eager_mid_hdr; + const ucp_eager_hdr_t *eager_hdr; + ucp_recv_desc_t *rdesc, *tmp; + ucp_tag_frag_match_t *matchq; + ucp_request_t *rreq; + uint64_t msg_id; + khiter_t iter; + ucs_debug("cleanup ep %p", ep); + + if (!(ep_id & UCS_PTR_MAP_KEY_INDIRECT_FLAG)) { + return; + } + + if (ep_id == UCS_PTR_MAP_KEY_INVALID) { + ucs_assert(ep->flags & UCP_EP_FLAG_FAILED); + return; + } + + /* remove from unexpected queue */ + ucs_list_for_each_safe(rdesc, tmp, &tm->unexpected.all, + tag_list[UCP_RDESC_ALL_LIST]) { + if (rdesc->flags & UCP_RECV_DESC_FLAG_RNDV) { + rndv_rts_hdr = (const void*)(rdesc + 1); + if (rndv_rts_hdr->sreq.ep_id != ep_id) { + /* rndv not matched */ + continue; + } + } else { + eager_hdr = (const void*)(rdesc + 1); + if (eager_hdr->ep_id != ep_id) { + /* eager not matched */ + continue; + } + } + + ucs_debug("ep %p: ep_id %"PRIuPTR" releasing unexpected rdesc %p", ep, + ep_id, rdesc); + ucp_tag_unexp_remove(rdesc); + ucp_recv_desc_release(rdesc); + } + + /* remove from fragments hash */ + kh_foreach_key(&tm->frag_hash, msg_id, { + iter = kh_get(ucp_tag_frag_hash, &tm->frag_hash, msg_id); + matchq = &kh_val(&tm->frag_hash, iter); + if (!ucp_tag_frag_match_is_unexp(matchq)) { + /* remove receive request from expected hash */ + rreq = matchq->exp_req; + if (rreq->recv.tag.ep_id != ep_id) { + continue; + } + + ucs_debug("ep %p: ep_id %"PRIuPTR" completing expected receive request %p with status %s", + ep, ep_id, rreq, ucs_status_string(status)); + ucp_request_complete_tag_recv(rreq, status); + } else { + /* remove receive fragments from unexpected matchq */ + rdesc = ucs_queue_head_elem_non_empty(&matchq->unexp_q, ucp_recv_desc_t, + tag_frag_queue); + ucs_assert(!(rdesc->flags & UCP_RECV_DESC_FLAG_RNDV)); + eager_mid_hdr = (void*)(rdesc + 1); + if (eager_mid_hdr->ep_id != ep_id) { + continue; + } + + ucs_queue_for_each_extract(rdesc, &matchq->unexp_q, tag_frag_queue, 1) { + ucs_debug("ep %p: ep_id %"PRIuPTR" releasing unexpected rdesc %p", + ep, ep_id, rdesc); + ucp_recv_desc_release(rdesc); + } + } + + kh_del(ucp_tag_frag_hash, &tm->frag_hash, iter); + }); +} + ucs_status_t ucp_ep_set_failed(ucp_ep_h ucp_ep, ucp_lane_index_t lane, ucs_status_t status) { UCS_STRING_BUFFER_ONSTACK(lane_info_strb, 64); ucp_ep_ext_control_t *ep_ext_control = ucp_ep_ext_control(ucp_ep); + ucs_ptr_map_key_t ep_id; ucp_err_handling_mode_t err_mode; ucs_log_level_t log_level; ucp_request_t *close_req; @@ -1373,9 +1464,14 @@ ucp_ep_set_failed(ucp_ep_h ucp_ep, 
ucp_lane_index_t lane, ucs_status_t status) ++ucp_ep->worker->counters.ep_failures; + /* Store local_ep_id because @ref ucp_ep_discard_lanes invalidates it, + * invalidated local ID is used to drop data on failed EP */ + ep_id = ucp_ep_local_id(ucp_ep); + /* The EP can be closed from last completion callback */ ucp_ep_discard_lanes(ucp_ep, status); ucp_stream_ep_cleanup(ucp_ep, status); + ucp_tm_ep_cleanup(ucp_ep, ep_id, status); if (ucp_ep->flags & UCP_EP_FLAG_USED) { if (ucp_ep->flags & UCP_EP_FLAG_CLOSED) { @@ -1487,6 +1583,8 @@ void ucp_ep_disconnected(ucp_ep_h ep, int force) ucp_stream_ep_cleanup(ep, UCS_ERR_CANCELED); ucp_am_ep_cleanup(ep); + ucp_tm_ep_cleanup(ep, ucp_ep_ext_control(ep)->local_ep_id, + UCS_ERR_CANCELED); ucp_ep_update_flags(ep, 0, UCP_EP_FLAG_USED); @@ -1619,6 +1717,8 @@ ucs_status_ptr_t ucp_ep_close_nbx(ucp_ep_h ep, const ucp_request_param_t *param) ucp_ep_update_flags(ep, UCP_EP_FLAG_CLOSED, 0); if (ucp_request_param_flags(param) & UCP_EP_CLOSE_FLAG_FORCE) { + ucp_tm_ep_cleanup(ep, ucp_ep_ext_control(ep)->local_ep_id, + UCS_ERR_CANCELED); ucp_ep_discard_lanes(ep, UCS_ERR_CANCELED); ucp_ep_disconnected(ep, 1); } else { @@ -2335,7 +2435,7 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, size_t max_rndv_thresh, max_am_rndv_thresh; size_t min_rndv_thresh, min_am_rndv_thresh; size_t rma_zcopy_thresh; - size_t am_max_eager_short; + ssize_t am_max_eager_short; double get_zcopy_max_bw[UCS_MEMORY_TYPE_LAST]; double put_zcopy_max_bw[UCS_MEMORY_TYPE_LAST]; ucs_status_t status; @@ -2569,6 +2669,9 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, ucp_ep_config_set_memtype_thresh(&config->tag.offload.max_eager_short, config->tag.eager.max_short, context->num_mem_type_detect_mds); + /* TAG offload is disabled for compatibility reasons */ + ucs_assert(config->tag.offload.max_eager_short.memtype_on < 0); + ucs_assert(config->tag.offload.max_eager_short.memtype_off < 0); } } @@ -2641,7 +2744,13 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, /* TODO: set threshold level based on all available lanes */ config->tag.eager = config->am; - config->tag.eager.max_short = am_max_eager_short; + /* short_iov which is used for compatibility path does not have + * header, payload can be 8 bytes larger but we add ep_id to header, + * so 8 - 16 = -8 bytes*/ + config->tag.eager.max_short = (am_max_eager_short < + (ssize_t)sizeof(uint64_t)) ? 
-1 : + (am_max_eager_short - + sizeof(uint64_t)); config->tag.lane = lane; config->tag.rndv.am_thresh = config->rndv.am_thresh; config->tag.rndv.rma_thresh = config->rndv.rma_thresh; diff --git a/src/ucp/core/ucp_request.h b/src/ucp/core/ucp_request.h index 7d779d9feb8..34fab9dbca8 100644 --- a/src/ucp/core/ucp_request.h +++ b/src/ucp/core/ucp_request.h @@ -407,6 +407,7 @@ struct ucp_request { ucp_tag_t tag; /* Expected tag */ ucp_tag_t tag_mask; /* Expected tag mask */ uint64_t sn; /* Tag match sequence */ + ucs_ptr_map_key_t ep_id; /* Endpoint local id */ ucp_tag_recv_nbx_callback_t cb; /* Completion callback */ ucp_tag_recv_info_t info; /* Completion info to fill */ diff --git a/src/ucp/core/ucp_worker.c b/src/ucp/core/ucp_worker.c index c23196e470c..e44ee1114f4 100644 --- a/src/ucp/core/ucp_worker.c +++ b/src/ucp/core/ucp_worker.c @@ -1229,8 +1229,11 @@ ucs_status_t ucp_worker_iface_open(ucp_worker_h worker, ucp_rsc_index_t tl_id, */ UCS_STATIC_ASSERT(UCP_WORKER_HEADROOM_PRIV_SIZE >= sizeof(ucp_eager_sync_hdr_t)); + + /* HW TM is disabled due to compatibility reason */ UCS_STATIC_ASSERT(UCP_WORKER_HEADROOM_PRIV_SIZE >= - sizeof(ucp_offload_first_desc_t)); + (sizeof(ucp_offload_first_desc_t) - + sizeof(uint64_t) /* ep_id */)); /* Fill rest of uct_iface params (caller should fill specific mode fields) */ iface_params->field_mask |= UCT_IFACE_PARAM_FIELD_STATS_ROOT | diff --git a/src/ucp/proto/proto_am.inl b/src/ucp/proto/proto_am.inl index 74be6e9d6e4..26ed42f13d3 100644 --- a/src/ucp/proto/proto_am.inl +++ b/src/ucp/proto/proto_am.inl @@ -24,18 +24,6 @@ typedef void (*ucp_req_complete_func_t)(ucp_request_t *req, ucs_status_t status); -static UCS_F_ALWAYS_INLINE void -ucp_add_uct_iov_elem(uct_iov_t *iov, void *buffer, size_t length, - uct_mem_h memh, size_t *iov_cnt) -{ - iov[*iov_cnt].buffer = buffer; - iov[*iov_cnt].length = length; - iov[*iov_cnt].count = 1; - iov[*iov_cnt].stride = 0; - iov[*iov_cnt].memh = memh; - ++(*iov_cnt); -} - static UCS_F_ALWAYS_INLINE ucs_status_t ucp_do_am_bcopy_single(uct_pending_req_t *self, uint8_t am_id, uct_pack_callback_t pack_cb) diff --git a/src/ucp/rndv/rndv.h b/src/ucp/rndv/rndv.h index ccc55ee0ea9..b44a63a7090 100644 --- a/src/ucp/rndv/rndv.h +++ b/src/ucp/rndv/rndv.h @@ -16,6 +16,9 @@ typedef enum { /* RNDV TAG operation with status UCS_OK (kept for wire compatibility with * the previous UCP versions) */ UCP_RNDV_RTS_TAG_OK = UCS_OK, + /* RNDV TAG operation with status UCS_ERR_CANCELED (kept for wire + * compatibility with the previous UCP versions) */ + UCP_RNDV_RTS_TAG_CANCELED = (uint8_t)UCS_ERR_CANCELED, /* RNDV AM operation */ UCP_RNDV_RTS_AM = 1 } UCS_S_PACKED ucp_rndv_rts_opcode_t; diff --git a/src/ucp/rndv/rndv.inl b/src/ucp/rndv/rndv.inl index e3f4f6e198f..1588e96e658 100644 --- a/src/ucp/rndv/rndv.inl +++ b/src/ucp/rndv/rndv.inl @@ -20,7 +20,8 @@ ucp_rndv_rts_is_am(const ucp_rndv_rts_hdr_t *rts_hdr) static UCS_F_ALWAYS_INLINE int ucp_rndv_rts_is_tag(const ucp_rndv_rts_hdr_t *rts_hdr) { - return rts_hdr->opcode == UCP_RNDV_RTS_TAG_OK; + return (rts_hdr->opcode == UCP_RNDV_RTS_TAG_OK) || + (rts_hdr->opcode == UCP_RNDV_RTS_TAG_CANCELED); } static UCS_F_ALWAYS_INLINE void diff --git a/src/ucp/tag/eager.h b/src/ucp/tag/eager.h index 61267073149..9c38ea7f0ac 100644 --- a/src/ucp/tag/eager.h +++ b/src/ucp/tag/eager.h @@ -27,6 +27,7 @@ */ typedef struct { ucp_tag_hdr_t super; + uint64_t ep_id; } UCS_S_PACKED ucp_eager_hdr_t; @@ -45,6 +46,7 @@ typedef struct { */ typedef struct { uint64_t msg_id; + uint64_t ep_id; size_t offset; } UCS_S_PACKED 
ucp_eager_middle_hdr_t; @@ -95,4 +97,39 @@ ucp_proto_eager_check_op_id(const ucp_proto_init_params_t *init_params, ucp_ep_config_key_has_tag_lane(init_params->ep_config_key)); } +static UCS_F_ALWAYS_INLINE void +ucp_add_uct_iov_elem(uct_iov_t *iov, void *buffer, size_t length, + uct_mem_h memh, size_t *iov_cnt) +{ + iov[*iov_cnt].buffer = buffer; + iov[*iov_cnt].length = length; + iov[*iov_cnt].count = 1; + iov[*iov_cnt].stride = 0; + iov[*iov_cnt].memh = memh; + ++(*iov_cnt); +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_tag_send_am_short_iov(uct_ep_h ep, ucs_ptr_map_key_t remote_id, + const void *buffer, size_t length, ucp_tag_t tag) +{ + size_t iov_cnt = 0ul; + ucp_eager_hdr_t hdr = { .super.tag = tag, + .ep_id = remote_id }; + uct_iov_t iov[2]; + + ucp_add_uct_iov_elem(iov, &hdr, sizeof(hdr), UCT_MEM_HANDLE_NULL, &iov_cnt); + ucp_add_uct_iov_elem(iov, (void*)buffer, length, UCT_MEM_HANDLE_NULL, + &iov_cnt); + return uct_ep_am_short_iov(ep, UCP_AM_ID_EAGER_ONLY, iov, iov_cnt); +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_ep_tag_send_am_short_iov(ucp_ep_h ep, const void *buffer, size_t length, + ucp_tag_t tag) +{ + return ucp_tag_send_am_short_iov(ucp_ep_get_am_uct_ep(ep), + ucp_ep_remote_id(ep), buffer, length, tag); +} + #endif diff --git a/src/ucp/tag/eager_multi.c b/src/ucp/tag/eager_multi.c index 44088cb4a68..99506d52f9e 100644 --- a/src/ucp/tag/eager_multi.c +++ b/src/ucp/tag/eager_multi.c @@ -18,6 +18,7 @@ static UCS_F_ALWAYS_INLINE void ucp_proto_eager_set_first_hdr(ucp_request_t *req, ucp_eager_first_hdr_t *hdr) { hdr->super.super.tag = req->send.msg_proto.tag; + hdr->super.ep_id = ucp_send_request_get_ep_remote_id(req); hdr->total_len = req->send.state.dt_iter.length; hdr->msg_id = req->send.msg_proto.message_id; } @@ -26,6 +27,7 @@ static UCS_F_ALWAYS_INLINE void ucp_proto_eager_set_middle_hdr(ucp_request_t *req, ucp_eager_middle_hdr_t *hdr) { hdr->msg_id = req->send.msg_proto.message_id; + hdr->ep_id = ucp_send_request_get_ep_remote_id(req); hdr->offset = req->send.state.dt_iter.offset; } diff --git a/src/ucp/tag/eager_rcv.c b/src/ucp/tag/eager_rcv.c index 99b1e148bac..468db9b4339 100644 --- a/src/ucp/tag/eager_rcv.c +++ b/src/ucp/tag/eager_rcv.c @@ -65,7 +65,8 @@ ucp_eager_offload_handler(void *arg, void *data, size_t length, if (!UCS_STATUS_IS_ERR(status)) { rdesc_hdr = (ucp_tag_t*)(rdesc + 1); *rdesc_hdr = recv_tag; - ucp_tag_unexp_recv(&worker->tm, rdesc, recv_tag); + ucp_tag_unexp_recv(&worker->tm, rdesc, recv_tag, + UCS_PTR_MAP_KEY_INVALID); } } @@ -89,6 +90,8 @@ ucp_eager_tagged_handler(void *arg, void *data, size_t length, unsigned am_flags ucp_recv_desc_t *rdesc; ucp_request_t *req; ucs_status_t status; + ucp_ep_h ep UCS_V_UNUSED; + req = ucp_tag_exp_search(&worker->tm, recv_tag); if (req != NULL) { @@ -117,17 +120,24 @@ ucp_eager_tagged_handler(void *arg, void *data, size_t length, unsigned am_flags 0); if (status == UCS_INPROGRESS) { ucp_tag_frag_list_process_queue( - &worker->tm, req, eagerf_hdr->msg_id + &worker->tm, req, eagerf_hdr->msg_id, + eagerf_hdr->super.ep_id UCS_STATS_ARG(UCP_WORKER_STAT_TAG_RX_EAGER_CHUNK_EXP)); } } status = UCS_OK; } else { + /* check UCS_PTR_MAP_KEY_INVALID to pass CI */ + if (ucs_likely(eager_hdr->ep_id != UCS_PTR_MAP_KEY_INVALID)) { + UCP_WORKER_GET_EP_BY_ID(&ep, worker, eager_hdr->ep_id, return UCS_OK, + "eager"); + } + status = ucp_recv_desc_init(worker, data, length, 0, am_flags, hdr_len, flags, priv_length, 1, name, &rdesc); if (!UCS_STATUS_IS_ERR(status)) { - ucp_tag_unexp_recv(&worker->tm, rdesc, 
eager_hdr->super.tag); + ucp_tag_unexp_recv(&worker->tm, rdesc, recv_tag, eager_hdr->ep_id); } } @@ -163,6 +173,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_eager_middle_handler, ucp_worker_h worker = arg; ucp_eager_middle_hdr_t *hdr = data; ucp_recv_desc_t *rdesc = NULL; + ucp_ep_h ep UCS_V_UNUSED; ucp_tag_frag_match_t *matchq; ucp_request_t *req; ucs_status_t status; @@ -170,6 +181,12 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_eager_middle_handler, khiter_t iter; int ret; + /* check UCS_PTR_MAP_KEY_INVALID to pass CI */ + if (ucs_likely(hdr->ep_id != UCS_PTR_MAP_KEY_INVALID)) { + UCP_WORKER_GET_VALID_EP_BY_ID(&ep, worker, hdr->ep_id, return UCS_OK, + "eager_middle"); + } + iter = kh_put(ucp_tag_frag_hash, &worker->tm.frag_hash, hdr->msg_id, &ret); ucs_assert(ret >= 0); matchq = &kh_value(&worker->tm.frag_hash, iter); @@ -347,7 +364,7 @@ ucp_tag_offload_eager_first_handler(ucp_worker_h worker, void *data, ucp_request_recv_offload_data(req, data, length, flags); } else { ucp_tag_frag_match_init_unexp(matchq); - ucp_tag_unexp_recv(&worker->tm, rdesc, stag); + ucp_tag_unexp_recv(&worker->tm, rdesc, stag, UCS_PTR_MAP_KEY_INVALID); } return status; @@ -476,6 +493,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_tag_offload_unexp_eager, priv->req.req_id = UCS_PTR_MAP_KEY_INVALID; priv->req.ep_id = imm; priv->super.super.tag = stag; + priv->super.ep_id = UCS_PTR_MAP_KEY_INVALID; return ucp_eager_tagged_handler(worker, priv, length + priv_len, tl_flags, flags, priv_len, priv_len, "tag_offload_unexp_eager_sync"); diff --git a/src/ucp/tag/eager_single.c b/src/ucp/tag/eager_single.c index a64dacf9117..569be8a0aea 100644 --- a/src/ucp/tag/eager_single.c +++ b/src/ucp/tag/eager_single.c @@ -26,10 +26,11 @@ static ucs_status_t ucp_eager_short_progress(uct_pending_req_t *self) const ucp_proto_single_priv_t *spriv = req->send.proto_config->priv; ucs_status_t status; - status = uct_ep_am_short(req->send.ep->uct_eps[spriv->super.lane], - UCP_AM_ID_EAGER_ONLY, req->send.msg_proto.tag, - req->send.state.dt_iter.type.contig.buffer, - req->send.state.dt_iter.length); + status = ucp_tag_send_am_short_iov(req->send.ep->uct_eps[spriv->super.lane], + ucp_send_request_get_ep_remote_id(req), + req->send.state.dt_iter.type.contig.buffer, + req->send.state.dt_iter.length, + req->send.msg_proto.tag); if (ucs_unlikely(status == UCS_ERR_NO_RESOURCE)) { req->send.lane = spriv->super.lane; /* for pending add */ return status; @@ -59,7 +60,7 @@ ucp_proto_eager_short_init(const ucp_proto_init_params_t *init_params) .super.min_frag_offs = UCP_PROTO_COMMON_OFFSET_INVALID, .super.max_frag_offs = ucs_offsetof(uct_iface_attr_t, cap.am.max_short), .super.max_iov_offs = UCP_PROTO_COMMON_OFFSET_INVALID, - .super.hdr_size = sizeof(ucp_tag_hdr_t), + .super.hdr_size = sizeof(ucp_eager_hdr_t), .super.send_op = UCT_EP_OP_AM_SHORT, .super.memtype_op = UCT_EP_OP_LAST, .super.flags = UCP_PROTO_COMMON_INIT_FLAG_SINGLE_FRAG, @@ -95,6 +96,7 @@ static size_t ucp_eager_single_pack(void *dest, void *arg) ucs_assert(req->send.state.dt_iter.offset == 0); hdr->super.tag = req->send.msg_proto.tag; + hdr->ep_id = ucp_send_request_get_ep_remote_id(req); packed_size = ucp_datatype_iter_next_pack(&req->send.state.dt_iter, req->send.ep->worker, SIZE_MAX, &next_iter, hdr + 1); @@ -128,7 +130,7 @@ ucp_proto_eager_bcopy_single_init(const ucp_proto_init_params_t *init_params) .super.min_frag_offs = UCP_PROTO_COMMON_OFFSET_INVALID, .super.max_frag_offs = ucs_offsetof(uct_iface_attr_t, cap.am.max_bcopy), .super.max_iov_offs = UCP_PROTO_COMMON_OFFSET_INVALID, - .super.hdr_size = 
sizeof(ucp_tag_hdr_t), + .super.hdr_size = sizeof(ucp_eager_hdr_t), .super.send_op = UCT_EP_OP_AM_BCOPY, .super.memtype_op = UCT_EP_OP_GET_SHORT, .super.flags = UCP_PROTO_COMMON_INIT_FLAG_SINGLE_FRAG, @@ -170,7 +172,7 @@ ucp_proto_eager_zcopy_single_init(const ucp_proto_init_params_t *init_params) .super.min_frag_offs = ucs_offsetof(uct_iface_attr_t, cap.am.min_zcopy), .super.max_frag_offs = ucs_offsetof(uct_iface_attr_t, cap.am.max_zcopy), .super.max_iov_offs = ucs_offsetof(uct_iface_attr_t, cap.am.max_iov), - .super.hdr_size = sizeof(ucp_tag_hdr_t), + .super.hdr_size = sizeof(ucp_eager_hdr_t), .super.send_op = UCT_EP_OP_AM_ZCOPY, .super.memtype_op = UCT_EP_OP_LAST, .super.flags = UCP_PROTO_COMMON_INIT_FLAG_SEND_ZCOPY | @@ -194,7 +196,8 @@ ucp_proto_eager_zcopy_send_func(ucp_request_t *req, uct_iov_t *iov) { ucp_eager_hdr_t hdr = { - .super.tag = req->send.msg_proto.tag + .super.tag = req->send.msg_proto.tag, + .ep_id = ucp_send_request_get_ep_remote_id(req) }; return uct_ep_am_zcopy(req->send.ep->uct_eps[spriv->super.lane], diff --git a/src/ucp/tag/eager_snd.c b/src/ucp/tag/eager_snd.c index 2f0c26c90d5..28ed5ca57a6 100644 --- a/src/ucp/tag/eager_snd.c +++ b/src/ucp/tag/eager_snd.c @@ -40,6 +40,7 @@ static size_t ucp_tag_pack_eager_only_dt(void *dest, void *arg) ucp_request_t *req = arg; hdr->super.tag = req->send.msg_proto.tag; + hdr->ep_id = ucp_send_request_get_ep_remote_id(req); return ucp_tag_pack_eager_common(req, hdr + 1, req->send.length, sizeof(*hdr), 1); @@ -51,6 +52,7 @@ static size_t ucp_tag_pack_eager_sync_only_dt(void *dest, void *arg) ucp_request_t *req = arg; hdr->super.super.tag = req->send.msg_proto.tag; + hdr->super.ep_id = hdr->req.ep_id = ucp_send_request_get_ep_remote_id(req); hdr->req.req_id = ucp_send_request_get_id(req); @@ -70,6 +72,7 @@ static size_t ucp_tag_pack_eager_first_dt(void *dest, void *arg) sizeof(*hdr); length = ucs_min(length, req->send.length); hdr->super.super.tag = req->send.msg_proto.tag; + hdr->super.ep_id = ucp_send_request_get_ep_remote_id(req); hdr->total_len = req->send.length; hdr->msg_id = req->send.msg_proto.message_id; @@ -80,6 +83,7 @@ static size_t ucp_tag_pack_eager_sync_first_dt(void *dest, void *arg) { ucp_eager_sync_first_hdr_t *hdr = dest; ucp_request_t *req = arg; + ucs_ptr_map_key_t ep_id = ucp_send_request_get_ep_remote_id(req); size_t length; ucs_assert(req->send.lane == ucp_ep_get_am_lane(req->send.ep)); @@ -89,8 +93,9 @@ static size_t ucp_tag_pack_eager_sync_first_dt(void *dest, void *arg) sizeof(*hdr); length = ucs_min(length, req->send.length); hdr->super.super.super.tag = req->send.msg_proto.tag; + hdr->super.super.ep_id = ep_id; hdr->super.total_len = req->send.length; - hdr->req.ep_id = ucp_send_request_get_ep_remote_id(req); + hdr->req.ep_id = ep_id; hdr->super.msg_id = req->send.msg_proto.message_id; hdr->req.req_id = ucp_send_request_get_id(req); @@ -107,6 +112,7 @@ static size_t ucp_tag_pack_eager_middle_dt(void *dest, void *arg) sizeof(*hdr), req->send.length - req->send.state.dt.offset); hdr->msg_id = req->send.msg_proto.message_id; + hdr->ep_id = ucp_send_request_get_ep_remote_id(req); hdr->offset = req->send.state.dt.offset; return ucp_tag_pack_eager_common(req, hdr + 1, length, sizeof(*hdr), 0); @@ -121,10 +127,9 @@ static ucs_status_t ucp_tag_eager_contig_short(uct_pending_req_t *self) ucs_status_t status; req->send.lane = ucp_ep_get_am_lane(ep); - status = uct_ep_am_short(ep->uct_eps[req->send.lane], - UCP_AM_ID_EAGER_ONLY, - req->send.msg_proto.tag, req->send.buffer, - req->send.length); + status = 
ucp_ep_tag_send_am_short_iov(ep, req->send.buffer, + req->send.length, + req->send.msg_proto.tag); return ucp_am_short_handle_status_from_pending(req, status); } @@ -153,6 +158,7 @@ static ucs_status_t ucp_tag_eager_zcopy_single(uct_pending_req_t *self) ucp_eager_hdr_t hdr; hdr.super.tag = req->send.msg_proto.tag; + hdr.ep_id = ucp_send_request_get_ep_remote_id(req); return ucp_do_am_zcopy_single(self, UCP_AM_ID_EAGER_ONLY, &hdr, sizeof(hdr), NULL, 0ul, ucp_proto_am_zcopy_req_complete); } @@ -160,13 +166,16 @@ static ucs_status_t ucp_tag_eager_zcopy_single(uct_pending_req_t *self) static ucs_status_t ucp_tag_eager_zcopy_multi(uct_pending_req_t *self) { ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + uint64_t ep_id = ucp_send_request_get_ep_remote_id(req); ucp_eager_first_hdr_t first_hdr; ucp_eager_middle_hdr_t middle_hdr; first_hdr.super.super.tag = req->send.msg_proto.tag; + first_hdr.super.ep_id = ep_id; first_hdr.total_len = req->send.length; first_hdr.msg_id = req->send.msg_proto.message_id; middle_hdr.msg_id = req->send.msg_proto.message_id; + middle_hdr.ep_id = ep_id; middle_hdr.offset = req->send.state.dt.offset; return ucp_do_am_zcopy_multi(self, UCP_AM_ID_EAGER_FIRST, @@ -253,6 +262,7 @@ static ucs_status_t ucp_tag_eager_sync_zcopy_single(uct_pending_req_t *self) ucp_eager_sync_hdr_t hdr; hdr.super.super.tag = req->send.msg_proto.tag; + hdr.super.ep_id = hdr.req.ep_id = ucp_send_request_get_ep_remote_id(req); hdr.req.req_id = ucp_send_request_get_id(req); @@ -269,6 +279,7 @@ static ucs_status_t ucp_tag_eager_sync_zcopy_multi(uct_pending_req_t *self) if (req->send.state.dt.offset != 0) { middle_hdr.msg_id = req->send.msg_proto.message_id; + middle_hdr.ep_id = ucp_send_request_get_ep_remote_id(req); middle_hdr.offset = req->send.state.dt.offset; return ucp_do_am_zcopy_multi(self, UCP_AM_ID_LAST, @@ -279,6 +290,7 @@ static ucs_status_t ucp_tag_eager_sync_zcopy_multi(uct_pending_req_t *self) } first_hdr.super.super.super.tag = req->send.msg_proto.tag; + first_hdr.super.super.ep_id = ucp_send_request_get_ep_remote_id(req); first_hdr.super.total_len = req->send.length; first_hdr.req.ep_id = ucp_send_request_get_ep_remote_id(req); first_hdr.req.req_id = ucp_send_request_get_id(req); diff --git a/src/ucp/tag/tag_match.c b/src/ucp/tag/tag_match.c index d7009203453..0ee05ef9ba8 100644 --- a/src/ucp/tag/tag_match.c +++ b/src/ucp/tag/tag_match.c @@ -153,7 +153,7 @@ ucp_tag_exp_search_all(ucp_tag_match_t *tm, ucp_request_queue_t *req_queue, * offload flow. 
*/ void ucp_tag_frag_list_process_queue(ucp_tag_match_t *tm, ucp_request_t *req, - uint64_t msg_id + uint64_t msg_id, uint64_t ep_id UCS_STATS_ARG(int counter_idx)) { ucp_tag_frag_match_t *matchq; @@ -174,5 +174,6 @@ void ucp_tag_frag_list_process_queue(ucp_tag_match_t *tm, ucp_request_t *req, } /* request not completed, put it on the hash */ + req->recv.tag.ep_id = ep_id; ucp_tag_frag_hash_init_exp(matchq, req); } diff --git a/src/ucp/tag/tag_match.h b/src/ucp/tag/tag_match.h index b653658cbf8..00f767d4c8f 100644 --- a/src/ucp/tag/tag_match.h +++ b/src/ucp/tag/tag_match.h @@ -114,7 +114,7 @@ ucp_tag_exp_search_all(ucp_tag_match_t *tm, ucp_request_queue_t *req_queue, ucp_tag_t tag); void ucp_tag_frag_list_process_queue(ucp_tag_match_t *tm, ucp_request_t *req, - uint64_t msg_id + uint64_t msg_id, uint64_t ep_id UCS_STATS_ARG(int counter_idx)); #endif diff --git a/src/ucp/tag/tag_match.inl b/src/ucp/tag/tag_match.inl index 8cc0e3f6f70..0ba705635e7 100644 --- a/src/ucp/tag/tag_match.inl +++ b/src/ucp/tag/tag_match.inl @@ -139,7 +139,8 @@ ucp_tag_unexp_remove(ucp_recv_desc_t *rdesc) } static UCS_F_ALWAYS_INLINE void -ucp_tag_unexp_recv(ucp_tag_match_t *tm, ucp_recv_desc_t *rdesc, ucp_tag_t tag) +ucp_tag_unexp_recv(ucp_tag_match_t *tm, ucp_recv_desc_t *rdesc, ucp_tag_t tag, + uint64_t ep_id) { ucs_list_link_t *hash_list; @@ -147,8 +148,8 @@ ucp_tag_unexp_recv(ucp_tag_match_t *tm, ucp_recv_desc_t *rdesc, ucp_tag_t tag) ucs_list_add_tail(hash_list, &rdesc->tag_list[UCP_RDESC_HASH_LIST]); ucs_list_add_tail(&tm->unexpected.all, &rdesc->tag_list[UCP_RDESC_ALL_LIST]); - ucs_trace_req("unexp "UCP_RECV_DESC_FMT" tag %"PRIx64, - UCP_RECV_DESC_ARG(rdesc), tag); + ucs_trace_req("unexp "UCP_RECV_DESC_FMT" tag %"PRIx64" ep_id %"PRIx64, + UCP_RECV_DESC_ARG(rdesc), tag, ep_id); } static UCS_F_ALWAYS_INLINE ucp_recv_desc_t* diff --git a/src/ucp/tag/tag_recv.c b/src/ucp/tag/tag_recv.c index fd933054d92..bc1c18dbfaa 100644 --- a/src/ucp/tag/tag_recv.c +++ b/src/ucp/tag/tag_recv.c @@ -24,6 +24,7 @@ static void ucp_tag_recv_eager_multi(ucp_worker_h worker, ucp_request_t *req, { ucp_eager_first_hdr_t *first_hdr; uint64_t msg_id; + uint64_t ep_id; ucs_status_t status; UCP_WORKER_STAT_EAGER_MSG(worker, rdesc->flags); @@ -41,6 +42,7 @@ static void ucp_tag_recv_eager_multi(ucp_worker_h worker, ucp_request_t *req, first_hdr = (ucp_eager_first_hdr_t*)(rdesc + 1); req->recv.remaining = req->recv.tag.info.length = first_hdr->total_len; msg_id = first_hdr->msg_id; + ep_id = first_hdr->super.ep_id; if (ucs_unlikely(rdesc->flags & UCP_RECV_DESC_FLAG_EAGER_SYNC)) { ucp_tag_eager_sync_send_ack(worker, rdesc + 1, rdesc->flags); @@ -48,9 +50,8 @@ static void ucp_tag_recv_eager_multi(ucp_worker_h worker, ucp_request_t *req, status = ucp_tag_recv_request_process_rdesc(req, rdesc, 0, 0); if (status == UCS_INPROGRESS) { - ucp_tag_frag_list_process_queue( - &worker->tm, req, msg_id - UCS_STATS_ARG(UCP_WORKER_STAT_TAG_RX_EAGER_CHUNK_UNEXP)); + ucp_tag_frag_list_process_queue(&worker->tm, req, msg_id, ep_id + UCS_STATS_ARG(UCP_WORKER_STAT_TAG_RX_EAGER_CHUNK_UNEXP)); } } } @@ -135,6 +136,7 @@ ucp_tag_recv_common(ucp_worker_h worker, void *buffer, size_t count, req->recv.tag.tag = tag; req->recv.tag.tag_mask = tag_mask; + req->recv.tag.ep_id = UCS_PTR_MAP_KEY_INVALID; if (param->op_attr_mask & UCP_OP_ATTR_FIELD_CALLBACK) { req->recv.tag.cb = param->cb.recv; diff --git a/src/ucp/tag/tag_rndv.c b/src/ucp/tag/tag_rndv.c index 094db7dec93..1e87bc6de06 100644 --- a/src/ucp/tag/tag_rndv.c +++ b/src/ucp/tag/tag_rndv.c @@ -28,6 +28,66 @@ void 
ucp_tag_rndv_matched(ucp_worker_h worker, ucp_request_t *rreq, hdr_length - sizeof(*rts_hdr)); } +static void ucp_rndv_send_cancel_ack(ucp_worker_h worker, + ucp_rndv_rts_hdr_t *rndv_rts_hdr) +{ + ucp_ep_h ep = NULL; + ucp_request_t *req; + + UCP_WORKER_GET_EP_BY_ID(&ep, worker, rndv_rts_hdr->sreq.ep_id, return, + "ats_cancel"); + + req = ucp_request_get(worker); + if (req == NULL) { + return; + } + + req->send.ep = ep; + req->flags = 0; + req->send.rndv.mdesc = NULL; + req->send.pending_lane = UCP_NULL_LANE; + + ucp_rndv_req_send_ack(req, sizeof(*rndv_rts_hdr), rndv_rts_hdr->sreq.req_id, + UCS_ERR_CANCELED, UCP_AM_ID_RNDV_ATS, + "send_ats_cancel"); +} + +static void ucp_rndv_unexp_cancel(ucp_worker_h worker, + ucp_rndv_rts_hdr_t *rndv_rts_hdr) +{ + ucp_tag_hdr_t* tag_hdr = ucp_tag_hdr_from_rts(rndv_rts_hdr); + ucp_ep_h UCS_V_UNUSED ep = NULL; + const ucp_rndv_rts_hdr_t *rdesc_rts_hdr; + ucp_recv_desc_t *rdesc; + ucs_list_link_t *list; + + UCP_WORKER_GET_EP_BY_ID(&ep, worker, rndv_rts_hdr->sreq.ep_id, + ep = NULL, "unexp_cancel"); + + list = ucp_tag_unexp_get_list_for_tag(&worker->tm, tag_hdr->tag); + ucs_list_for_each(rdesc, list, tag_list[UCP_RDESC_HASH_LIST]) { + rdesc_rts_hdr = (const void*)(rdesc + 1); + if ((rdesc->flags & UCP_RECV_DESC_FLAG_RNDV) && + (ucp_rdesc_get_tag(rdesc) == tag_hdr->tag) && + (rdesc_rts_hdr->sreq.ep_id == rndv_rts_hdr->sreq.ep_id) && + (rdesc_rts_hdr->sreq.req_id == rndv_rts_hdr->sreq.req_id)) + { + ucs_debug("ep %p, canceling unexp rdesc " UCP_RECV_DESC_FMT " with " + "tag %"PRIx64" ep_id %"PRIx64, ep, + UCP_RECV_DESC_ARG(rdesc), ucp_rdesc_get_tag(rdesc), + rdesc_rts_hdr->sreq.ep_id); + ucp_tag_unexp_remove(rdesc); + ucp_rndv_send_cancel_ack(worker, rndv_rts_hdr); + ucp_recv_desc_release(rdesc); + return; + } + } + + ucs_debug("ep %p, unexp rdesc for RTS tag %"PRIx64" ep_id %"PRIx64 + " req_id %"PRIx64" is not found", ep, tag_hdr->tag, + rndv_rts_hdr->sreq.ep_id, rndv_rts_hdr->sreq.req_id); +} + ucs_status_t ucp_tag_rndv_process_rts(ucp_worker_h worker, ucp_rndv_rts_hdr_t *rts_hdr, size_t length, unsigned tl_flags) @@ -35,9 +95,15 @@ ucs_status_t ucp_tag_rndv_process_rts(ucp_worker_h worker, ucp_recv_desc_t *rdesc; ucp_request_t *rreq; ucs_status_t status; + ucp_ep_h ep UCS_V_UNUSED; ucs_assert(ucp_rndv_rts_is_tag(rts_hdr)); + if (ucs_unlikely(rts_hdr->opcode == UCP_RNDV_RTS_TAG_CANCELED)) { + ucp_rndv_unexp_cancel(worker, rts_hdr); + return UCS_OK; + } + rreq = ucp_tag_exp_search(&worker->tm, ucp_tag_hdr_from_rts(rts_hdr)->tag); if (rreq != NULL) { /* Cancel req in transport if it was offloaded, because it arrived @@ -49,6 +115,9 @@ ucs_status_t ucp_tag_rndv_process_rts(ucp_worker_h worker, return UCS_OK; } + UCP_WORKER_GET_EP_BY_ID(&ep, worker, rts_hdr->sreq.ep_id, return UCS_OK, + "rts"); + ucs_assert(length >= sizeof(*rts_hdr)); status = ucp_recv_desc_init(worker, rts_hdr, length, 0, tl_flags, @@ -58,7 +127,8 @@ ucs_status_t ucp_tag_rndv_process_rts(ucp_worker_h worker, ucs_assert(ucp_rdesc_get_tag(rdesc) == ucp_tag_hdr_from_rts(rts_hdr)->tag); ucp_tag_unexp_recv(&worker->tm, rdesc, - ucp_tag_hdr_from_rts(rts_hdr)->tag); + ucp_tag_hdr_from_rts(rts_hdr)->tag, + rts_hdr->sreq.ep_id); } return status; diff --git a/src/ucp/tag/tag_send.c b/src/ucp/tag/tag_send.c index d60c96b3063..349dae234d4 100644 --- a/src/ucp/tag/tag_send.c +++ b/src/ucp/tag/tag_send.c @@ -153,10 +153,7 @@ ucp_tag_send_inline(ucp_ep_h ep, const void *buffer, size_t length, ucp_tag_t ta ucs_status_t status; if (ucp_proto_is_inline(ep, &ucp_ep_config(ep)->tag.max_eager_short, length)) { - 
UCS_STATIC_ASSERT(sizeof(ucp_tag_t) == sizeof(ucp_eager_hdr_t)); - UCS_STATIC_ASSERT(sizeof(ucp_tag_t) == sizeof(uint64_t)); - status = uct_ep_am_short(ucp_ep_get_am_uct_ep(ep), UCP_AM_ID_EAGER_ONLY, - tag, buffer, length); + status = ucp_ep_tag_send_am_short_iov(ep, buffer, length, tag); } else if (ucp_proto_is_inline(ep, &ucp_ep_config(ep)->tag.offload.max_eager_short, length)) { From eb1a20f64a5e551d51681f8f80eb22f7ee8c6366 Mon Sep 17 00:00:00 2001 From: Sergey Oblomov Date: Wed, 22 Sep 2021 11:10:30 +0300 Subject: [PATCH 14/19] UCP/TAG/SEND: added eager/rndv flags to tag_send op - added op-flags support to tag_send_[sync]_nbx functions - added flags eager/rndv flags to force used protocol --- src/ucp/api/ucp.h | 21 +++++++++++++++++- src/ucp/tag/tag_send.c | 38 +++++++++++++++++++++++++++------ test/apps/iodemo/io_demo.cc | 14 ++++++++++-- test/apps/iodemo/ucx_wrapper.cc | 11 ++++++++-- test/apps/iodemo/ucx_wrapper.h | 10 ++++++++- 5 files changed, 82 insertions(+), 12 deletions(-) diff --git a/src/ucp/api/ucp.h b/src/ucp/api/ucp.h index 801bd6052e1..9bfb965aa03 100644 --- a/src/ucp/api/ucp.h +++ b/src/ucp/api/ucp.h @@ -721,6 +721,21 @@ typedef enum { } ucp_op_attr_t; +/** + * @ingroup UCP_COMM + * @brief UCP tag send operation flags + * + * Flags dictate the behavior of @ref ucp_tag_send_nbx and + * @ref ucp_tag_send_sync_nbx routines. + */ +typedef enum { + UCP_EP_TAG_SEND_FLAG_EAGER = UCS_BIT(0), /**< force use eager protocol + to transfer data */ + UCP_EP_TAG_SEND_FLAG_RNDV = UCS_BIT(1) /**< force use rndv protocol + to transfer data */ +} ucp_ep_tag_send_flags_t; + + /** * @ingroup UCP_COMM * @brief UCP request query attributes @@ -3521,7 +3536,11 @@ ucs_status_ptr_t ucp_tag_send_sync_nb(ucp_ep_h ep, const void *buffer, size_t co * @param [in] buffer Pointer to the message buffer (payload). * @param [in] count Number of elements to send * @param [in] tag Message tag. - * @param [in] param Operation parameters, see @ref ucp_request_param_t + * @param [in] param Operation parameters, see @ref ucp_request_param_t. + * This operation supports specific flags, which can be + * passed in @a param by @ref ucp_request_param_t.flags. + * The exact set of flags is defined + * by @ref ucp_ep_tag_send_flags_t. * * @return UCS_OK - The send operation was completed immediately. * @return UCS_PTR_IS_ERR(_ptr) - The send operation failed. 
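A minimal usage sketch of the new flags, for illustration only: it assumes an already-connected ucp_ep_h ("ep"), the default contiguous byte datatype, and a caller-chosen "rndv_thresh"; the helper names send_with_forced_proto and send_cb are hypothetical and mirror what the io_demo wrapper change further below does with its "-R" option.

    #include <ucp/api/ucp.h>

    /* Hypothetical completion callback: just release the request */
    static void send_cb(void *request, ucs_status_t status, void *user_data)
    {
        ucp_request_free(request);
    }

    /* Hypothetical helper: force RNDV for messages >= rndv_thresh,
     * eager otherwise, using the flags introduced by this patch */
    static ucs_status_t send_with_forced_proto(ucp_ep_h ep, const void *buf,
                                               size_t length, ucp_tag_t tag,
                                               size_t rndv_thresh)
    {
        ucp_request_param_t param = {
            .op_attr_mask = UCP_OP_ATTR_FIELD_FLAGS |
                            UCP_OP_ATTR_FIELD_CALLBACK,
            .flags        = (length >= rndv_thresh) ?
                            UCP_EP_TAG_SEND_FLAG_RNDV :
                            UCP_EP_TAG_SEND_FLAG_EAGER,
            .cb.send      = send_cb
        };
        ucs_status_ptr_t sreq = ucp_tag_send_nbx(ep, buf, length, tag, &param);

        if (UCS_PTR_IS_ERR(sreq)) {
            return UCS_PTR_STATUS(sreq);
        }
        /* NULL means the send completed immediately (no callback);
         * otherwise send_cb fires once the worker is progressed */
        return UCS_OK;
    }
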
diff --git a/src/ucp/tag/tag_send.c b/src/ucp/tag/tag_send.c index 349dae234d4..59fdfdad3a0 100644 --- a/src/ucp/tag/tag_send.c +++ b/src/ucp/tag/tag_send.c @@ -24,8 +24,18 @@ static UCS_F_ALWAYS_INLINE size_t ucp_tag_get_rndv_threshold(const ucp_request_t *req, size_t count, size_t max_iov, size_t rndv_rma_thresh, - size_t rndv_am_thresh) + size_t rndv_am_thresh, uint32_t flags) { + /* Eager protocol requested - set rndv threshold to max */ + if (flags & UCP_EP_TAG_SEND_FLAG_EAGER) { + return SIZE_MAX; + } + + /* RNDV protocol requested - set rndv threshold to 0 */ + if (flags & UCP_EP_TAG_SEND_FLAG_RNDV) { + return 0; + } + switch (req->send.datatype & UCP_DATATYPE_CLASS_MASK) { case UCP_DATATYPE_IOV: if ((count > max_iov) && @@ -54,6 +64,7 @@ ucp_tag_send_req(ucp_request_t *req, size_t dt_count, { ssize_t max_short = ucp_proto_get_short_max(req, msg_config); ucp_ep_config_t *ep_config = ucp_ep_config(req->send.ep); + uint32_t flags = ucp_request_param_flags(param); ucs_status_t status; size_t zcopy_thresh; size_t rndv_thresh; @@ -65,7 +76,8 @@ ucp_tag_send_req(ucp_request_t *req, size_t dt_count, &rndv_rma_thresh, &rndv_am_thresh); rndv_thresh = ucp_tag_get_rndv_threshold(req, dt_count, msg_config->max_iov, - rndv_rma_thresh, rndv_am_thresh); + rndv_rma_thresh, rndv_am_thresh, + flags); if (!(param->op_attr_mask & UCP_OP_ATTR_FLAG_FAST_CMPL) || ucs_unlikely(!UCP_MEM_IS_HOST(req->send.mem_type))) { @@ -228,7 +240,8 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_nbx, ucp_ep_h ep, const void *buffer, size_t count, ucp_tag_t tag, const ucp_request_param_t *param) { - size_t contig_length = 0; + size_t contig_length = 0; + uint32_t UCS_V_UNUSED flags = ucp_request_param_flags(param); ucs_status_t status; ucp_request_t *req; ucs_status_ptr_t ret; @@ -240,6 +253,12 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_nbx, return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); UCP_REQUEST_CHECK_PARAM(param); + if (ENABLE_PARAMS_CHECK && + ucs_test_all_flags(flags, UCP_EP_TAG_SEND_FLAG_EAGER | + UCP_EP_TAG_SEND_FLAG_RNDV)) { + return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); + } + UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); ucs_trace_req("send_nbx buffer %p count %zu tag %"PRIx64" to %s", @@ -305,9 +324,10 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_sync_nbx, ucp_ep_h ep, const void *buffer, size_t count, ucp_tag_t tag, const ucp_request_param_t *param) { - ucp_worker_h worker = ep->worker; - size_t contig_length = 0; - uintptr_t datatype = ucp_request_param_datatype(param); + ucp_worker_h worker = ep->worker; + size_t contig_length = 0; + uintptr_t datatype = ucp_request_param_datatype(param); + uint32_t UCS_V_UNUSED flags = ucp_request_param_flags(param); ucs_status_t status; ucp_request_t *req; ucs_status_ptr_t ret; @@ -317,6 +337,12 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_sync_nbx, UCS_ERR_INVALID_PARAM)); UCP_REQUEST_CHECK_PARAM(param); + if (ENABLE_PARAMS_CHECK && + ucs_test_all_flags(flags, UCP_EP_TAG_SEND_FLAG_EAGER | + UCP_EP_TAG_SEND_FLAG_RNDV)) { + return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); + } + UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); ucs_trace_req("send_sync_nbx buffer %p count %zu tag %"PRIx64" to %s", diff --git a/test/apps/iodemo/io_demo.cc b/test/apps/iodemo/io_demo.cc index a4cb9a679ea..3631e45f3f2 100644 --- a/test/apps/iodemo/io_demo.cc +++ b/test/apps/iodemo/io_demo.cc @@ -70,6 +70,7 @@ typedef struct { double retry_interval; double client_runtime_limit; double print_interval; + size_t rndv_thresh; size_t iomsg_size; size_t min_data_size; size_t 
max_data_size; @@ -875,7 +876,8 @@ class P2pDemoCommon : public UcxContext { P2pDemoCommon(const options_t &test_opts, uint32_t iov_buf_filler) : UcxContext(test_opts.iomsg_size, test_opts.connect_timeout, - test_opts.use_am, test_opts.use_epoll), + test_opts.use_am, test_opts.rndv_thresh, + test_opts.use_epoll), _test_opts(test_opts), _io_msg_pool(test_opts.iomsg_size, "io messages"), _send_callback_pool(0, "send callbacks"), @@ -2659,6 +2661,7 @@ static int parse_args(int argc, char **argv, options_t *test_opts) test_opts->retry_interval = 5.0; test_opts->client_runtime_limit = std::numeric_limits::max(); test_opts->print_interval = 1.0; + test_opts->rndv_thresh = UcxContext::rndv_thresh_auto; test_opts->min_data_size = 4096; test_opts->max_data_size = 4096; test_opts->chunk_size = std::numeric_limits::max(); @@ -2679,7 +2682,7 @@ static int parse_args(int argc, char **argv, options_t *test_opts) test_opts->per_conn_info = false; while ((c = getopt(argc, argv, - "p:c:r:d:b:i:w:a:k:o:t:n:l:s:y:vqeADHP:m:L:I:zV")) != -1) { + "p:c:r:d:b:i:w:a:k:o:t:n:l:s:y:vqeADHP:R:m:L:I:zV")) != -1) { switch (c) { case 'p': test_opts->port_num = atoi(optarg); @@ -2809,6 +2812,9 @@ static int parse_args(int argc, char **argv, options_t *test_opts) case 'P': test_opts->print_interval = atof(optarg); break; + case 'R': + test_opts->rndv_thresh = strtol(optarg, NULL, 0); + break; case 'm': if (!strcmp(optarg, "host")) { test_opts->memory_type = UCS_MEMORY_TYPE_HOST; @@ -2866,6 +2872,10 @@ static int parse_args(int argc, char **argv, options_t *test_opts) std::cout << " -D Enable debugging mode for IO operation timeouts" << std::endl; std::cout << " -H Use human-readable timestamps" << std::endl; std::cout << " -P Set report printing interval" << std::endl; + std::cout << " -R Always use rendezvous protocol for messages starting" << std::endl; + std::cout << " from this size, and eager protocol for" << std::endl; + std::cout << " messages lower than this size. If not set," << std::endl; + std::cout << " the threshold is selected automatically by UCX" << std::endl; std::cout << "" << std::endl; std::cout << " -m Memory type to use. Possible values: host" #ifdef HAVE_CUDA diff --git a/test/apps/iodemo/ucx_wrapper.cc b/test/apps/iodemo/ucx_wrapper.cc index 8304b2bedef..d0005d40cf7 100644 --- a/test/apps/iodemo/ucx_wrapper.cc +++ b/test/apps/iodemo/ucx_wrapper.cc @@ -111,10 +111,10 @@ void UcxContext::UcxDisconnectCallback::operator()(ucs_status_t status) } UcxContext::UcxContext(size_t iomsg_size, double connect_timeout, bool use_am, - bool use_epoll) : + size_t rndv_thresh, bool use_epoll) : _context(NULL), _worker(NULL), _listener(NULL), _iomsg_recv_request(NULL), _iomsg_buffer(iomsg_size), _connect_timeout(connect_timeout), - _use_am(use_am), _worker_fd(-1), _epoll_fd(-1) + _use_am(use_am), _worker_fd(-1), _epoll_fd(-1), _rndv_thresh(rndv_thresh) { if (use_epoll) { _epoll_fd = epoll_create(1); @@ -1228,6 +1228,13 @@ bool UcxConnection::send_common(const void *buffer, size_t length, param.datatype = 0; // make coverity happy param.cb.send = (ucp_send_nbx_callback_t)common_request_callback; + if (_context.rndv_thresh() != UcxContext::rndv_thresh_auto) { + param.op_attr_mask |= UCP_OP_ATTR_FIELD_FLAGS; + param.flags = (length >= _context.rndv_thresh()) ? 
+ UCP_EP_TAG_SEND_FLAG_RNDV : + UCP_EP_TAG_SEND_FLAG_EAGER; + } + if (memh) { param.op_attr_mask |= UCP_OP_ATTR_FIELD_MEMH; param.memh = memh; diff --git a/test/apps/iodemo/ucx_wrapper.h b/test/apps/iodemo/ucx_wrapper.h index c906d0b1c72..155b02cfede 100644 --- a/test/apps/iodemo/ucx_wrapper.h +++ b/test/apps/iodemo/ucx_wrapper.h @@ -112,8 +112,10 @@ class UcxContext { public: typedef std::vector iomsg_buffer_t; + static const size_t rndv_thresh_auto = (size_t)-2; + UcxContext(size_t iomsg_size, double connect_timeout, bool use_am, - bool use_epoll = false); + size_t rndv_thresh, bool use_epoll = false); virtual ~UcxContext(); @@ -256,6 +258,11 @@ class UcxContext { void destroy_worker(); + size_t rndv_thresh() const + { + return _rndv_thresh; + } + void set_am_handler(ucp_am_recv_callback_t cb, void *arg); ucp_context_h _context; @@ -272,6 +279,7 @@ class UcxContext { bool _use_am; int _worker_fd; int _epoll_fd; + size_t _rndv_thresh; }; From be81bbd234280d38116311cb23ac0c3dd0170ff1 Mon Sep 17 00:00:00 2001 From: Evgeny Leksikov Date: Thu, 26 Aug 2021 18:24:44 +0300 Subject: [PATCH 15/19] GTEST/UCP/TAG: test unexp queue cleanup --- test/gtest/ucp/test_ucp_sockaddr.cc | 61 ++++++++++++++++++----------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/test/gtest/ucp/test_ucp_sockaddr.cc b/test/gtest/ucp/test_ucp_sockaddr.cc index 50649f2d218..2494f0c3e98 100644 --- a/test/gtest/ucp/test_ucp_sockaddr.cc +++ b/test/gtest/ucp/test_ucp_sockaddr.cc @@ -54,7 +54,8 @@ class test_ucp_sockaddr : public ucp_test { enum { SEND_DIRECTION_C2S = UCS_BIT(0), /* send data from client to server */ SEND_DIRECTION_S2C = UCS_BIT(1), /* send data from server to client */ - SEND_DIRECTION_BIDI = SEND_DIRECTION_C2S | SEND_DIRECTION_S2C /* bidirectional send */ + SEND_DIRECTION_BIDI = SEND_DIRECTION_C2S | SEND_DIRECTION_S2C, /* bidirectional send */ + SEND_NO_RECV = UCS_BIT(2) /* do not recv data to test unexp Q cleanup */ }; typedef enum { @@ -531,30 +532,34 @@ class test_ucp_sockaddr : public ucp_test { void send_recv(entity& from, entity& to, send_recv_type_t send_recv_type, bool wakeup, ucp_test_base::entity::listen_cb_type_t cb_type, - size_t ep_index = 0) + bool no_recv = false, size_t ep_index = 0) { const uint64_t send_data = ucs_generate_uuid(0); uint64_t recv_data = 0; rx_am_msg_arg am_rx_arg(to, NULL, &recv_data); ucs_status_t send_status; - if (send_recv_type == SEND_RECV_AM) { + if (!no_recv && (send_recv_type == SEND_RECV_AM)) { set_am_data_handler(to, 0, rx_am_msg_cb, &am_rx_arg); } void *send_req = send(from, &send_data, sizeof(send_data), send_recv_type, scomplete_cbx, NULL, ep_index); - void *recv_req = NULL; // to suppress compiler warning - if (send_recv_type == SEND_RECV_TAG) { - recv_req = recv(to, &recv_data, sizeof(recv_data), - rtag_complete_cbx, NULL); - } else if (send_recv_type == SEND_RECV_STREAM) { - recv_req = recv(to, &recv_data, sizeof(recv_data), - rstream_complete_cbx, NULL); - } else if (send_recv_type != SEND_RECV_AM) { - UCS_TEST_ABORT("unsupported communication type " + - std::to_string(send_recv_type)); + void *recv_req = NULL; + if (!no_recv) { + if (send_recv_type == SEND_RECV_TAG) { + recv_req = recv(to, &recv_data, sizeof(recv_data), + rtag_complete_cbx, NULL); + } else if (send_recv_type == SEND_RECV_STREAM) { + recv_req = recv(to, &recv_data, sizeof(recv_data), + rstream_complete_cbx, NULL); + } else if (send_recv_type != SEND_RECV_AM) { + UCS_TEST_ABORT("unsupported communication type " + + std::to_string(send_recv_type)); + } + } else { + 
+            UCS_TEST_MESSAGE << "Do not recv";
+        }
 
         {
@@ -566,14 +571,16 @@ class test_ucp_sockaddr : public ucp_test {
             }
         }
 
-        if (send_recv_type == SEND_RECV_AM) {
-            request_wait(am_rx_arg.rreq);
-            wait_for_flag(&am_rx_arg.received);
-            set_am_data_handler(to, 0, NULL, NULL);
-        } else {
-            request_wait(recv_req, 0, wakeup);
+        if (!no_recv) {
+            if (send_recv_type == SEND_RECV_AM) {
+                request_wait(am_rx_arg.rreq);
+                wait_for_flag(&am_rx_arg.received);
+                set_am_data_handler(to, 0, NULL, NULL);
+            } else {
+                request_wait(recv_req, 0, wakeup);
+            }
+            EXPECT_EQ(send_data, recv_data);
         }
-        EXPECT_EQ(send_data, recv_data);
     }
 
     bool wait_for_server_ep(bool wakeup)
@@ -661,12 +668,12 @@ class test_ucp_sockaddr : public ucp_test {
 
         if (flags & SEND_DIRECTION_C2S) {
             send_recv(sender(), receiver(), send_recv_type(), wakeup,
-                      cb_type());
+                      cb_type(), flags & SEND_NO_RECV);
         }
 
         if (flags & SEND_DIRECTION_S2C) {
             send_recv(receiver(), sender(), send_recv_type(), wakeup,
-                      cb_type());
+                      cb_type(), flags & SEND_NO_RECV);
         }
     }
 
@@ -1007,6 +1014,10 @@ UCS_TEST_P(test_ucp_sockaddr, listen_bidi) {
     listen_and_communicate(false, SEND_DIRECTION_BIDI);
 }
 
+UCS_TEST_P(test_ucp_sockaddr, listen_bidi_no_recv) {
+    listen_and_communicate(false, SEND_DIRECTION_BIDI | SEND_NO_RECV);
+}
+
 UCS_TEST_P(test_ucp_sockaddr, ep_query) {
     listen_and_communicate(false, 0);
     ep_query();
@@ -2774,6 +2785,10 @@ class test_ucp_sockaddr_protocols_err : public test_ucp_sockaddr_protocols {
 
     void test_tag_send_recv(size_t size, bool is_exp, bool is_sync = false)
     {
+        if (!is_exp) {
+            UCS_TEST_SKIP_R("ucp_tag_probe_nb + err handling is not supported");
+        }
+
         /* warmup */
         test_ucp_sockaddr_protocols::test_tag_send_recv(size, is_exp, is_sync);
 
@@ -2905,7 +2920,7 @@ class test_ucp_sockaddr_protocols_err_sender
 
         /* Warmup */
         send_recv(sender(), receiver(), send_recv_type(), false, cb_type(),
-                  sender_idx);
+                  false, sender_idx);
 
         for (size_t i = 0; i < num_sends; ++i) {
             void *sreq = send(sender(), send_buf.ptr(), size,

From a9defa1c2381424165c334ff53d3f637c15848b5 Mon Sep 17 00:00:00 2001
From: Artemy Kovalyov
Date: Wed, 9 Dec 2020 15:28:05 +0200
Subject: [PATCH 16/19] UCT/IB: Non-blocking IB device close

---
 src/ucs/arch/aarch64/cpu.h | 4 ++
 src/ucs/arch/ppc64/cpu.h | 4 ++
 src/ucs/arch/x86_64/cpu.c | 17 +++++
 src/ucs/arch/x86_64/cpu.h | 3 +
 src/uct/ib/base/ib_device.c | 119 ++++++++++++++++++++++++++++++++++-
 src/uct/ib/base/ib_device.h | 17 ++++-
 src/uct/ib/base/ib_md.c | 8 ++-
 src/uct/ib/base/ib_md.h | 1 +
 test/gtest/uct/ib/test_ib.cc | 10 +++
 9 files changed, 179 insertions(+), 4 deletions(-)

diff --git a/src/ucs/arch/aarch64/cpu.h b/src/ucs/arch/aarch64/cpu.h
index 06d43b644a9..eda59830a8f 100644
--- a/src/ucs/arch/aarch64/cpu.h
+++ b/src/ucs/arch/aarch64/cpu.h
@@ -70,6 +70,10 @@ BEGIN_C_DECLS
 
 #define ucs_memory_cpu_wc_fence() ucs_aarch64_dmb(oshst)
 
+/* revert to glibc syscall */
+#define ucs_syscall_raw syscall
+
+
 /*
  * ARM processor ID (ARM ISA - Main ID Register, EL1)
  */
diff --git a/src/ucs/arch/ppc64/cpu.h b/src/ucs/arch/ppc64/cpu.h
index c163c2968de..85f4f23c8aa 100644
--- a/src/ucs/arch/ppc64/cpu.h
+++ b/src/ucs/arch/ppc64/cpu.h
@@ -39,6 +39,10 @@ BEGIN_C_DECLS
 
 #define ucs_memory_cpu_wc_fence() ucs_memory_bus_fence()
 
+/* revert to glibc syscall */
+#define ucs_syscall_raw syscall
+
+
 static inline uint64_t ucs_arch_read_hres_clock()
 {
 #if HAVE_DECL___PPC_GET_TIMEBASE
diff --git a/src/ucs/arch/x86_64/cpu.c b/src/ucs/arch/x86_64/cpu.c
index 81a6e0406a6..90d64870791 100644
--- a/src/ucs/arch/x86_64/cpu.c
+++ b/src/ucs/arch/x86_64/cpu.c
@@ -755,4 +755,21 @@ void ucs_x86_memcpy_sse_movntdqa(void *dst, const void *src, size_t len)
 #endif
 }
 
+int ucs_syscall_raw(unsigned long num, unsigned long arg1, unsigned long arg2,
+                    unsigned long arg3)
+{
+    int ret;
+
+    asm volatile (
+        "movq %1, %%rax\n\t"
+        "movq %2, %%rdi\n\t"
+        "movq %3, %%rsi\n\t"
+        "movq %4, %%rdx\n\t"
+        "syscall\n\t"
+        :"=a"(ret)
+        :"r"(num), "r"(arg1), "r"(arg2), "r"(arg3));
+
+    return ret;
+}
+
 #endif
diff --git a/src/ucs/arch/x86_64/cpu.h b/src/ucs/arch/x86_64/cpu.h
index a7a6f56a661..bdab8c22853 100644
--- a/src/ucs/arch/x86_64/cpu.h
+++ b/src/ucs/arch/x86_64/cpu.h
@@ -52,6 +52,9 @@ ucs_cpu_vendor_t ucs_arch_get_cpu_vendor();
 void ucs_cpu_init();
 ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes);
 void ucs_x86_memcpy_sse_movntdqa(void *dst, const void *src, size_t len);
+int ucs_syscall_raw(unsigned long num, unsigned long arg1, unsigned long arg2,
+                    unsigned long arg3);
+
 
 static UCS_F_ALWAYS_INLINE int ucs_arch_x86_rdtsc_enabled()
 {
diff --git a/src/uct/ib/base/ib_device.c b/src/uct/ib/base/ib_device.c
index f97d33db2a1..3d8621c04b5 100644
--- a/src/uct/ib/base/ib_device.c
+++ b/src/uct/ib/base/ib_device.c
@@ -20,6 +20,9 @@
 #include
 #include
 #include
+#include
+#include
+#include
 #include
 #include
 #include
@@ -804,12 +807,124 @@ ucs_status_t uct_ib_device_query(uct_ib_device_t *dev,
     return UCS_OK;
 }
 
+static int uct_ib_device_cleanup_proc(void* arg)
+{
+    uct_ib_device_nb_close_ctx *ctx = arg;
+    static const char *process_name = "ucx_cleanup";
+    char dummy;
+    int fd;
+
+    /* Since TLS of this thread is uninitialized avoid using glibc */
+    ucs_syscall_raw(SYS_prctl, PR_SET_NAME, (long)process_name, 0);
+
+    for (fd = 0; fd < ctx->max_fds; fd++) {
+        if ((fd != ctx->cmd_fd) && (fd != ctx->pipefds[0])) {
+            ucs_syscall_raw(SYS_close, fd, 0, 0);
+        }
+    }
+
+    /* Wait until pipe closed - either parent terminated or closing device */
+    ucs_syscall_raw(SYS_read, ctx->pipefds[0], (long)&dummy, 1);
+
+    return 0;
+}
+
+static ucs_status_t
+uct_ib_device_init_nb_close_ctx(int fd, uct_ib_device_nb_close_ctx **ctx_p)
+{
+    uct_ib_device_nb_close_ctx *ctx;
+    struct rlimit nofile;
+    ucs_status_t status;
+    int ret;
+
+    ctx = ucs_calloc(1, sizeof(*ctx), "ibv cleanup ctx");
+    if (ctx == NULL) {
+        ucs_error("cleanup context allocation failure");
+        status = UCS_ERR_NO_MEMORY;
+        goto err;
+    }
+
+    ret = getrlimit(RLIMIT_NOFILE, &nofile);
+    if (ret == 0) {
+        ctx->max_fds = nofile.rlim_cur;
+    } else {
+        ucs_warn("getrlimit(NOFILE) failed: %m");
+        ctx->max_fds = 1024;
+    }
+
+    ctx->buff_size = ucs_get_page_size() * 2;
+    ctx->cmd_fd = fd;
+
+    status = ucs_mmap_alloc(&ctx->buff_size, &ctx->buff, 0, "ibv cleanup buff");
+    if (status != UCS_OK) {
+        ucs_error("cleanup buffer allocation failed");
+        goto err_alloc;
+    }
+
+    ret = pipe(ctx->pipefds);
+    if (ret) {
+        ucs_error("cleanup pipe allocation failed: %m");
+        status = UCS_ERR_IO_ERROR;
+        goto err_pipe;
+    }
+
+    /* CLONE_VM - to keep pinned memory shared
+     * CLONE_SETTLS - to avoid corruption of parent's TLS
+     * SIGCHLD - will be sent to parent when child quit, required by waitpid
+     * buffer layout: tls goes up, stack goes down */
+    ret = clone(uct_ib_device_cleanup_proc,
+                UCS_PTR_BYTE_OFFSET(ctx->buff, ctx->buff_size),
+                CLONE_VM|CLONE_SETTLS|SIGCHLD, ctx, NULL, ctx->buff, NULL);
+
+    if (ret == -1) {
+        ucs_error("cleanup clone failed: %m");
+        status = UCS_ERR_IO_ERROR;
+        goto err_clone;
+    }
+
+    ctx->pid = ret;
+    *ctx_p = ctx;
+    close(ctx->pipefds[0]);
+    return UCS_OK;
+
+err_clone:
+    close(ctx->pipefds[0]);
+    close(ctx->pipefds[1]);
+err_pipe:
+    ucs_mmap_free(ctx->buff, ctx->buff_size);
+err_alloc:
+    ucs_free(ctx);
+err:
+    return status;
+}
+
+void uct_ib_device_free_nb_close_ctx(uct_ib_device_nb_close_ctx *ctx)
+{
+    if (ctx == NULL) {
+        return;
+    }
+
+    close(ctx->pipefds[1]);
+    waitpid(ctx->pid, NULL, 0);
+    ucs_mmap_free(ctx->buff, ctx->buff_size);
+    ucs_free(ctx);
+}
+
 ucs_status_t uct_ib_device_init(uct_ib_device_t *dev,
-                                struct ibv_device *ibv_device, int async_events
+                                struct ibv_device *ibv_device,
+                                int async_events, int nb_close
                                 UCS_STATS_ARG(ucs_stats_node_t *stats_parent))
 {
     ucs_status_t status;
 
+    if (nb_close) {
+        status = uct_ib_device_init_nb_close_ctx(dev->ibv_context->cmd_fd,
+                                                 &dev->nb_close_ctx);
+        if (status != UCS_OK) {
+            return status;
+        }
+    }
+
     dev->async_events = async_events;
 
     uct_ib_device_get_locality(ibv_get_device_name(ibv_device),
@@ -865,6 +980,8 @@ void uct_ib_device_cleanup(uct_ib_device_t *dev)
 {
     ucs_debug("destroying ib device %s", uct_ib_device_name(dev));
 
+    uct_ib_device_free_nb_close_ctx(dev->nb_close_ctx);
+
     if (kh_size(&dev->async_events_hash) != 0) {
         ucs_warn("async_events_hash not empty");
     }
diff --git a/src/uct/ib/base/ib_device.h b/src/uct/ib/base/ib_device.h
index d1f4a454ee2..81a467322f3 100644
--- a/src/uct/ib/base/ib_device.h
+++ b/src/uct/ib/base/ib_device.h
@@ -200,6 +200,19 @@ typedef struct {
 KHASH_TYPE(uct_ib_async_event, uct_ib_async_event_t, uct_ib_async_event_val_t);
 
+/**
+ * Context for non-blocking device cleanup
+ */
+typedef struct {
+    int pipefds[2];
+    int cmd_fd;
+    void *buff;
+    size_t buff_size;
+    pid_t pid;
+    int max_fds;
+} uct_ib_device_nb_close_ctx;
+
+
 /**
  * IB device (corresponds to HCA)
 */
@@ -232,6 +245,7 @@ typedef struct uct_ib_device {
     /* Async event subscribers */
     ucs_spinlock_t async_event_lock;
     khash_t(uct_ib_async_event) async_events_hash;
+    uct_ib_device_nb_close_ctx *nb_close_ctx;
 } uct_ib_device_t;
 
 
@@ -280,7 +294,8 @@ ucs_status_t uct_ib_device_query(uct_ib_device_t *dev,
                                  struct ibv_device *ibv_device);
 
 ucs_status_t uct_ib_device_init(uct_ib_device_t *dev,
-                                struct ibv_device *ibv_device, int async_events
+                                struct ibv_device *ibv_device, int async_events,
+                                int nb_close
                                 UCS_STATS_ARG(ucs_stats_node_t *stats_parent));
 
 void uct_ib_device_cleanup(uct_ib_device_t *dev);
diff --git a/src/uct/ib/base/ib_md.c b/src/uct/ib/base/ib_md.c
index 2a07829cfba..3cf34a788e4 100644
--- a/src/uct/ib/base/ib_md.c
+++ b/src/uct/ib/base/ib_md.c
@@ -75,6 +75,10 @@ static ucs_config_field_t uct_ib_md_config_table[] = {
      "Initialize a fork-safe IB library with ibv_fork_init().",
      ucs_offsetof(uct_ib_md_config_t, fork_init), UCS_CONFIG_TYPE_TERNARY},
 
+    {"CLEANUP_THREAD", "n",
+     "Cleanup device resources by a background process.",
+     ucs_offsetof(uct_ib_md_config_t, nb_close), UCS_CONFIG_TYPE_BOOL},
+
     {"ASYNC_EVENTS", "y",
      "Enable listening for async events on the device",
      ucs_offsetof(uct_ib_md_config_t, async_events), UCS_CONFIG_TYPE_BOOL},
@@ -1624,8 +1628,8 @@ ucs_status_t uct_ib_md_open_common(uct_ib_md_t *md,
         goto err;
     }
 
-    status = uct_ib_device_init(&md->dev, ib_device, md_config->async_events
-                                UCS_STATS_ARG(md->stats));
+    status = uct_ib_device_init(&md->dev, ib_device, md_config->async_events,
+                                md_config->nb_close UCS_STATS_ARG(md->stats));
     if (status != UCS_OK) {
         goto err_release_stats;
     }
diff --git a/src/uct/ib/base/ib_md.h b/src/uct/ib/base/ib_md.h
index 51c1103d6d5..5bc45fca47a 100644
--- a/src/uct/ib/base/ib_md.h
+++ b/src/uct/ib/base/ib_md.h
@@ -162,6 +162,7 @@ typedef struct uct_ib_md_config {
                                           without using the cache */
     unsigned fork_init; /**< Use ibv_fork_init() */
     int async_events; /**< Whether async events should be delivered */
+    int nb_close; /**< Cleanup resources in background */
 
     uct_ib_md_ext_config_t ext; /**< External configuration */
diff --git a/test/gtest/uct/ib/test_ib.cc b/test/gtest/uct/ib/test_ib.cc
index 2928aa2acd0..813b350d23d 100644
--- a/test/gtest/uct/ib/test_ib.cc
+++ b/test/gtest/uct/ib/test_ib.cc
@@ -78,6 +78,16 @@ void test_uct_ib::send_recv_short() {
 
 size_t test_uct_ib::m_ib_am_handler_counter = 0;
 
+#if defined(__x86_64__)
+UCS_TEST_SKIP_COND_P(test_uct_ib, nb_close, RUNNING_ON_VALGRIND,
+                     "CLEANUP_THREAD=y")
+{
+}
+#endif
+
+UCT_INSTANTIATE_IB_TEST_CASE(test_uct_ib);
+
+
 class test_uct_ib_addr : public test_uct_ib {
 public:
     uct_ib_iface_config_t *ib_config() {

From 56a35bda9bfd6e92beb4d41adc04386a50f16ebf Mon Sep 17 00:00:00 2001
From: Artemy Kovalyov
Date: Fri, 17 Sep 2021 11:09:53 +0300
Subject: [PATCH 17/19] UCT/IB: Remove sighandlers in IB cleanup thread

---
 src/ucs/arch/aarch64/cpu.h | 1 +
 src/ucs/arch/ppc64/cpu.h | 1 +
 src/ucs/arch/x86_64/cpu.c | 18 ++++++++++++++++++
 src/ucs/arch/x86_64/cpu.h | 2 ++
 src/uct/ib/base/ib_device.c | 23 +++++++++++++++++++----
 5 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/src/ucs/arch/aarch64/cpu.h b/src/ucs/arch/aarch64/cpu.h
index eda59830a8f..751438c7cc0 100644
--- a/src/ucs/arch/aarch64/cpu.h
+++ b/src/ucs/arch/aarch64/cpu.h
@@ -72,6 +72,7 @@ BEGIN_C_DECLS
 
 /* revert to glibc syscall */
 #define ucs_syscall_raw syscall
+#define ucs_syscall_raw4 syscall
 
 
 /*
diff --git a/src/ucs/arch/ppc64/cpu.h b/src/ucs/arch/ppc64/cpu.h
index 85f4f23c8aa..42769d7481f 100644
--- a/src/ucs/arch/ppc64/cpu.h
+++ b/src/ucs/arch/ppc64/cpu.h
@@ -41,6 +41,7 @@ BEGIN_C_DECLS
 
 /* revert to glibc syscall */
 #define ucs_syscall_raw syscall
+#define ucs_syscall_raw4 syscall
 
 
 static inline uint64_t ucs_arch_read_hres_clock()
diff --git a/src/ucs/arch/x86_64/cpu.c b/src/ucs/arch/x86_64/cpu.c
index 90d64870791..cd9a7cb7c18 100644
--- a/src/ucs/arch/x86_64/cpu.c
+++ b/src/ucs/arch/x86_64/cpu.c
@@ -772,4 +772,22 @@ int ucs_syscall_raw(unsigned long num, unsigned long arg1, unsigned long arg2,
     return ret;
 }
 
+int ucs_syscall_raw4(unsigned long num, unsigned long arg1, unsigned long arg2,
+                     unsigned long arg3, unsigned long arg4)
+{
+    int ret;
+
+    asm volatile (
+        "movq %1, %%rax\n\t"
+        "movq %2, %%rdi\n\t"
+        "movq %3, %%rsi\n\t"
+        "movq %4, %%rdx\n\t"
+        "movq %5, %%r10\n\t"
+        "syscall\n\t"
+        :"=a"(ret)
+        :"r"(num), "r"(arg1), "r"(arg2), "r"(arg3), "r"(arg4));
+
+    return ret;
+}
+
 #endif
diff --git a/src/ucs/arch/x86_64/cpu.h b/src/ucs/arch/x86_64/cpu.h
index bdab8c22853..e32150a1d92 100644
--- a/src/ucs/arch/x86_64/cpu.h
+++ b/src/ucs/arch/x86_64/cpu.h
@@ -54,6 +54,8 @@ ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes);
 void ucs_x86_memcpy_sse_movntdqa(void *dst, const void *src, size_t len);
 int ucs_syscall_raw(unsigned long num, unsigned long arg1, unsigned long arg2,
                     unsigned long arg3);
+int ucs_syscall_raw4(unsigned long num, unsigned long arg1, unsigned long arg2,
+                     unsigned long arg3, unsigned long arg4);
 
 
 static UCS_F_ALWAYS_INLINE int ucs_arch_x86_rdtsc_enabled()
 {
diff --git a/src/uct/ib/base/ib_device.c b/src/uct/ib/base/ib_device.c
index 3d8621c04b5..c2803c60e08 100644
--- a/src/uct/ib/base/ib_device.c
+++ b/src/uct/ib/base/ib_device.c
@@ -807,19 +807,34 @@ ucs_status_t uct_ib_device_query(uct_ib_device_t *dev,
     return UCS_OK;
 }
 
+/* Based on include/linux/signal.h from kernel source */
+struct uct_ib_sigaction {
+    void *uapi_sa_handler;
+    uint64_t uapi_sa_flags;
+    void *uapi_sa_restorer;
+    uint64_t uapi_sa_mask;
+    uint64_t pad; /* glibc compatibility */
+};
+
 static int uct_ib_device_cleanup_proc(void* arg)
 {
     uct_ib_device_nb_close_ctx *ctx = arg;
     static const char *process_name = "ucx_cleanup";
+    struct uct_ib_sigaction dfl = { SIG_DFL };
     char dummy;
-    int fd;
+    int i;
+
+    for (i = 1; i <= SIGUSR2; i++) {
+        ucs_syscall_raw4(SYS_rt_sigaction, i, (long)&dfl, 0,
+                         sizeof(dfl.uapi_sa_mask));
+    }
 
     /* Since TLS of this thread is uninitialized avoid using glibc */
     ucs_syscall_raw(SYS_prctl, PR_SET_NAME, (long)process_name, 0);
 
-    for (fd = 0; fd < ctx->max_fds; fd++) {
-        if ((fd != ctx->cmd_fd) && (fd != ctx->pipefds[0])) {
-            ucs_syscall_raw(SYS_close, fd, 0, 0);
+    for (i = 0; i < ctx->max_fds; i++) {
+        if ((i != ctx->cmd_fd) && (i != ctx->pipefds[0])) {
+            ucs_syscall_raw(SYS_close, i, 0, 0);
         }
     }
 
     /* Wait until pipe closed - either parent terminated or closing device */

From b0b9a58fd355f8179a316404ba5f5c895f368d7a Mon Sep 17 00:00:00 2001
From: Evgeny Leksikov
Date: Tue, 11 Jan 2022 11:44:02 +0200
Subject: [PATCH 18/19] UCT/IB: handle IBV_EVENT_DEVICE_FATAL

---
 src/uct/ib/base/ib_device.c | 43 ++++++++++++++++++++++++++---------
 src/uct/ib/base/ib_device.h | 1 +
 src/uct/ib/ud/accel/ud_mlx5.c | 7 ++++++
 3 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/src/uct/ib/base/ib_device.c b/src/uct/ib/base/ib_device.c
index c2803c60e08..01139f8ad2d 100644
--- a/src/uct/ib/base/ib_device.c
+++ b/src/uct/ib/base/ib_device.c
@@ -284,22 +284,42 @@ uct_ib_device_async_event_schedule_callback(uct_ib_device_t *dev,
                                             wait_ctx, 0);
 }
 
+static void
+uct_ib_device_async_event_dispatch_nolock(uct_ib_device_t *dev,
+                                          const uct_ib_async_event_t *event)
+{
+    khiter_t iter = kh_get(uct_ib_async_event, &dev->async_events_hash, *event);
+    uct_ib_async_event_val_t *entry;
+
+    if (iter == kh_end(&dev->async_events_hash)) {
+        return;
+    }
+
+    entry = &kh_value(&dev->async_events_hash, iter);
+    entry->fired = 1;
+    if (entry->wait_ctx != NULL) {
+        uct_ib_device_async_event_schedule_callback(dev, entry->wait_ctx);
+    }
+}
+
 static void uct_ib_device_async_event_dispatch(uct_ib_device_t *dev,
                                                const uct_ib_async_event_t *event)
 {
-    uct_ib_async_event_val_t *entry;
-    khiter_t iter;
+    ucs_spin_lock(&dev->async_event_lock);
+    uct_ib_device_async_event_dispatch_nolock(dev, event);
+    ucs_spin_unlock(&dev->async_event_lock);
+}
+
+static void
+uct_ib_device_async_event_dispatch_fatal(uct_ib_device_t *dev)
+{
+    uct_ib_async_event_t event;
 
     ucs_spin_lock(&dev->async_event_lock);
-    iter = kh_get(uct_ib_async_event, &dev->async_events_hash, *event);
-    if (iter != kh_end(&dev->async_events_hash)) {
-        entry = &kh_value(&dev->async_events_hash, iter);
-        entry->fired = 1;
-        if (entry->wait_ctx != NULL) {
-            uct_ib_device_async_event_schedule_callback(dev, entry->wait_ctx);
-        }
-    }
+    dev->flags |= UCT_IB_DEVICE_FAILED;
+    kh_foreach_key(&dev->async_events_hash, event,
+                   uct_ib_device_async_event_dispatch_nolock(dev, &event));
     ucs_spin_unlock(&dev->async_event_lock);
 }
 
@@ -509,9 +529,10 @@ void uct_ib_handle_async_event(uct_ib_device_t *dev, uct_ib_async_event_t *event
         level = UCS_LOG_LEVEL_DEBUG;
         break;
     case IBV_EVENT_DEVICE_FATAL:
+        uct_ib_device_async_event_dispatch_fatal(dev);
         snprintf(event_info, sizeof(event_info), "%s on port %d",
                  ibv_event_type_str(event->event_type), event->port_num);
-        level = UCS_LOG_LEVEL_ERROR;
+        level = UCS_LOG_LEVEL_DIAG;
         break;
     case IBV_EVENT_PORT_ACTIVE:
     case IBV_EVENT_PORT_ERR:
diff --git a/src/uct/ib/base/ib_device.h b/src/uct/ib/base/ib_device.h
index 81a467322f3..6d2eb319ef7 100644
--- a/src/uct/ib/base/ib_device.h
+++ b/src/uct/ib/base/ib_device.h
@@ -88,6 +88,7 @@ enum {
     UCT_IB_DEVICE_FLAG_DC = UCT_IB_DEVICE_FLAG_DC_V1 | UCT_IB_DEVICE_FLAG_DC_V2, /* Device supports DC */
     UCT_IB_DEVICE_FLAG_ODP_IMPLICIT = UCS_BIT(9),
+    UCT_IB_DEVICE_FAILED = UCS_BIT(10) /* Got fatal error */
 };
 
diff --git a/src/uct/ib/ud/accel/ud_mlx5.c b/src/uct/ib/ud/accel/ud_mlx5.c
index 91b809e5e36..3d2ce530451 100644
--- a/src/uct/ib/ud/accel/ud_mlx5.c
+++ b/src/uct/ib/ud/accel/ud_mlx5.c
@@ -686,6 +686,13 @@ static ucs_status_t uct_ud_mlx5_iface_arm_cq(uct_ib_iface_t *ib_iface,
                                              int solicited)
 {
     uct_ud_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_ud_mlx5_iface_t);
+    uct_ib_mlx5_md_t *ib_md = ucs_derived_of(ib_iface->super.md,
+                                             uct_ib_mlx5_md_t);
+
+    if (ucs_unlikely(ib_md->super.dev.flags & UCT_IB_DEVICE_FAILED)) {
+        return UCS_OK;
+    }
+
 #if HAVE_DECL_MLX5DV_INIT_OBJ
     return uct_ib_mlx5dv_arm_cq(&iface->cq[dir], solicited);
 #else

From 50d9937d11b57ddccbcba53e49ca1375cce1c0b0 Mon Sep 17 00:00:00 2001
From: Evgeny Leksikov
Date: Mon, 11 Apr 2022 14:24:40 +0300
Subject: [PATCH 19/19] UCX: bump extra version to pre1

---
 configure.ac | 8 ++++----
 ucx.spec.in  | 2 ++
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/configure.ac b/configure.ac
index f183232baab..da167b6f8db 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,10 +8,10 @@
 #
 
 AC_PREREQ([2.63])
-define([ucx_ver_major], 1)     # Major version. Usually does not change.
-define([ucx_ver_minor], 13)    # Minor version. Increased for each release.
-define([ucx_ver_patch], 0)     # Patch version. Increased for a bugfix release.
-define([ucx_ver_extra], )      # Extra version string. Empty for a general release.
+define([ucx_ver_major], 1)     # Major version. Usually does not change.
+define([ucx_ver_minor], 13)    # Minor version. Increased for each release.
+define([ucx_ver_patch], 0)     # Patch version. Increased for a bugfix release.
+define([ucx_ver_extra], .pre1) # Extra version string. Empty for a general release.
 define([ts], esyscmd([sh -c "date +%Y%m%d%H%M%S"]))
 
diff --git a/ucx.spec.in b/ucx.spec.in
index e98bc32c32b..bf617f38eb0 100644
--- a/ucx.spec.in
+++ b/ucx.spec.in
@@ -349,6 +349,8 @@ library internals, protocol objects, transports status, and more.
 %endif
 
 %changelog
+* Wed Apr 27 2022 Evgeny Leksikov 1.13.0.pre1
+- Bump version to 1.13.0.pre1
 * Wed Nov 10 2021 Yossi Itigin 1.13.0-1
 - Bump version to 1.13.0
 * Wed Jun 9 2021 Yossi Itigin 1.12.0-1