-
Notifications
You must be signed in to change notification settings - Fork 506
UCT/GDA: Collapsed CQ #10959
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
UCT/GDA: Collapsed CQ #10959
Changes from all commits
5133a6b
53139a0
f00182e
0f0e3ec
60db513
452dbfc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -93,29 +93,99 @@ template<ucs_device_level_t level> UCS_F_DEVICE void uct_rc_mlx5_gda_sync(void) | |||||||||||||
| } | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| UCS_F_DEVICE uint64_t | ||||||||||||||
| uct_rc_mlx5_gda_reserv_wqe_thread(uct_rc_gdaki_dev_ep_t *ep, unsigned count) | ||||||||||||||
| UCS_F_DEVICE uint16_t uct_rc_mlx5_gda_bswap16(uint16_t x) | ||||||||||||||
| { | ||||||||||||||
| /* Try to reserve optimistically */ | ||||||||||||||
| int32_t prev = atomicAdd(&ep->avail_count, -(int32_t)count); | ||||||||||||||
| if (prev < (int32_t)count) { | ||||||||||||||
| /* Rollback */ | ||||||||||||||
| atomicAdd(&ep->avail_count, count); | ||||||||||||||
| uint32_t ret; | ||||||||||||||
| asm volatile("{\n\t" | ||||||||||||||
| ".reg .b32 mask;\n\t" | ||||||||||||||
| ".reg .b32 ign;\n\t" | ||||||||||||||
| "mov.b32 mask, 0x1;\n\t" | ||||||||||||||
| "prmt.b32 %0, %1, ign, mask;\n\t" | ||||||||||||||
| "}" | ||||||||||||||
| : "=r"(ret) | ||||||||||||||
| : "r"((uint32_t)x)); | ||||||||||||||
| return ret; | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| UCS_F_DEVICE uint64_t uct_rc_mlx5_gda_parse_cqe(uct_rc_gdaki_dev_ep_t *ep, | ||||||||||||||
| uint16_t *wqe_cnt, | ||||||||||||||
| uint8_t *opcode) | ||||||||||||||
| { | ||||||||||||||
| auto *cqe64 = reinterpret_cast<mlx5_cqe64*>(ep->cqe_daddr); | ||||||||||||||
| uint32_t *data_ptr = (uint32_t*)&cqe64->wqe_counter; | ||||||||||||||
| uint32_t data = READ_ONCE(*data_ptr); | ||||||||||||||
| uint64_t rsvd_idx = READ_ONCE(ep->sq_rsvd_index); | ||||||||||||||
|
|
||||||||||||||
| *wqe_cnt = uct_rc_mlx5_gda_bswap16(data); | ||||||||||||||
| if (opcode != nullptr) { | ||||||||||||||
| *opcode = data >> 28; | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| return rsvd_idx - ((rsvd_idx - *wqe_cnt) & 0xffff); | ||||||||||||||
ofirfarjun7 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| UCS_F_DEVICE uint64_t uct_rc_mlx5_gda_max_alloc_wqe_base( | ||||||||||||||
| uct_rc_gdaki_dev_ep_t *ep, unsigned count) | ||||||||||||||
| { | ||||||||||||||
| uint16_t wqe_cnt; | ||||||||||||||
| uint64_t pi; | ||||||||||||||
|
|
||||||||||||||
| pi = uct_rc_mlx5_gda_parse_cqe(ep, &wqe_cnt, nullptr); | ||||||||||||||
| return pi + ep->sq_wqe_num + 1 - count; | ||||||||||||||
| } | ||||||||||||||
|
Comment on lines
+133
to
+135
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fix off-by-one in max alloc calculation:
- return pi + ep->sq_wqe_num + 1 - count;
+ return pi + ep->sq_wqe_num - count;
📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||
|
|
||||||||||||||
| UCS_F_DEVICE uint64_t uct_rc_mlx5_gda_reserv_wqe_thread( | ||||||||||||||
| uct_rc_gdaki_dev_ep_t *ep, unsigned count) | ||||||||||||||
| { | ||||||||||||||
| /* Do not attempt to reserve if the available space is less than the | ||||||||||||||
| * requested count, to avoid starvation of threads trying to rollback the | ||||||||||||||
| * reservation with atomicCAS. */ | ||||||||||||||
| uint64_t max_wqe_base = uct_rc_mlx5_gda_max_alloc_wqe_base(ep, count); | ||||||||||||||
| if (ep->sq_rsvd_index > max_wqe_base) { | ||||||||||||||
| return UCT_RC_GDA_RESV_WQE_NO_RESOURCE; | ||||||||||||||
| } | ||||||||||||||
|
Comment on lines
+144
to
146
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. logic: race condition:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe add READ_ONCE to make sure the value is not cached?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it fixed? |
||||||||||||||
|
|
||||||||||||||
| /* We own count elements, now can safely increment the reserved index */ | ||||||||||||||
| return atomicAdd(reinterpret_cast<unsigned long long*>(&ep->sq_rsvd_index), | ||||||||||||||
| count); | ||||||||||||||
| uint64_t wqe_base = atomicAdd(reinterpret_cast<unsigned long long*>( | ||||||||||||||
| &ep->sq_rsvd_index), | ||||||||||||||
| static_cast<unsigned long long>(count)); | ||||||||||||||
|
|
||||||||||||||
| /* | ||||||||||||||
| * Attempt to reserve 'count' WQEs by atomically incrementing the reserved | ||||||||||||||
| * index. If the reservation exceeds the available space in the work queue, | ||||||||||||||
| * enter a rollback loop. | ||||||||||||||
| * | ||||||||||||||
| * Rollback Logic: | ||||||||||||||
| * - Calculate the next potential index (wqe_next) after attempting the | ||||||||||||||
| * reservation. | ||||||||||||||
| * - Use atomic CAS to check if the current reserved index matches wqe_next. | ||||||||||||||
| * If it does, revert the reservation by resetting the reserved index to | ||||||||||||||
| * wqe_base. | ||||||||||||||
| * - A successful CAS indicates no other thread has modified the reserved | ||||||||||||||
| * index, allowing the rollback to complete, and the function returns | ||||||||||||||
| * UCT_RC_GDA_RESV_WQE_NO_RESOURCE to signal insufficient resources. | ||||||||||||||
| * - If CAS fails, it means another thread has modified the reserved index. | ||||||||||||||
| * The loop continues to reevaluate resource availability to determine if | ||||||||||||||
| * the reservation can now be satisfied, possibly due to other operations | ||||||||||||||
| * freeing up resources. | ||||||||||||||
| */ | ||||||||||||||
| while (wqe_base > max_wqe_base) { | ||||||||||||||
| uint64_t wqe_next = wqe_base + count; | ||||||||||||||
| if (atomicCAS(reinterpret_cast<unsigned long long*>(&ep->sq_rsvd_index), | ||||||||||||||
| wqe_next, wqe_base) == wqe_next) { | ||||||||||||||
| return UCT_RC_GDA_RESV_WQE_NO_RESOURCE; | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| max_wqe_base = uct_rc_mlx5_gda_max_alloc_wqe_base(ep, count); | ||||||||||||||
| } | ||||||||||||||
|
Comment on lines
+171
to
+179
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. logic: potential livelock: if
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||||||||||||
|
|
||||||||||||||
| return wqe_base; | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| template<ucs_device_level_t level> | ||||||||||||||
| UCS_F_DEVICE void | ||||||||||||||
| uct_rc_mlx5_gda_reserv_wqe(uct_rc_gdaki_dev_ep_t *ep, unsigned count, | ||||||||||||||
| unsigned lane_id, uint64_t &wqe_base) | ||||||||||||||
| { | ||||||||||||||
| wqe_base = 0; | ||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I intentionally added zero initialization to avoid a crash with syndrome 68.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why does it cause this crash, and how does this initialization prevent it?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, this one was quite tricky, and I also struggled to understand it. This issue was not always reproducible on rock, but it quite frequently failed in CI with syndrome 68. |
||||||||||||||
|
|
||||||||||||||
| if (lane_id == 0) { | ||||||||||||||
| wqe_base = uct_rc_mlx5_gda_reserv_wqe_thread(ep, count); | ||||||||||||||
| } | ||||||||||||||
|
|
@@ -211,7 +281,7 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_db(uct_rc_gdaki_dev_ep_t *ep, | |||||||||||||
| UCS_F_DEVICE bool | ||||||||||||||
| uct_rc_mlx5_gda_fc(const uct_rc_gdaki_dev_ep_t *ep, uint16_t wqe_idx) | ||||||||||||||
| { | ||||||||||||||
| return (wqe_idx & ep->sq_fc_mask) == 1; | ||||||||||||||
| return !(wqe_idx & ep->sq_fc_mask); | ||||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. logic: flow-control condition inverted from
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Flow control requests a completion at least once per half of the work queue size, |
||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| template<ucs_device_level_t level> | ||||||||||||||
|
|
@@ -494,82 +564,9 @@ uct_rc_mlx5_gda_qedump(const char *pfx, void *buff, ssize_t len) | |||||||||||||
| } | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| UCS_F_DEVICE void uct_rc_mlx5_gda_progress_thread(uct_rc_gdaki_dev_ep_t *ep) | ||||||||||||||
| { | ||||||||||||||
| void *cqe = ep->cqe_daddr; | ||||||||||||||
| size_t cqe_num = ep->cqe_num; | ||||||||||||||
| uint64_t cqe_idx = ep->cqe_ci; | ||||||||||||||
| uint32_t idx = cqe_idx & (cqe_num - 1); | ||||||||||||||
| void *curr_cqe = (uint8_t*)cqe + (idx * DOCA_GPUNETIO_VERBS_CQE_SIZE); | ||||||||||||||
| auto *cqe64 = reinterpret_cast<mlx5_cqe64*>(curr_cqe); | ||||||||||||||
|
|
||||||||||||||
| /* Read last 3 fields with a single atomic operation */ | ||||||||||||||
| uint32_t *data_ptr = (uint32_t *)&cqe64->wqe_counter; | ||||||||||||||
| uint32_t data = READ_ONCE(*data_ptr); | ||||||||||||||
| uint8_t op_owner = data >> 24; | ||||||||||||||
| if ((op_owner & MLX5_CQE_OWNER_MASK) ^ !!(cqe_idx & cqe_num)) { | ||||||||||||||
| return; | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| cuda::atomic_ref<uint64_t, cuda::thread_scope_device> ref(ep->cqe_ci); | ||||||||||||||
| if (!ref.compare_exchange_strong(cqe_idx, cqe_idx + 1, | ||||||||||||||
| cuda::std::memory_order_relaxed)) { | ||||||||||||||
| return; | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| uint8_t opcode = op_owner >> DOCA_GPUNETIO_VERBS_MLX5_CQE_OPCODE_SHIFT; | ||||||||||||||
| uint32_t data_cpu = doca_gpu_dev_verbs_bswap32(data); | ||||||||||||||
| uint16_t wqe_cnt = (data_cpu >> 16) & 0xffff; | ||||||||||||||
| uint16_t wqe_idx = wqe_cnt & (ep->sq_wqe_num - 1); | ||||||||||||||
|
|
||||||||||||||
| cuda::atomic_ref<uint64_t, cuda::thread_scope_device> pi_ref(ep->sq_wqe_pi); | ||||||||||||||
| uint64_t sq_wqe_pi = pi_ref.load(cuda::std::memory_order_relaxed); | ||||||||||||||
| uint64_t new_wqe_pi; | ||||||||||||||
|
|
||||||||||||||
| do { | ||||||||||||||
| /* Skip CQE if it's older than current producer index, could be already | ||||||||||||||
| * processed by another thread. This handles CQE wrap-around. */ | ||||||||||||||
| if ((int16_t)(wqe_cnt - (uint16_t)sq_wqe_pi) < 0) { | ||||||||||||||
| return; | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| uint16_t completed_delta = wqe_cnt - (uint16_t)sq_wqe_pi; | ||||||||||||||
| new_wqe_pi = sq_wqe_pi + completed_delta + 1; | ||||||||||||||
| } while (!pi_ref.compare_exchange_weak(sq_wqe_pi, new_wqe_pi, | ||||||||||||||
| cuda::std::memory_order_release, | ||||||||||||||
| cuda::std::memory_order_relaxed)); | ||||||||||||||
|
|
||||||||||||||
| if (opcode == MLX5_CQE_REQ) { | ||||||||||||||
| atomicAdd(&ep->avail_count, (int32_t)(new_wqe_pi - sq_wqe_pi)); | ||||||||||||||
| return; | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| auto err_cqe = reinterpret_cast<mlx5_err_cqe_ex*>(cqe64); | ||||||||||||||
| auto wqe_ptr = uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx); | ||||||||||||||
| ucs_device_error("CQE[%d] with syndrome:%x vendor:%x hw:%x " | ||||||||||||||
| "wqe_idx:0x%x qp:0x%x", | ||||||||||||||
| idx, err_cqe->syndrome, err_cqe->vendor_err_synd, | ||||||||||||||
| err_cqe->hw_err_synd, wqe_idx, | ||||||||||||||
| doca_gpu_dev_verbs_bswap32(err_cqe->s_wqe_opcode_qpn) & | ||||||||||||||
| 0xffffff); | ||||||||||||||
| uct_rc_mlx5_gda_qedump("WQE", wqe_ptr, 64); | ||||||||||||||
| uct_rc_mlx5_gda_qedump("CQE", cqe64, 64); | ||||||||||||||
| pi_ref.fetch_max(sq_wqe_pi | UCT_RC_GDA_WQE_ERR); | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| template<ucs_device_level_t level> | ||||||||||||||
| UCS_F_DEVICE void uct_rc_mlx5_gda_ep_progress(uct_device_ep_h tl_ep) | ||||||||||||||
| { | ||||||||||||||
| uct_rc_gdaki_dev_ep_t *ep = (uct_rc_gdaki_dev_ep_t*)tl_ep; | ||||||||||||||
| unsigned num_lanes; | ||||||||||||||
| unsigned lane_id; | ||||||||||||||
|
|
||||||||||||||
| uct_rc_mlx5_gda_exec_init<level>(lane_id, num_lanes); | ||||||||||||||
| if (lane_id == 0) { | ||||||||||||||
| uct_rc_mlx5_gda_progress_thread(ep); | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| uct_rc_mlx5_gda_sync<level>(); | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| template<ucs_device_level_t level> | ||||||||||||||
|
|
@@ -578,13 +575,21 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_check_completion( | |||||||||||||
| { | ||||||||||||||
| uct_rc_gdaki_dev_ep_t *ep = reinterpret_cast<uct_rc_gdaki_dev_ep_t*>(tl_ep); | ||||||||||||||
| uct_rc_gda_completion_t *comp = &tl_comp->rc_gda; | ||||||||||||||
| uint64_t sq_wqe_pi = ep->sq_wqe_pi; | ||||||||||||||
| uint16_t wqe_cnt; | ||||||||||||||
| uint8_t opcode; | ||||||||||||||
| uint64_t pi; | ||||||||||||||
|
|
||||||||||||||
| pi = uct_rc_mlx5_gda_parse_cqe(ep, &wqe_cnt, &opcode); | ||||||||||||||
|
|
||||||||||||||
| if ((sq_wqe_pi & UCT_RC_GDA_WQE_MASK) <= comp->wqe_idx) { | ||||||||||||||
| if (pi < comp->wqe_idx) { | ||||||||||||||
| return UCS_INPROGRESS; | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| if (sq_wqe_pi & UCT_RC_GDA_WQE_ERR) { | ||||||||||||||
| if (opcode == MLX5_CQE_REQ_ERR) { | ||||||||||||||
| uint16_t wqe_idx = wqe_cnt & (ep->sq_wqe_num - 1); | ||||||||||||||
| auto wqe_ptr = uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx); | ||||||||||||||
| uct_rc_mlx5_gda_qedump("WQE", wqe_ptr, 64); | ||||||||||||||
| uct_rc_mlx5_gda_qedump("CQE", ep->cqe_daddr, 64); | ||||||||||||||
| return UCS_ERR_IO_ERROR; | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
|
|
||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -68,7 +68,6 @@ struct test_ucp_device_kernel_result_t { | |
| ucs_status_t status; | ||
| uint64_t producer_index; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we keep the producer index and maybe retrieve it from sq_rsvd_index?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We could call uct_rc_mlx5_gda_read_cqe/calc_pi here. |
||
| uint64_t ready_index; | ||
| int32_t avail_count; | ||
| }; | ||
|
|
||
| test_ucp_device_kernel_result_t | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.