Merged
Commits
27 commits
ece8612
UCP/PERF: Addressed PR comments
iyastreb Sep 24, 2025
19d307e
UCP/PERF: Replaced popcount with int field to improve performance
iyastreb Sep 24, 2025
8f1367e
UCP/PERF: Refactored dispatch macro
iyastreb Sep 24, 2025
7009472
UCP/PERF: Support for warp level
iyastreb Sep 24, 2025
9f92aa6
UCP/PERF: Added warp tests in CI
iyastreb Sep 24, 2025
3bab527
UCP/PERF: Cleanup
iyastreb Sep 26, 2025
825f7de
UCP/PERF: Speed up intermediate report
iyastreb Sep 30, 2025
2816236
UCP/PERF: Reduce progress latency by 2us
iyastreb Sep 30, 2025
05f7b20
UCP/PERF: Fix for ULL
iyastreb Sep 30, 2025
7eca345
UCP/PERF: Allow NULL comp
iyastreb Oct 1, 2025
36fe9e3
UCP/PERF: Merged with master
iyastreb Oct 6, 2025
f3e93e2
UCP/PERF: Merged with master
iyastreb Oct 9, 2025
13c2d8b
UCP/PERF: Flow control window
iyastreb Oct 15, 2025
df5ba8f
UCP/PERF: Flow control
iyastreb Oct 15, 2025
3928d6e
UCP/PERF: Fix
iyastreb Oct 15, 2025
6a393a0
UCP/PERF: Added pending_map
iyastreb Oct 15, 2025
03959ee
UCP: Coverity fix
iyastreb Oct 15, 2025
46633a4
UCP/PERF: Removed unused var
iyastreb Oct 16, 2025
03b7da4
UCP/PERF: Disable CI perftest
iyastreb Oct 17, 2025
464887d
UCP: Merge branch 'master' into ucp-perf-warp-fixes
iyastreb Oct 27, 2025
1f59fbd
UCP: Merged with master
iyastreb Oct 28, 2025
59ea068
PERF: Use __threadfence_system
iyastreb Oct 29, 2025
8d28eb6
UCP/PERF: CR fixes
iyastreb Oct 30, 2025
d675046
UCP/PERF: Merge branch 'master' into ucp-perf-warp-fixes
iyastreb Nov 3, 2025
9313ce3
UCP/PERF: Enabled jenkins tests
iyastreb Nov 3, 2025
ca1a574
UCP: Merge branch 'master' into ucp-perf-warp-fixes
iyastreb Nov 6, 2025
fe5a631
UCP/PERF: CR fixes
iyastreb Nov 6, 2025
21 changes: 14 additions & 7 deletions contrib/ucx_perftest_config/test_types_ucp_device_cuda
@@ -1,6 +1,7 @@
#
# UCP basic device cuda tests
#
ucp_device_cuda_single_lat_8b_1thread -t ucp_put_single_lat -m cuda -s 8 -n 10000
ucp_device_cuda_single_bw_1k_1thread -t ucp_put_single_bw -m cuda -s 1024 -n 10000
ucp_device_cuda_single_lat_1k_1thread -t ucp_put_single_lat -m cuda -s 1024 -n 10000
ucp_device_cuda_multi_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000
@@ -11,10 +12,16 @@ ucp_device_cuda_partial_lat_1k_1thread -t ucp_put_partial_lat -m cuda -s 2
# Increase number of threads after following fixes:
# - Use thread-local memory instead of shared for requests (limit 48K)
# - Fix WQE size limit of 1024
# TODO - enable when wqe reserve is fixed.
# ucp_device_cuda_single_bw_1k_32threads -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32
# ucp_device_cuda_single_lat_1k_32threads -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32
# ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
# ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
# ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
# ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
ucp_device_cuda_single_bw_1k_32threads -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32
ucp_device_cuda_single_lat_1k_32threads -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32
ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2

ucp_device_cuda_single_bw_1k_1warp -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32 -L warp
ucp_device_cuda_single_lat_1k_1warp -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32 -L warp
ucp_device_cuda_multi_bw_1k_1warp -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -L warp
ucp_device_cuda_multi_lat_1k_1warp -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -L warp
ucp_device_cuda_partial_bw_1k_1warp -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -L warp
ucp_device_cuda_partial_lat_1k_1warp -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -L warp
8 changes: 5 additions & 3 deletions src/tools/perf/api/libperf.h
@@ -166,6 +166,7 @@ typedef enum {

#define UCX_PERF_MEM_DEV_DEFAULT -1

#define UCP_PERF_FC_WINDOW_DEFAULT 4

/**
* Performance counter type.
@@ -266,9 +267,9 @@ typedef struct ucx_perf_params {
ucx_perf_wait_mode_t wait_mode; /* How to wait */
ucs_memory_type_t send_mem_type; /* Send memory type */
ucs_memory_type_t recv_mem_type; /* Recv memory type */
ucx_perf_accel_dev_t send_device; /* Send memory device for gdaki */
ucx_perf_accel_dev_t recv_device; /* Recv memory device for gdaki */
ucs_device_level_t device_level; /* Device level for gdaki */
ucx_perf_accel_dev_t send_device; /* Send memory device */
ucx_perf_accel_dev_t recv_device; /* Recv memory device */
ucs_device_level_t device_level; /* Device level */
unsigned flags; /* See ucx_perf_test_flags. */

size_t *msg_size_list; /* Test message sizes list. The size
@@ -289,6 +290,7 @@ typedef struct ucx_perf_params {
in latency tests */
unsigned device_thread_count; /* Number of device threads */
unsigned device_block_count; /* Number of device blocks */
unsigned device_fc_window; /* Flow control window size for device tests */

void *rte_group; /* Opaque RTE group handle */
ucx_perf_rte_t *rte; /* RTE functions used to exchange data */
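
The new device_fc_window field (defaulting to UCP_PERF_FC_WINDOW_DEFAULT) lets device tests track one completion request per window of outstanding operations rather than one per operation, which shrinks the per-index request storage. Below is a minimal host-side sketch of that sizing arithmetic; the helper name is hypothetical, and the rounding mirrors the ucs_div_round_up() call in the kernel dispatch macro further down.

/* Hypothetical helper: request slots needed per device index when up to
 * device_fc_window outstanding operations share one tracked request. */
static inline unsigned
perf_device_request_slots(unsigned max_outstanding, unsigned device_fc_window)
{
    /* Same round-up as ucs_div_round_up() in UCX_PERF_KERNEL_DISPATCH_CMD_LEVEL */
    return (max_outstanding + device_fc_window - 1) / device_fc_window;
}

/* Example: max_outstanding = 16, device_fc_window = 4 -> 4 request slots */
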
172 changes: 95 additions & 77 deletions src/tools/perf/cuda/cuda_kernel.cuh
@@ -18,6 +18,7 @@ typedef unsigned long long ucx_perf_cuda_time_t;

struct ucx_perf_cuda_context {
unsigned max_outstanding;
unsigned device_fc_window;
ucx_perf_counter_t max_iters;
ucx_perf_cuda_time_t report_interval_ns;
ucx_perf_counter_t completed_iters;
@@ -32,22 +33,46 @@ UCS_F_DEVICE ucx_perf_cuda_time_t ucx_perf_cuda_get_time_ns()
return globaltimer;
}

UCS_F_DEVICE void
ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx,
ucx_perf_counter_t completed,
ucx_perf_counter_t max_iters,
ucx_perf_cuda_time_t &last_report_time)
{
if (threadIdx.x == 0) {
ucx_perf_cuda_time_t current_time = ucx_perf_cuda_get_time_ns();
if (((current_time - last_report_time) >= ctx.report_interval_ns) ||
(completed >= max_iters)) {
ctx.completed_iters = completed;
last_report_time = current_time;
__threadfence();
class ucx_perf_cuda_reporter {
public:
/* Number of updates per report interval */
static const unsigned UPDATES_PER_INTERVAL = 5;

__device__
ucx_perf_cuda_reporter(ucx_perf_cuda_context &ctx) :
m_ctx(ctx),
m_max_iters(ctx.max_iters),
m_next_report_iter(1),
m_last_report_time(ucx_perf_cuda_get_time_ns()),
m_report_interval_ns(ctx.report_interval_ns / UPDATES_PER_INTERVAL)
{
}

__device__ inline void
update_report(ucx_perf_counter_t completed)
{
if ((threadIdx.x == 0) && ucs_unlikely(completed >= m_next_report_iter)) {
assert(completed - m_ctx.completed_iters > 0);
ucx_perf_cuda_time_t cur_time = ucx_perf_cuda_get_time_ns();
ucx_perf_cuda_time_t iter_time = (cur_time - m_last_report_time) /
(completed - m_ctx.completed_iters);
assert(iter_time > 0);
m_last_report_time = cur_time;
m_ctx.completed_iters = completed;
__threadfence_system();

m_next_report_iter = ucs_min(completed + (m_report_interval_ns / iter_time),
m_max_iters);
}
}
}

private:
ucx_perf_cuda_context &m_ctx;
ucx_perf_counter_t m_max_iters;
ucx_perf_counter_t m_next_report_iter;
ucx_perf_cuda_time_t m_last_report_time;
ucx_perf_cuda_time_t m_report_interval_ns;
};
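
The reporter replaces the old per-call time check with a predicted next-report iteration, so most update_report() calls reduce to a single comparison on thread 0. A hypothetical kernel skeleton showing the intended call pattern follows; the kernel name and its body are placeholders, not part of this patch.

__global__ void example_progress_kernel(ucx_perf_cuda_context *ctx)
{
    ucx_perf_cuda_reporter reporter(*ctx);

    for (ucx_perf_counter_t iter = 1; iter <= ctx->max_iters; ++iter) {
        /* ... post one operation and wait for its completion here ... */
        reporter.update_report(iter); /* cheap unless a report is due */
    }
    /* m_next_report_iter is clamped to max_iters, so the final count is
     * published to the host through ctx->completed_iters. */
}
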

static UCS_F_ALWAYS_INLINE uint64_t *
ucx_perf_cuda_get_sn(const void *address, size_t length)
@@ -63,95 +88,83 @@ UCS_F_DEVICE void ucx_perf_cuda_wait_sn(const uint64_t *sn, uint64_t value)
__syncthreads();
}

/* Simple bitset */
#define UCX_BIT_MASK(bit) (1 << ((bit) & (CHAR_BIT - 1)))
#define UCX_BIT_SET(set, bit) (set[(bit)/CHAR_BIT] |= UCX_BIT_MASK(bit))
#define UCX_BIT_RESET(set, bit) (set[(bit)/CHAR_BIT] &= ~UCX_BIT_MASK(bit))
#define UCX_BIT_GET(set, bit) (set[(bit)/CHAR_BIT] & UCX_BIT_MASK(bit))
#define UCX_BITSET_SIZE(bits) ((bits + CHAR_BIT - 1) / CHAR_BIT)

UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {
size_t count = 0;
for (size_t i = 0; i < bits; i++) {
if (UCX_BIT_GET(set, i)) {
count++;
}
}
return count;
}

UCS_F_DEVICE size_t
ucx_bitset_ffns(const uint8_t *set, size_t bits, size_t from)
template<ucs_device_level_t level>
__host__ UCS_F_DEVICE unsigned ucx_perf_cuda_thread_index(size_t tid)
{
for (size_t i = from; i < bits; i++) {
if (!UCX_BIT_GET(set, i)) {
return i;
}
switch (level) {
case UCS_DEVICE_LEVEL_THREAD: return tid;
case UCS_DEVICE_LEVEL_WARP: return tid / UCS_DEVICE_NUM_THREADS_IN_WARP;
default: return 0;
}
return bits;
}

#define UCX_KERNEL_CMD(level, cmd, blocks, threads, shared_size, func, ...) \
do { \
switch (cmd) { \
case UCX_PERF_CMD_PUT_SINGLE: \
func<level, UCX_PERF_CMD_PUT_SINGLE><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
break; \
case UCX_PERF_CMD_PUT_MULTI: \
func<level, UCX_PERF_CMD_PUT_MULTI><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
break; \
case UCX_PERF_CMD_PUT_PARTIAL: \
func<level, UCX_PERF_CMD_PUT_PARTIAL><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
break; \
default: \
ucs_error("Unsupported cmd: %d", cmd); \
break; \
} \
} while (0)
#define UCX_PERF_THREAD_INDEX_SET(_level, _tid, _outval) \
(_outval) = ucx_perf_cuda_thread_index<_level>(_tid)
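
ucx_perf_cuda_thread_index() collapses a CUDA thread id (or, when passed a thread count, the launch width) into a per-message-stream index: every thread gets its own stream at thread level, while a whole warp shares one at warp level. A short host-side illustration, assuming UCS_DEVICE_NUM_THREADS_IN_WARP is 32; the function and variable names are placeholders.

/* With 32 launched threads, thread level yields 32 independent streams,
 * warp level yields a single cooperative one (32 / 32). */
static inline void example_stream_counts()
{
    unsigned per_thread = ucx_perf_cuda_thread_index<UCS_DEVICE_LEVEL_THREAD>(32); /* 32 */
    unsigned per_warp   = ucx_perf_cuda_thread_index<UCS_DEVICE_LEVEL_WARP>(32);   /*  1 */
    (void)per_thread;
    (void)per_warp;
}
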

#define UCX_PERF_SWITCH_CMD(_cmd, _func, ...) \
switch (_cmd) { \
case UCX_PERF_CMD_PUT_SINGLE: \
_func(UCX_PERF_CMD_PUT_SINGLE, __VA_ARGS__); \
break; \
case UCX_PERF_CMD_PUT_MULTI: \
_func(UCX_PERF_CMD_PUT_MULTI, __VA_ARGS__); \
break; \
case UCX_PERF_CMD_PUT_PARTIAL: \
_func(UCX_PERF_CMD_PUT_PARTIAL, __VA_ARGS__); \
break; \
default: \
ucs_error("Unsupported cmd: %d", _cmd); \
break; \
}

#define UCX_KERNEL_DISPATCH(perf, func, ...) \
do { \
ucs_device_level_t _level = perf.params.device_level; \
ucx_perf_cmd_t _cmd = perf.params.command; \
unsigned _blocks = perf.params.device_block_count; \
unsigned _threads = perf.params.device_thread_count; \
size_t _shared_size = _threads * perf.params.max_outstanding * \
sizeof(ucp_device_request_t); \
switch (_level) { \
#define UCX_PERF_SWITCH_LEVEL(_level, _func, ...) \
switch (_level) { \
case UCS_DEVICE_LEVEL_THREAD: \
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads,\
_shared_size, func, __VA_ARGS__); \
_func(UCS_DEVICE_LEVEL_THREAD, __VA_ARGS__); \
break; \
case UCS_DEVICE_LEVEL_WARP: \
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads,\
_shared_size, func, __VA_ARGS__); \
_func(UCS_DEVICE_LEVEL_WARP, __VA_ARGS__); \
break; \
case UCS_DEVICE_LEVEL_BLOCK: \
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads,\
_shared_size, func, __VA_ARGS__); \
break; \
case UCS_DEVICE_LEVEL_GRID: \
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads,\
_shared_size, func, __VA_ARGS__); \
break; \
default: \
ucs_error("Unsupported level: %d", _level); \
break; \
} \
}

#define UCX_PERF_KERNEL_DISPATCH_CMD_LEVEL(_cmd, _level, _perf, _kernel, ...) \
do { \
unsigned _blocks = _perf.params.device_block_count; \
unsigned _threads = _perf.params.device_thread_count; \
unsigned _reqs_count = ucs_div_round_up(_perf.params.max_outstanding, \
_perf.params.device_fc_window); \
size_t _shared_size = _reqs_count * sizeof(ucp_device_request_t) * \
ucx_perf_cuda_thread_index<_level>(_threads); \
_kernel<_level, _cmd><<<_blocks, _threads, _shared_size>>>(__VA_ARGS__); \
} while (0)

#define UCX_PERF_KERNEL_DISPATCH_CMD(_level, _perf, _kernel, ...) \
UCX_PERF_SWITCH_CMD(_perf.params.command, UCX_PERF_KERNEL_DISPATCH_CMD_LEVEL, \
_level, _perf, _kernel, __VA_ARGS__);

#define UCX_PERF_KERNEL_DISPATCH(_perf, _kernel, ...) \
UCX_PERF_SWITCH_LEVEL(_perf.params.device_level, UCX_PERF_KERNEL_DISPATCH_CMD, \
_perf, _kernel, __VA_ARGS__);
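
UCX_PERF_KERNEL_DISPATCH resolves the runtime device level and command from perf.params into the matching <level, cmd> template instantiation and launches it with a shared-memory size derived from the flow-control window. A hypothetical call site is sketched below; the kernel and wrapper names are placeholders.

template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
__global__ void example_put_kernel(ucx_perf_cuda_context *ctx);

static void example_launch(ucx_perf_context_t &perf, ucx_perf_cuda_context *gpu_ctx)
{
    /* Expands to a switch over device_level, then over command, and finally to
     * example_put_kernel<level, cmd><<<blocks, threads, shared_size>>>(gpu_ctx) */
    UCX_PERF_KERNEL_DISPATCH(perf, example_put_kernel, gpu_ctx);
}
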


class ucx_perf_cuda_test_runner {
public:
ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf)
{
init_ctx();

m_cpu_ctx->max_outstanding = perf.params.max_outstanding;
m_cpu_ctx->device_fc_window = perf.params.device_fc_window;
m_cpu_ctx->max_iters = perf.max_iter;
m_cpu_ctx->completed_iters = 0;
m_cpu_ctx->report_interval_ns = (perf.report_interval == ULONG_MAX) ?
ULONG_MAX :
ucs_time_to_nsec(perf.report_interval) / 100;
ucs_time_to_nsec(perf.report_interval);
m_cpu_ctx->status = UCS_ERR_NOT_IMPLEMENTED;
}

@@ -166,11 +179,16 @@ public:
ucx_perf_counter_t last_completed = 0;
ucx_perf_counter_t completed = m_cpu_ctx->completed_iters;
unsigned thread_count = m_perf.params.device_thread_count;
ucs_device_level_t level = m_perf.params.device_level;
unsigned msgs_per_iter;
UCX_PERF_SWITCH_LEVEL(level, UCX_PERF_THREAD_INDEX_SET, thread_count,
msgs_per_iter);

while (true) {
ucx_perf_counter_t delta = completed - last_completed;
if (delta > 0) {
// TODO: calculate latency percentile on kernel
ucx_perf_update(&m_perf, delta, delta * thread_count, msg_length);
ucx_perf_update(&m_perf, delta, delta * msgs_per_iter, msg_length);
} else if (completed >= m_perf.max_iter) {
break;
}