
Commit 7fa6733

casteryh authored and facebook-github-bot committed
Fallback when mlx5dv is not supported. (#1665)
Summary: This change adds fallback support when mlx5dv (Mellanox device-specific extensions) is not available for RDMA operations. It modifies the queue pair creation logic to conditionally use either extended mlx5dv-based queue pairs (when supported) or standard ibverbs queue pairs (as fallback). The pt_cuda_alloc flag is updated to require mlx5dv support since it's necessary for merging memory segments when using PyTorch's CUDA allocator. The change adds a new `is_extended` parameter to control whether to create extended or standard queue pairs at runtime. Adds an env variable `MONARCH_RDMA_MLX5DV_DISABLED` to test the new code path on dev machine. Differential Revision: D85504061
1 parent c647c1c commit 7fa6733
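
For orientation, a minimal sketch (not code from this commit; it only reuses `mlx5dv_supported()` and the `is_extended` flag named above) of how the runtime selection collapses to a single probe on the Rust side:

use crate::ibverbs_primitives::mlx5dv_supported;

/// Illustrative helper only: decide which queue-pair flavour the C helper
/// create_qp() should build. mlx5dv_supported() now also honours the
/// MONARCH_RDMA_MLX5DV_DISABLED kill switch, so setting that variable
/// forces the standard-ibverbs fallback even on mlx5 hardware.
fn qp_is_extended() -> u8 {
    // The flag crosses the FFI boundary as a u8:
    //   1 => mlx5dv_create_qp with mkey/MR-list send ops
    //   0 => plain ibv_create_qp
    mlx5dv_supported() as u8
}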

File tree: 5 files changed, +87 −63 lines

monarch_rdma/src/ibverbs_primitives.rs

Lines changed: 5 additions & 21 deletions
@@ -650,6 +650,10 @@ pub fn mlx5dv_supported() -> bool {
 }
 
 fn mlx5dv_supported_impl() -> bool {
+    if std::env::var("MONARCH_RDMA_MLX5DV_DISABLED").is_ok() {
+        eprintln!("mlx5dv support disabled by MONARCH_RDMA_MLX5DV_DISABLED");
+        return false;
+    }
     // SAFETY: We are calling C functions from libibverbs and libmlx5.
     unsafe {
         let mut num_devices = 0;
@@ -733,7 +737,7 @@ fn ibverbs_supported_impl() -> bool {
 ///
 /// `true` if both ibverbs devices and mlx5dv extensions are available, `false` otherwise.
 pub fn rdma_supported() -> bool {
-    ibverbs_supported() && mlx5dv_supported()
+    ibverbs_supported()
 }
 
 /// Represents a view of a memory region that can be registered with an RDMA device.
@@ -1108,24 +1112,4 @@ mod tests {
         let mlx5dv_support = mlx5dv_supported();
         println!("mlx5dv_supported: {}", mlx5dv_support);
     }
-
-    #[test]
-    fn test_rdma_supported_combines_checks() {
-        // This test verifies that rdma_supported() properly combines both checks
-        let ibverbs_support = ibverbs_supported();
-        let mlx5dv_support = mlx5dv_supported();
-        let rdma_support = rdma_supported();
-
-        // rdma_supported should be true only if both checks pass
-        assert_eq!(
-            rdma_support,
-            ibverbs_support && mlx5dv_support,
-            "rdma_supported should equal (ibverbs_supported && mlx5dv_supported)"
-        );
-
-        println!(
-            "ibverbs_supported: {}, mlx5dv_supported: {}, rdma_supported: {}",
-            ibverbs_support, mlx5dv_support, rdma_support
-        );
-    }
 }
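
Note that `rdma_supported()` now only reflects ibverbs availability, so code that previously treated it as implying mlx5dv has to probe the two capabilities separately. A minimal sketch of that pattern (illustrative, not part of the commit):

use crate::ibverbs_primitives::mlx5dv_supported;
use crate::ibverbs_primitives::rdma_supported;

fn describe_rdma_support() {
    if !rdma_supported() {
        // No usable ibverbs devices at all.
        println!("rdma unavailable");
    } else if mlx5dv_supported() {
        // Extended (mlx5dv) queue pairs and segment merging are available.
        println!("rdma available with mlx5dv extensions");
    } else {
        // Fallback path added by this commit: standard ibverbs queue pairs
        // only. Reachable on a dev machine via MONARCH_RDMA_MLX5DV_DISABLED.
        println!("rdma available, plain ibverbs fallback");
    }
}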

monarch_rdma/src/rdma_components.rs

Lines changed: 2 additions & 0 deletions
@@ -66,6 +66,7 @@ use crate::ibverbs_primitives::IbvWc;
 use crate::ibverbs_primitives::IbverbsConfig;
 use crate::ibverbs_primitives::RdmaOperation;
 use crate::ibverbs_primitives::RdmaQpInfo;
+use crate::ibverbs_primitives::mlx5dv_supported;
 
 #[derive(Debug, Named, Clone, Serialize, Deserialize)]
 pub struct DoorBell {
@@ -530,6 +531,7 @@ impl RdmaQueuePair {
             config.max_recv_wr.try_into().unwrap(),
             config.max_send_sge.try_into().unwrap(),
             config.max_recv_sge.try_into().unwrap(),
+            mlx5dv_supported() as u8,
         );
 
         if qp.is_null() {

monarch_rdma/src/rdma_manager_actor.rs

Lines changed: 6 additions & 3 deletions
@@ -48,6 +48,7 @@ use crate::ibverbs_primitives::IbverbsConfig;
 use crate::ibverbs_primitives::RdmaMemoryRegionView;
 use crate::ibverbs_primitives::RdmaQpInfo;
 use crate::ibverbs_primitives::ibverbs_supported;
+use crate::ibverbs_primitives::mlx5dv_supported;
 use crate::rdma_components::RdmaBuffer;
 use crate::rdma_components::RdmaDomain;
 use crate::rdma_components::RdmaQueuePair;
@@ -120,7 +121,7 @@ pub enum RdmaManagerMessage {
         reply: OncePortRef<RdmaQpInfo>,
     },
     ReleaseQueuePair {
-        /// `other` - The ActorId to release queue pair for
+        /// `other` - The ActorId to release queue pair for
         other: ActorRef<RdmaManagerActor>,
         self_device: String,
         other_device: String,
@@ -146,8 +147,9 @@ pub struct RdmaManagerActor {
 
     config: IbverbsConfig,
 
-    // Flag indicating PyTorch CUDA allocator compatibility
+    // Flag indicating whether we register all memory regions allocated by the PyTorch CUDA allocator
     // True if both C10 CUDA allocator is enabled AND expandable segments are enabled
+    // AND mlx5dv is supported (required for merging segments)
     pt_cuda_alloc: bool,
 
     // Map of unique RdmaMemoryRegionView to ibv_mr*. In case of cuda w/ pytorch its -1
@@ -527,7 +529,8 @@ impl Actor for RdmaManagerActor {
         let mut config = params.unwrap_or_default();
         tracing::debug!("rdma is enabled, config device hint: {}", config.device);
 
-        let pt_cuda_alloc = crate::rdma_components::pt_cuda_allocator_compatibility();
+        let pt_cuda_alloc =
+            crate::rdma_components::pt_cuda_allocator_compatibility() && mlx5dv_supported();
 
         // check config and hardware support align
         if config.use_gpu_direct {
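
For illustration, the stricter gate above can be read as a small predicate (hypothetical helper; `pt_cuda_allocator_compatibility()` and `mlx5dv_supported()` are the real functions from the diff, and the rationale in the comment follows the commit summary):

use crate::ibverbs_primitives::mlx5dv_supported;
use crate::rdma_components::pt_cuda_allocator_compatibility;

/// Hypothetical helper mirroring the init code above: registering memory
/// handed out by the PyTorch CUDA allocator is only enabled when mlx5dv is
/// present, because merging the allocator's expandable segments relies on
/// the extended-QP send ops (mkey configure / MR list) that plain ibverbs
/// queue pairs do not offer.
fn pt_cuda_alloc_enabled() -> bool {
    pt_cuda_allocator_compatibility() && mlx5dv_supported()
}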

rdmaxcel-sys/src/rdmaxcel.c

Lines changed: 72 additions & 38 deletions
@@ -28,7 +28,8 @@ struct ibv_qp* create_qp(
     int max_send_wr,
     int max_recv_wr,
     int max_send_sge,
-    int max_recv_sge) {
+    int max_recv_sge,
+    uint8_t is_extended) {
   // Create separate completion queues for send and receive operations
   struct ibv_cq* send_cq = ibv_create_cq(context, cq_entries, NULL, NULL, 0);
   if (!send_cq) {
@@ -43,44 +44,77 @@ struct ibv_qp* create_qp(
     return NULL;
   }
 
-  // Initialize extended queue pair attributes
-  struct ibv_qp_init_attr_ex qp_init_attr_ex = {
-      .qp_context = NULL,
-      .send_cq = send_cq,
-      .recv_cq = recv_cq,
-      .srq = NULL,
-      .cap =
-          {
-              .max_send_wr = max_send_wr,
-              .max_recv_wr = max_recv_wr,
-              .max_send_sge = max_send_sge,
-              .max_recv_sge = max_recv_sge,
-              .max_inline_data = 0,
-          },
-      .qp_type = IBV_QPT_RC,
-      .sq_sig_all = 0,
-      .pd = pd,
-      .comp_mask = IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS,
-      .send_ops_flags = IBV_QP_EX_WITH_RDMA_WRITE | IBV_QP_EX_WITH_RDMA_READ |
-          IBV_QP_EX_WITH_SEND,
-      .create_flags = 0,
-  };
-
-  struct mlx5dv_qp_init_attr mlx5dv_attr = {};
-  mlx5dv_attr.comp_mask |= MLX5DV_QP_INIT_ATTR_MASK_SEND_OPS_FLAGS;
-  mlx5dv_attr.send_ops_flags =
-      MLX5DV_QP_EX_WITH_MKEY_CONFIGURE | MLX5DV_QP_EX_WITH_MR_LIST;
-
-  // Create extended queue pair
-  struct ibv_qp* qp = mlx5dv_create_qp(context, &qp_init_attr_ex, &mlx5dv_attr);
-  if (!qp) {
-    perror("failed to create extended queue pair (QP)");
-    ibv_destroy_cq(send_cq);
-    ibv_destroy_cq(recv_cq);
-    return NULL;
+  if (is_extended) {
+    // Initialize extended queue pair attributes
+    struct ibv_qp_init_attr_ex qp_init_attr_ex = {
+        .qp_context = NULL,
+        .send_cq = send_cq,
+        .recv_cq = recv_cq,
+        .srq = NULL,
+        .cap =
+            {
+                .max_send_wr = max_send_wr,
+                .max_recv_wr = max_recv_wr,
+                .max_send_sge = max_send_sge,
+                .max_recv_sge = max_recv_sge,
+                .max_inline_data = 0,
+            },
+        .qp_type = IBV_QPT_RC,
+        .sq_sig_all = 0,
+        .pd = pd,
+        .comp_mask = IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS,
+        .send_ops_flags = IBV_QP_EX_WITH_RDMA_WRITE | IBV_QP_EX_WITH_RDMA_READ |
+            IBV_QP_EX_WITH_SEND,
+        .create_flags = 0,
+    };
+
+    struct mlx5dv_qp_init_attr mlx5dv_attr = {};
+    mlx5dv_attr.comp_mask |= MLX5DV_QP_INIT_ATTR_MASK_SEND_OPS_FLAGS;
+    mlx5dv_attr.send_ops_flags =
+        MLX5DV_QP_EX_WITH_MKEY_CONFIGURE | MLX5DV_QP_EX_WITH_MR_LIST;
+
+    // Create extended queue pair
+    struct ibv_qp* qp =
+        mlx5dv_create_qp(context, &qp_init_attr_ex, &mlx5dv_attr);
+    if (!qp) {
+      perror("failed to create extended queue pair (QP)");
+      ibv_destroy_cq(send_cq);
+      ibv_destroy_cq(recv_cq);
+      return NULL;
+    }
+
+    return qp;
+
+  } else {
+    // Initialize queue pair attributes
+    struct ibv_qp_init_attr qp_init_attr = {
+        .qp_context = NULL,
+        .send_cq = send_cq,
+        .recv_cq = recv_cq,
+        .srq = NULL,
+        .cap =
+            {
+                .max_send_wr = max_send_wr,
+                .max_recv_wr = max_recv_wr,
+                .max_send_sge = max_send_sge,
+                .max_recv_sge = max_recv_sge,
+                .max_inline_data = 0,
+            },
+        .qp_type = IBV_QPT_RC,
+        .sq_sig_all = 0,
+    };
+
+    // Create queue pair
+    struct ibv_qp* qp = ibv_create_qp(pd, &qp_init_attr);
+    if (!qp) {
+      perror("failed to create queue pair (QP)");
+      ibv_destroy_cq(send_cq);
+      ibv_destroy_cq(recv_cq);
+      return NULL;
+    }
+
+    return qp;
   }
-
-  return qp;
 }
 
 struct mlx5dv_qp* create_mlx5dv_qp(struct ibv_qp* qp) {

rdmaxcel-sys/src/rdmaxcel.h

Lines changed: 2 additions & 1 deletion
@@ -69,7 +69,8 @@ struct ibv_qp* create_qp(
     int max_send_wr,
     int max_recv_wr,
     int max_send_sge,
-    int max_recv_sge);
+    int max_recv_sge,
+    uint8_t is_extended);
 
 struct mlx5dv_qp* create_mlx5dv_qp(struct ibv_qp* qp);
 
