
Commit c3bcb7a

casteryh authored and facebook-github-bot committed
Fallback when mlx5dv is not supported. (#1665)
Summary: This change adds fallback support for RDMA operations when mlx5dv (the Mellanox device-specific extensions) is not available. Queue pair creation now conditionally uses either extended mlx5dv-based queue pairs (when supported) or standard ibverbs queue pairs (as a fallback), controlled by a new `is_extended` parameter that is decided at runtime. The pt_cuda_alloc flag is updated to additionally require mlx5dv support, since mlx5dv is needed to merge memory segments when using PyTorch's CUDA allocator. An environment variable, `MONARCH_RDMA_DEBUG_MLX5DV_DISABLED_DO_NOT_USE`, is added to exercise the new code path on a dev machine. Differential Revision: D85504061
1 parent 5055495 commit c3bcb7a
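At a high level, the change boils down to a single runtime capability check. Below is a minimal sketch (not part of the commit) of that decision, assuming the helper names from the diffs further down; the import path and the standalone `main` are illustrative only, not the actual call site.

```rust
// Illustrative only: assumes the capability helpers from
// monarch_rdma/src/ibverbs_primitives.rs are reachable at this path.
use monarch_rdma::ibverbs_primitives::{ibverbs_supported, mlx5dv_supported};

fn main() {
    // RDMA is considered usable as long as plain ibverbs devices exist;
    // mlx5dv only decides which flavor of queue pair gets created.
    let rdma_ok = ibverbs_supported();

    // Extended (mlx5dv) queue pairs when the device-specific extensions are
    // present, standard ibverbs queue pairs otherwise. This boolean is what
    // ends up as the new `is_extended` argument to create_qp().
    let is_extended = mlx5dv_supported();

    println!("rdma usable: {rdma_ok}, extended QPs: {is_extended}");
}
```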

File tree: 5 files changed, +95 −66 lines

monarch_rdma/src/ibverbs_primitives.rs

Lines changed: 5 additions & 21 deletions
@@ -650,6 +650,10 @@ pub fn mlx5dv_supported() -> bool {
 }
 
 fn mlx5dv_supported_impl() -> bool {
+    if std::env::var("MONARCH_RDMA_DEBUG_MLX5DV_DISABLED_DO_NOT_USE").is_ok() {
+        eprintln!("mlx5dv support disabled by MONARCH_RDMA_DEBUG_MLX5DV_DISABLED_DO_NOT_USE");
+        return false;
+    }
     // SAFETY: We are calling C functions from libibverbs and libmlx5.
     unsafe {
         let mut num_devices = 0;
@@ -733,7 +737,7 @@ fn ibverbs_supported_impl() -> bool {
 ///
 /// `true` if both ibverbs devices and mlx5dv extensions are available, `false` otherwise.
 pub fn rdma_supported() -> bool {
-    ibverbs_supported() && mlx5dv_supported()
+    ibverbs_supported()
 }
 
 /// Represents a view of a memory region that can be registered with an RDMA device.
@@ -1108,24 +1112,4 @@ mod tests {
         let mlx5dv_support = mlx5dv_supported();
         println!("mlx5dv_supported: {}", mlx5dv_support);
     }
-
-    #[test]
-    fn test_rdma_supported_combines_checks() {
-        // This test verifies that rdma_supported() properly combines both checks
-        let ibverbs_support = ibverbs_supported();
-        let mlx5dv_support = mlx5dv_supported();
-        let rdma_support = rdma_supported();
-
-        // rdma_supported should be true only if both checks pass
-        assert_eq!(
-            rdma_support,
-            ibverbs_support && mlx5dv_support,
-            "rdma_supported should equal (ibverbs_supported && mlx5dv_supported)"
-        );
-
-        println!(
-            "ibverbs_supported: {}, mlx5dv_supported: {}, rdma_supported: {}",
-            ibverbs_support, mlx5dv_support, rdma_support
-        );
-    }
 }
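To see the fallback switch in action, here is a hedged test sketch (not part of this commit). It assumes it sits inside the same `tests` module, so `mlx5dv_supported`, `ibverbs_supported`, and `rdma_supported` are in scope, and it only asserts anything when the process was started with the debug override, e.g. `MONARCH_RDMA_DEBUG_MLX5DV_DISABLED_DO_NOT_USE=1 cargo test`.

```rust
#[test]
fn test_mlx5dv_override_forces_fallback() {
    // Only meaningful when the override is set before the process starts,
    // since mlx5dv_supported_impl() reads the variable itself.
    if std::env::var("MONARCH_RDMA_DEBUG_MLX5DV_DISABLED_DO_NOT_USE").is_ok() {
        // The override reports mlx5dv as unavailable...
        assert!(!mlx5dv_supported());
        // ...but rdma_supported() now only requires plain ibverbs devices.
        assert_eq!(rdma_supported(), ibverbs_supported());
    }
}
```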

monarch_rdma/src/rdma_components.rs

Lines changed: 2 additions & 0 deletions
@@ -66,6 +66,7 @@ use crate::ibverbs_primitives::IbvWc;
 use crate::ibverbs_primitives::IbverbsConfig;
 use crate::ibverbs_primitives::RdmaOperation;
 use crate::ibverbs_primitives::RdmaQpInfo;
+use crate::ibverbs_primitives::mlx5dv_supported;
 
 #[derive(Debug, Named, Clone, Serialize, Deserialize)]
 pub struct DoorBell {
@@ -530,6 +531,7 @@ impl RdmaQueuePair {
             config.max_recv_wr.try_into().unwrap(),
             config.max_send_sge.try_into().unwrap(),
             config.max_recv_sge.try_into().unwrap(),
+            mlx5dv_supported() as u8,
         );
 
         if qp.is_null() {

monarch_rdma/src/rdma_manager_actor.rs

Lines changed: 14 additions & 6 deletions
@@ -48,6 +48,7 @@ use crate::ibverbs_primitives::IbverbsConfig;
 use crate::ibverbs_primitives::RdmaMemoryRegionView;
 use crate::ibverbs_primitives::RdmaQpInfo;
 use crate::ibverbs_primitives::ibverbs_supported;
+use crate::ibverbs_primitives::mlx5dv_supported;
 use crate::rdma_components::RdmaBuffer;
 use crate::rdma_components::RdmaDomain;
 use crate::rdma_components::RdmaQueuePair;
@@ -120,7 +121,7 @@ pub enum RdmaManagerMessage {
         reply: OncePortRef<RdmaQpInfo>,
     },
     ReleaseQueuePair {
-        /// `other` - The ActorId to release queue pair for
+        /// `other` - The ActorId to release queue pair for
         other: ActorRef<RdmaManagerActor>,
         self_device: String,
         other_device: String,
@@ -150,6 +151,8 @@ pub struct RdmaManagerActor {
     // True if both C10 CUDA allocator is enabled AND expandable segments are enabled
     pt_cuda_alloc: bool,
 
+    mlx5dv_supported: bool,
+
     // Map of unique RdmaMemoryRegionView to ibv_mr*. In case of cuda w/ pytorch its -1
     // since its managed independently. Only used for registration/deregistration purposes
     mr_map: HashMap<usize, usize>,
@@ -248,7 +251,7 @@ impl Drop for RdmaManagerActor {
         }
 
         // 4. Deregister all CUDA segments (if using PyTorch CUDA allocator)
-        if self.pt_cuda_alloc {
+        if self.cuda_pt_alloc_enabled() {
             unsafe {
                 let result = rdmaxcel_sys::deregister_segments();
                 if result != 0 {
@@ -265,6 +268,11 @@ impl Drop for RdmaManagerActor {
 }
 
 impl RdmaManagerActor {
+    /// Whether to register all memory regions allocated by the PyTorch CUDA allocator
+    /// True if both `pt_cuda_alloc` and `mlx5dv_supported` are true
+    fn cuda_pt_alloc_enabled(&self) -> bool {
+        self.pt_cuda_alloc && self.mlx5dv_supported
+    }
     /// Get or create a domain and loopback QP for the specified RDMA device
     fn get_or_create_device_domain(
         &mut self,
@@ -420,10 +428,7 @@ impl RdmaManagerActor {
         let mut mr: *mut rdmaxcel_sys::ibv_mr = std::ptr::null_mut();
         let mrv;
 
-        // Copy pt_cuda_alloc to avoid borrowing issues
-        let pt_cuda_alloc = self.pt_cuda_alloc;
-
-        if is_cuda && pt_cuda_alloc {
+        if is_cuda && self.cuda_pt_alloc_enabled() {
             // Get registered segments and check if our memory range is covered
             let mut maybe_mrv = self.find_cuda_segment_for_address(addr, size);
             // not found, lets re-sync with caching allocator and retry
@@ -529,6 +534,8 @@ impl Actor for RdmaManagerActor {
 
         let pt_cuda_alloc = crate::rdma_components::pt_cuda_allocator_compatibility();
 
+        let mlx5dv_supported = mlx5dv_supported();
+
         // check config and hardware support align
         if config.use_gpu_direct {
             match validate_execution_context().await {
@@ -557,6 +564,7 @@ impl Actor for RdmaManagerActor {
             device_domains: HashMap::new(),
             config,
             pt_cuda_alloc,
+            mlx5dv_supported,
             mr_map: HashMap::new(),
             mrv_id: 0,
             pci_to_device,
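As a standalone illustration of the gating this file adds, the sketch below copies the field and method names from the diff but fakes the surrounding actor state; it is not the actual `RdmaManagerActor`.

```rust
/// Stand-in for the two flags the actor now tracks (illustrative only).
struct CudaRegistrationGate {
    /// True if the C10 CUDA allocator and expandable segments are enabled.
    pt_cuda_alloc: bool,
    /// True if the mlx5dv device extensions are available at runtime.
    mlx5dv_supported: bool,
}

impl CudaRegistrationGate {
    /// Registering memory from the PyTorch CUDA allocator relies on mlx5dv to
    /// merge memory segments, so both flags must hold.
    fn cuda_pt_alloc_enabled(&self) -> bool {
        self.pt_cuda_alloc && self.mlx5dv_supported
    }
}

fn main() {
    let gate = CudaRegistrationGate {
        pt_cuda_alloc: true,
        mlx5dv_supported: false, // e.g. the fallback path on non-mlx5 hardware
    };
    // Without mlx5dv, the CUDA segment registration/deregistration paths are skipped.
    assert!(!gate.cuda_pt_alloc_enabled());
}
```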

rdmaxcel-sys/src/rdmaxcel.c

Lines changed: 72 additions & 38 deletions
@@ -28,7 +28,8 @@ struct ibv_qp* create_qp(
     int max_send_wr,
     int max_recv_wr,
     int max_send_sge,
-    int max_recv_sge) {
+    int max_recv_sge,
+    uint8_t is_extended) {
   // Create separate completion queues for send and receive operations
   struct ibv_cq* send_cq = ibv_create_cq(context, cq_entries, NULL, NULL, 0);
   if (!send_cq) {
@@ -43,44 +44,77 @@ struct ibv_qp* create_qp(
     return NULL;
   }
 
-  // Initialize extended queue pair attributes
-  struct ibv_qp_init_attr_ex qp_init_attr_ex = {
-      .qp_context = NULL,
-      .send_cq = send_cq,
-      .recv_cq = recv_cq,
-      .srq = NULL,
-      .cap =
-          {
-              .max_send_wr = max_send_wr,
-              .max_recv_wr = max_recv_wr,
-              .max_send_sge = max_send_sge,
-              .max_recv_sge = max_recv_sge,
-              .max_inline_data = 0,
-          },
-      .qp_type = IBV_QPT_RC,
-      .sq_sig_all = 0,
-      .pd = pd,
-      .comp_mask = IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS,
-      .send_ops_flags = IBV_QP_EX_WITH_RDMA_WRITE | IBV_QP_EX_WITH_RDMA_READ |
-          IBV_QP_EX_WITH_SEND,
-      .create_flags = 0,
-  };
-
-  struct mlx5dv_qp_init_attr mlx5dv_attr = {};
-  mlx5dv_attr.comp_mask |= MLX5DV_QP_INIT_ATTR_MASK_SEND_OPS_FLAGS;
-  mlx5dv_attr.send_ops_flags =
-      MLX5DV_QP_EX_WITH_MKEY_CONFIGURE | MLX5DV_QP_EX_WITH_MR_LIST;
-
-  // Create extended queue pair
-  struct ibv_qp* qp = mlx5dv_create_qp(context, &qp_init_attr_ex, &mlx5dv_attr);
-  if (!qp) {
-    perror("failed to create extended queue pair (QP)");
-    ibv_destroy_cq(send_cq);
-    ibv_destroy_cq(recv_cq);
-    return NULL;
+  if (is_extended) {
+    // Initialize extended queue pair attributes
+    struct ibv_qp_init_attr_ex qp_init_attr_ex = {
+        .qp_context = NULL,
+        .send_cq = send_cq,
+        .recv_cq = recv_cq,
+        .srq = NULL,
+        .cap =
+            {
+                .max_send_wr = max_send_wr,
+                .max_recv_wr = max_recv_wr,
+                .max_send_sge = max_send_sge,
+                .max_recv_sge = max_recv_sge,
+                .max_inline_data = 0,
+            },
+        .qp_type = IBV_QPT_RC,
+        .sq_sig_all = 0,
+        .pd = pd,
+        .comp_mask = IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS,
+        .send_ops_flags = IBV_QP_EX_WITH_RDMA_WRITE | IBV_QP_EX_WITH_RDMA_READ |
+            IBV_QP_EX_WITH_SEND,
+        .create_flags = 0,
+    };
+
+    struct mlx5dv_qp_init_attr mlx5dv_attr = {};
+    mlx5dv_attr.comp_mask |= MLX5DV_QP_INIT_ATTR_MASK_SEND_OPS_FLAGS;
+    mlx5dv_attr.send_ops_flags =
+        MLX5DV_QP_EX_WITH_MKEY_CONFIGURE | MLX5DV_QP_EX_WITH_MR_LIST;
+
+    // Create extended queue pair
+    struct ibv_qp* qp =
+        mlx5dv_create_qp(context, &qp_init_attr_ex, &mlx5dv_attr);
+    if (!qp) {
+      perror("failed to create extended queue pair (QP)");
+      ibv_destroy_cq(send_cq);
+      ibv_destroy_cq(recv_cq);
+      return NULL;
+    }
+
+    return qp;
+
+  } else {
+    // Initialize queue pair attributes
+    struct ibv_qp_init_attr qp_init_attr = {
+        .qp_context = NULL,
+        .send_cq = send_cq,
+        .recv_cq = recv_cq,
+        .srq = NULL,
+        .cap =
+            {
+                .max_send_wr = max_send_wr,
+                .max_recv_wr = max_recv_wr,
+                .max_send_sge = max_send_sge,
+                .max_recv_sge = max_recv_sge,
+                .max_inline_data = 0,
+            },
+        .qp_type = IBV_QPT_RC,
+        .sq_sig_all = 0,
+    };
+
+    // Create queue pair
+    struct ibv_qp* qp = ibv_create_qp(pd, &qp_init_attr);
+    if (!qp) {
+      perror("failed to create queue pair (QP)");
+      ibv_destroy_cq(send_cq);
+      ibv_destroy_cq(recv_cq);
+      return NULL;
+    }
+
+    return qp;
   }
-
-  return qp;
 }
 
 struct mlx5dv_qp* create_mlx5dv_qp(struct ibv_qp* qp) {

rdmaxcel-sys/src/rdmaxcel.h

Lines changed: 2 additions & 1 deletion
@@ -69,7 +69,8 @@ struct ibv_qp* create_qp(
     int max_send_wr,
     int max_recv_wr,
     int max_send_sge,
-    int max_recv_sge);
+    int max_recv_sge,
+    uint8_t is_extended);
 
 struct mlx5dv_qp* create_mlx5dv_qp(struct ibv_qp* qp);
