
Commit 9828943

casteryh authored and facebook-github-bot committed
Fallback when mlx5dv is not supported. (#1665)
Summary: This change adds fallback support when mlx5dv (Mellanox device-specific extensions) is not available for RDMA operations. It modifies the queue pair creation logic to conditionally use either extended mlx5dv-based queue pairs (when supported) or standard ibverbs queue pairs (as fallback). The pt_cuda_alloc flag is updated to require mlx5dv support since it's necessary for merging memory segments when using PyTorch's CUDA allocator. The change adds a new `is_extended` parameter to control whether to create extended or standard queue pairs at runtime. Adds an env variable `MONARCH_RDMA_MLX5DV_DISABLED` to test the new code path on dev machine. ## Changes in Latest Revision Based on reviewer feedback, the implementation has been updated with a cleaner, configuration-based approach: **API Changes:** - Replaced `uint8_t is_extended` parameter with `rdma_qp_type_t` enum in C API - Added `RdmaQpType` enum to Rust with three variants: - `Auto`: Auto-detect based on device capabilities (default) - `Standard`: Force standard ibverbs queue pairs - `Mlx5dv`: Force mlx5dv extended queue pairs - Added `qp_type` field to `IbverbsConfig` for explicit QP type control - C code uses switch statement with proper default case for unknown types **Architecture:** - Rust resolves `Auto` mode before calling C (single source of truth for detection) - C function becomes a pure executor - no capability detection logic - Removed environment variable approach in favor of configuration **Testing:** - Added `setup_with_qp_type()` helper function in test utilities - Added 4 new unit tests to verify standard QP fallback path: - `test_rdma_read_into_standard_qp` (CPU-to-CPU) - `test_rdma_write_from_standard_qp` (CPU-to-CPU) - `test_rdma_read_into_standard_qp_cuda` (GPU-to-GPU) - `test_rdma_write_from_standard_qp_cuda` (GPU-to-GPU) Reviewed By: dstaay-fb Differential Revision: D85504061
1 parent a75d8b3 commit 9828943
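Below is a minimal, hypothetical usage sketch (not part of this commit) of how a caller could opt into the standard-QP fallback through the new configuration-based API. The `monarch_rdma::ibverbs_primitives` import path is an assumption about how the crate exposes these items; `IbverbsConfig`, `RdmaQpType`, and `resolve_qp_type` themselves are taken from the diff below, and the remaining config fields come from the `Default` impl.

```rust
// Hypothetical sketch; the import path is an assumption, not confirmed by this commit.
use monarch_rdma::ibverbs_primitives::{IbverbsConfig, RdmaQpType, resolve_qp_type};

fn main() {
    // Force the standard ibverbs queue-pair path (the new fallback) instead of
    // letting Auto select mlx5dv extended queue pairs when the device supports them.
    let config = IbverbsConfig {
        qp_type: RdmaQpType::Standard,
        ..Default::default()
    };

    // Rust resolves the QP type before calling into C: Auto consults device
    // capabilities, while Standard and Mlx5dv map directly to rdmaxcel_sys constants.
    let resolved = resolve_qp_type(config.qp_type);
    println!("resolved QP type constant: {resolved}");
}
```

The new tests take the equivalent route through `RdmaManagerTestEnv::setup_with_qp_type`, passing `RdmaQpType::Standard` explicitly.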

File tree

7 files changed: +293 −62 lines


monarch_rdma/src/ibverbs_primitives.rs

Lines changed: 34 additions & 14 deletions
@@ -87,6 +87,35 @@ impl AsMut<rdmaxcel_sys::ibv_gid> for Gid {
     }
 }

+/// Queue pair type for RDMA operations.
+///
+/// Controls whether to use standard ibverbs queue pairs or mlx5dv extended queue pairs.
+/// Auto mode automatically selects based on device capabilities.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
+pub enum RdmaQpType {
+    /// Auto-detect based on device capabilities
+    Auto,
+    /// Force standard ibverbs queue pair
+    Standard,
+    /// Force mlx5dv extended queue pair
+    Mlx5dv,
+}
+
+/// Converts `RdmaQpType` to the corresponding integer enum value in rdmaxcel_sys.
+pub fn resolve_qp_type(qp_type: RdmaQpType) -> u32 {
+    match qp_type {
+        RdmaQpType::Auto => {
+            if mlx5dv_supported() {
+                rdmaxcel_sys::RDMA_QP_TYPE_MLX5DV
+            } else {
+                rdmaxcel_sys::RDMA_QP_TYPE_STANDARD
+            }
+        }
+        RdmaQpType::Standard => rdmaxcel_sys::RDMA_QP_TYPE_STANDARD,
+        RdmaQpType::Mlx5dv => rdmaxcel_sys::RDMA_QP_TYPE_MLX5DV,
+    }
+}
+
 /// Represents ibverbs specific configurations.
 ///
 /// This struct holds various parameters required to establish and manage an RDMA connection.
@@ -133,6 +162,8 @@ pub struct IbverbsConfig {
     /// `hw_init_delay_ms` - The delay in milliseconds before initializing the hardware.
     /// This is used to allow the hardware to settle before starting the first transmission.
     pub hw_init_delay_ms: u64,
+    /// `qp_type` - The type of queue pair to create (Auto, Standard, or Mlx5dv).
+    pub qp_type: RdmaQpType,
 }

 /// Default RDMA parameters below are based on common values from rdma-core examples
@@ -160,6 +191,7 @@ impl Default for IbverbsConfig {
             psn: rand::random::<u32>() & 0xffffff,
             use_gpu_direct: false, // nv_peermem enabled for cuda
             hw_init_delay_ms: 2,
+            qp_type: RdmaQpType::Auto,
         }
     }
 }
@@ -698,21 +730,9 @@ fn ibverbs_supported_impl() -> bool {

 /// Checks if RDMA is fully supported on this system.
 ///
-/// This is the canonical function to check if RDMA can be used. It verifies both:
-/// 1. Basic ibverbs device availability (`ibverbs_supported()`)
-/// 2. mlx5dv device-specific extensions (`mlx5dv_supported()`)
-///
-/// mlx5dv extensions are required for this library's advanced features including
-/// GPU Direct RDMA and direct queue pair manipulation. Systems with non-Mellanox
-/// RDMA devices will have `ibverbs_supported() == true` but `rdma_supported() == false`.
-///
-/// The result is cached after the first call, making subsequent calls essentially free.
-///
-/// # Returns
-///
-/// `true` if both ibverbs devices and mlx5dv extensions are available, `false` otherwise.
+/// This is the canonical function to check if RDMA can be used.
 pub fn rdma_supported() -> bool {
-    ibverbs_supported() && mlx5dv_supported()
+    ibverbs_supported()
 }

 /// Represents a view of a memory region that can be registered with an RDMA device.

monarch_rdma/src/rdma_components.rs

Lines changed: 5 additions & 1 deletion
@@ -66,6 +66,7 @@ use crate::ibverbs_primitives::IbvWc;
 use crate::ibverbs_primitives::IbverbsConfig;
 use crate::ibverbs_primitives::RdmaOperation;
 use crate::ibverbs_primitives::RdmaQpInfo;
+use crate::ibverbs_primitives::resolve_qp_type;

 #[derive(Debug, Named, Clone, Serialize, Deserialize)]
 pub struct DoorBell {
@@ -521,7 +522,9 @@ impl RdmaQueuePair {
     ) -> Result<Self, anyhow::Error> {
         tracing::debug!("creating an RdmaQueuePair from config {}", config);
         unsafe {
-            // standard ibverbs QP
+            // Resolve Auto to a concrete QP type based on device capabilities
+            let resolved_qp_type = resolve_qp_type(config.qp_type);
+
             let qp = rdmaxcel_sys::create_qp(
                 context,
                 pd,
@@ -530,6 +533,7 @@
                 config.max_recv_wr.try_into().unwrap(),
                 config.max_send_sge.try_into().unwrap(),
                 config.max_recv_sge.try_into().unwrap(),
+                resolved_qp_type,
             );

             if qp.is_null() {

monarch_rdma/src/rdma_manager_actor.rs

Lines changed: 14 additions & 6 deletions
@@ -48,6 +48,7 @@ use crate::ibverbs_primitives::IbverbsConfig;
 use crate::ibverbs_primitives::RdmaMemoryRegionView;
 use crate::ibverbs_primitives::RdmaQpInfo;
 use crate::ibverbs_primitives::ibverbs_supported;
+use crate::ibverbs_primitives::resolve_qp_type;
 use crate::rdma_components::RdmaBuffer;
 use crate::rdma_components::RdmaDomain;
 use crate::rdma_components::RdmaQueuePair;
@@ -120,7 +121,7 @@ pub enum RdmaManagerMessage {
         reply: OncePortRef<RdmaQpInfo>,
     },
     ReleaseQueuePair {
-        /// `other` - The ActorId to release queue pair for
+        /// `other` - The ActorId to release queue pair for
         other: ActorRef<RdmaManagerActor>,
         self_device: String,
         other_device: String,
@@ -150,6 +151,8 @@ pub struct RdmaManagerActor {
     // True if both C10 CUDA allocator is enabled AND expandable segments are enabled
     pt_cuda_alloc: bool,

+    mlx5dv_enabled: bool,
+
     // Map of unique RdmaMemoryRegionView to ibv_mr*. In case of cuda w/ pytorch its -1
     // since its managed independently. Only used for registration/deregistration purposes
     mr_map: HashMap<usize, usize>,
@@ -248,7 +251,7 @@ impl Drop for RdmaManagerActor {
         }

         // 4. Deregister all CUDA segments (if using PyTorch CUDA allocator)
-        if self.pt_cuda_alloc {
+        if self.cuda_pt_alloc_enabled() {
             unsafe {
                 let result = rdmaxcel_sys::deregister_segments();
                 if result != 0 {
@@ -265,6 +268,11 @@
 }

 impl RdmaManagerActor {
+    /// Whether to register all memory regions allocated by the PyTorch CUDA allocator
+    /// True if both `pt_cuda_alloc` and `mlx5dv_enabled` are true
+    fn cuda_pt_alloc_enabled(&self) -> bool {
+        self.pt_cuda_alloc && self.mlx5dv_enabled
+    }
     /// Get or create a domain and loopback QP for the specified RDMA device
     fn get_or_create_device_domain(
         &mut self,
@@ -420,10 +428,7 @@
         let mut mr: *mut rdmaxcel_sys::ibv_mr = std::ptr::null_mut();
         let mrv;

-        // Copy pt_cuda_alloc to avoid borrowing issues
-        let pt_cuda_alloc = self.pt_cuda_alloc;
-
-        if is_cuda && pt_cuda_alloc {
+        if is_cuda && self.cuda_pt_alloc_enabled() {
             // Get registered segments and check if our memory range is covered
             let mut maybe_mrv = self.find_cuda_segment_for_address(addr, size);
             // not found, lets re-sync with caching allocator and retry
@@ -529,6 +534,8 @@ impl Actor for RdmaManagerActor {

         let pt_cuda_alloc = crate::rdma_components::pt_cuda_allocator_compatibility();

+        let mlx5dv_enabled = resolve_qp_type(config.qp_type) == rdmaxcel_sys::RDMA_QP_TYPE_MLX5DV;
+
         // check config and hardware support align
         if config.use_gpu_direct {
             match validate_execution_context().await {
@@ -557,6 +564,7 @@
             device_domains: HashMap::new(),
             config,
             pt_cuda_alloc,
+            mlx5dv_enabled,
             mr_map: HashMap::new(),
             mrv_id: 0,
             pci_to_device,

monarch_rdma/src/rdma_manager_actor_tests.rs

Lines changed: 120 additions & 0 deletions
@@ -620,4 +620,124 @@ mod tests {
         env.cleanup().await?;
         Ok(())
     }
+
+    #[timed_test::async_timed_test(timeout_secs = 60)]
+    async fn test_rdma_read_into_standard_qp() -> Result<(), anyhow::Error> {
+        const BSIZE: usize = 32;
+        // Skip test if RDMA devices are not available
+        let devices = get_all_devices();
+        if devices.is_empty() {
+            println!("Skipping test: RDMA devices not available");
+            return Ok(());
+        }
+
+        let env = RdmaManagerTestEnv::setup_with_qp_type(
+            BSIZE,
+            "cpu:0",
+            "cpu:0",
+            crate::ibverbs_primitives::RdmaQpType::Standard,
+        )
+        .await?;
+
+        let rdma_handle_1 = env.rdma_handle_1.clone();
+        rdma_handle_1
+            .read_into(env.client_1, env.rdma_handle_2.clone(), 2)
+            .await?;
+
+        env.verify_buffers(BSIZE).await?;
+        env.cleanup().await?;
+        Ok(())
+    }
+
+    #[timed_test::async_timed_test(timeout_secs = 60)]
+    async fn test_rdma_write_from_standard_qp() -> Result<(), anyhow::Error> {
+        const BSIZE: usize = 32;
+        // Skip test if RDMA devices are not available
+        let devices = get_all_devices();
+        if devices.is_empty() {
+            println!("Skipping test: RDMA devices not available");
+            return Ok(());
+        }
+
+        let env = RdmaManagerTestEnv::setup_with_qp_type(
+            BSIZE,
+            "cpu:0",
+            "cpu:0",
+            crate::ibverbs_primitives::RdmaQpType::Standard,
+        )
+        .await?;
+
+        let rdma_handle_1 = env.rdma_handle_1.clone();
+        rdma_handle_1
+            .write_from(env.client_1, env.rdma_handle_2.clone(), 2)
+            .await?;
+
+        env.verify_buffers(BSIZE).await?;
+        env.cleanup().await?;
+        Ok(())
+    }
+
+    #[timed_test::async_timed_test(timeout_secs = 60)]
+    async fn test_rdma_read_into_standard_qp_cuda() -> Result<(), anyhow::Error> {
+        if is_cpu_only_mode() {
+            println!("Skipping CUDA test in CPU-only mode");
+            return Ok(());
+        }
+        const BSIZE: usize = 16 * 1024 * 1024;
+        // Skip test if RDMA devices are not available
+        let devices = get_all_devices();
+        if devices.is_empty() {
+            println!("Skipping test: RDMA devices not available");
+            return Ok(());
+        }
+
+        let env = RdmaManagerTestEnv::setup_with_qp_type(
+            BSIZE,
+            "cuda:0",
+            "cuda:1",
+            crate::ibverbs_primitives::RdmaQpType::Standard,
+        )
+        .await?;
+
+        let rdma_handle_1 = env.rdma_handle_1.clone();
+        rdma_handle_1
+            .read_into(env.client_1, env.rdma_handle_2.clone(), 5)
+            .await?;
+
+        env.verify_buffers(BSIZE).await?;
+        env.cleanup().await?;
+        Ok(())
+    }
+
+    #[timed_test::async_timed_test(timeout_secs = 60)]
+    async fn test_rdma_write_from_standard_qp_cuda() -> Result<(), anyhow::Error> {
+        if is_cpu_only_mode() {
+            println!("Skipping CUDA test in CPU-only mode");
+            return Ok(());
+        }
+        const BSIZE: usize = 16 * 1024 * 1024;
+        // Skip test if RDMA devices are not available
+        let devices = get_all_devices();
+        if devices.is_empty() {
+            println!("Skipping test: RDMA devices not available");
+            return Ok(());
+        }
+
+        let env = RdmaManagerTestEnv::setup_with_qp_type(
+            BSIZE,
+            "cuda:0",
+            "cuda:1",
+            crate::ibverbs_primitives::RdmaQpType::Standard,
+        )
+        .await?;
+
+        let rdma_handle_1 = env.rdma_handle_1.clone();
+        rdma_handle_1
+            .write_from(env.client_1, env.rdma_handle_2.clone(), 5)
+            .await?;
+
+        env.verify_buffers(BSIZE).await?;
+        env.cleanup().await?;
+        Ok(())
+    }
 }

monarch_rdma/src/test_utils.rs

Lines changed: 32 additions & 2 deletions
@@ -294,7 +294,7 @@ pub mod test_utils {
     }

     impl RdmaManagerTestEnv<'_> {
-        /// Sets up the RDMA test environment.
+        /// Sets up the RDMA test environment with a specified QP type.
         ///
         /// This function initializes the RDMA test environment by setting up two actor meshes
         /// with their respective RDMA configurations. It also prepares two buffers for testing
@@ -305,15 +305,21 @@ pub mod test_utils {
         /// * `buffer_size` - The size of the buffers to be used in the test.
         /// * `accel1` - Accelerator for first actor (e.g., "cpu:0", "cuda:0")
        /// * `accel2` - Accelerator for second actor (e.g., "cpu:0", "cuda:1")
-        pub async fn setup(
+        /// * `qp_type` - The queue pair type to use (Auto, Standard, or Mlx5dv)
+        pub async fn setup_with_qp_type(
             buffer_size: usize,
             accel1: &str,
             accel2: &str,
+            qp_type: crate::ibverbs_primitives::RdmaQpType,
         ) -> Result<Self, anyhow::Error> {
             // Use device selection logic to find optimal RDMA devices
             let mut config1 = IbverbsConfig::targeting(accel1);
             let mut config2 = IbverbsConfig::targeting(accel2);

+            // Set the QP type
+            config1.qp_type = qp_type;
+            config2.qp_type = qp_type;
+
             let parsed_accel1 = parse_accel(accel1, &mut config1).await;
             let parsed_accel2 = parse_accel(accel2, &mut config2).await;

@@ -537,6 +543,30 @@ pub mod test_utils {
             Ok(())
         }

+        /// Sets up the RDMA test environment with auto-detected QP type.
+        ///
+        /// This is a convenience wrapper around `setup_with_qp_type` that uses
+        /// `RdmaQpType::Auto` to automatically select the appropriate QP type.
+        ///
+        /// # Arguments
+        ///
+        /// * `buffer_size` - The size of the buffers to be used in the test.
+        /// * `accel1` - Accelerator for first actor (e.g., "cpu:0", "cuda:0")
+        /// * `accel2` - Accelerator for second actor (e.g., "cpu:0", "cuda:1")
+        pub async fn setup(
+            buffer_size: usize,
+            accel1: &str,
+            accel2: &str,
+        ) -> Result<Self, anyhow::Error> {
+            Self::setup_with_qp_type(
+                buffer_size,
+                accel1,
+                accel2,
+                crate::ibverbs_primitives::RdmaQpType::Auto,
+            )
+            .await
+        }
+
         pub async fn verify_buffers(&self, size: usize) -> Result<(), anyhow::Error> {
             let mut buf_vec = Vec::new();
             for (virtual_addr, cuda_context) in [