
Commit 9828943

casteryh authored and facebook-github-bot committed
Fallback when mlx5dv is not supported. (#1665)
Summary: This change adds fallback support when mlx5dv (Mellanox device-specific extensions) is not available for RDMA operations. It modifies the queue pair creation logic to conditionally use either extended mlx5dv-based queue pairs (when supported) or standard ibverbs queue pairs (as fallback). The pt_cuda_alloc flag is updated to require mlx5dv support since it's necessary for merging memory segments when using PyTorch's CUDA allocator. The change adds a new `is_extended` parameter to control whether to create extended or standard queue pairs at runtime. Adds an env variable `MONARCH_RDMA_MLX5DV_DISABLED` to test the new code path on dev machine. ## Changes in Latest Revision Based on reviewer feedback, the implementation has been updated with a cleaner, configuration-based approach: **API Changes:** - Replaced `uint8_t is_extended` parameter with `rdma_qp_type_t` enum in C API - Added `RdmaQpType` enum to Rust with three variants: - `Auto`: Auto-detect based on device capabilities (default) - `Standard`: Force standard ibverbs queue pairs - `Mlx5dv`: Force mlx5dv extended queue pairs - Added `qp_type` field to `IbverbsConfig` for explicit QP type control - C code uses switch statement with proper default case for unknown types **Architecture:** - Rust resolves `Auto` mode before calling C (single source of truth for detection) - C function becomes a pure executor - no capability detection logic - Removed environment variable approach in favor of configuration **Testing:** - Added `setup_with_qp_type()` helper function in test utilities - Added 4 new unit tests to verify standard QP fallback path: - `test_rdma_read_into_standard_qp` (CPU-to-CPU) - `test_rdma_write_from_standard_qp` (CPU-to-CPU) - `test_rdma_read_into_standard_qp_cuda` (GPU-to-GPU) - `test_rdma_write_from_standard_qp_cuda` (GPU-to-GPU) Reviewed By: dstaay-fb Differential Revision: D85504061
1 parent a75d8b3 commit 9828943
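Below is a minimal, hypothetical usage sketch (not part of this commit) of how a caller could opt into the standard-QP fallback through the new configuration-based API. The `monarch_rdma::ibverbs_primitives` import path is an assumption about how the crate exposes these items; `IbverbsConfig`, `RdmaQpType`, and `resolve_qp_type` themselves are taken from the diff below, and the remaining config fields come from the `Default` impl.

```rust
// Hypothetical sketch; the import path is an assumption, not confirmed by this commit.
use monarch_rdma::ibverbs_primitives::{IbverbsConfig, RdmaQpType, resolve_qp_type};

fn main() {
    // Force the standard ibverbs queue-pair path (the new fallback) instead of
    // letting Auto select mlx5dv extended queue pairs when the device supports them.
    let config = IbverbsConfig {
        qp_type: RdmaQpType::Standard,
        ..Default::default()
    };

    // Rust resolves the QP type before calling into C: Auto consults device
    // capabilities, while Standard and Mlx5dv map directly to rdmaxcel_sys constants.
    let resolved = resolve_qp_type(config.qp_type);
    println!("resolved QP type constant: {resolved}");
}
```

The new tests take the equivalent route through `RdmaManagerTestEnv::setup_with_qp_type`, passing `RdmaQpType::Standard` explicitly.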

File tree

7 files changed: +293 −62 lines


monarch_rdma/src/ibverbs_primitives.rs

Lines changed: 34 additions & 14 deletions
@@ -87,6 +87,35 @@ impl AsMut<rdmaxcel_sys::ibv_gid> for Gid {
     }
 }

+/// Queue pair type for RDMA operations.
+///
+/// Controls whether to use standard ibverbs queue pairs or mlx5dv extended queue pairs.
+/// Auto mode automatically selects based on device capabilities.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
+pub enum RdmaQpType {
+    /// Auto-detect based on device capabilities
+    Auto,
+    /// Force standard ibverbs queue pair
+    Standard,
+    /// Force mlx5dv extended queue pair
+    Mlx5dv,
+}
+
+/// Converts `RdmaQpType` to the corresponding integer enum value in rdmaxcel_sys.
+pub fn resolve_qp_type(qp_type: RdmaQpType) -> u32 {
+    match qp_type {
+        RdmaQpType::Auto => {
+            if mlx5dv_supported() {
+                rdmaxcel_sys::RDMA_QP_TYPE_MLX5DV
+            } else {
+                rdmaxcel_sys::RDMA_QP_TYPE_STANDARD
+            }
+        }
+        RdmaQpType::Standard => rdmaxcel_sys::RDMA_QP_TYPE_STANDARD,
+        RdmaQpType::Mlx5dv => rdmaxcel_sys::RDMA_QP_TYPE_MLX5DV,
+    }
+}
+
 /// Represents ibverbs specific configurations.
 ///
 /// This struct holds various parameters required to establish and manage an RDMA connection.
@@ -133,6 +162,8 @@ pub struct IbverbsConfig {
     /// `hw_init_delay_ms` - The delay in milliseconds before initializing the hardware.
     /// This is used to allow the hardware to settle before starting the first transmission.
     pub hw_init_delay_ms: u64,
+    /// `qp_type` - The type of queue pair to create (Auto, Standard, or Mlx5dv).
+    pub qp_type: RdmaQpType,
 }

 /// Default RDMA parameters below are based on common values from rdma-core examples
@@ -160,6 +191,7 @@ impl Default for IbverbsConfig {
             psn: rand::random::<u32>() & 0xffffff,
             use_gpu_direct: false, // nv_peermem enabled for cuda
             hw_init_delay_ms: 2,
+            qp_type: RdmaQpType::Auto,
         }
     }
 }
@@ -698,21 +730,9 @@ fn ibverbs_supported_impl() -> bool {

 /// Checks if RDMA is fully supported on this system.
 ///
-/// This is the canonical function to check if RDMA can be used. It verifies both:
-/// 1. Basic ibverbs device availability (`ibverbs_supported()`)
-/// 2. mlx5dv device-specific extensions (`mlx5dv_supported()`)
-///
-/// mlx5dv extensions are required for this library's advanced features including
-/// GPU Direct RDMA and direct queue pair manipulation. Systems with non-Mellanox
-/// RDMA devices will have `ibverbs_supported() == true` but `rdma_supported() == false`.
-///
-/// The result is cached after the first call, making subsequent calls essentially free.
-///
-/// # Returns
-///
-/// `true` if both ibverbs devices and mlx5dv extensions are available, `false` otherwise.
+/// This is the canonical function to check if RDMA can be used.
 pub fn rdma_supported() -> bool {
-    ibverbs_supported() && mlx5dv_supported()
+    ibverbs_supported()
 }

 /// Represents a view of a memory region that can be registered with an RDMA device.

monarch_rdma/src/rdma_components.rs

Lines changed: 5 additions & 1 deletion
@@ -66,6 +66,7 @@ use crate::ibverbs_primitives::IbvWc;
 use crate::ibverbs_primitives::IbverbsConfig;
 use crate::ibverbs_primitives::RdmaOperation;
 use crate::ibverbs_primitives::RdmaQpInfo;
+use crate::ibverbs_primitives::resolve_qp_type;

 #[derive(Debug, Named, Clone, Serialize, Deserialize)]
 pub struct DoorBell {
@@ -521,7 +522,9 @@ impl RdmaQueuePair {
     ) -> Result<Self, anyhow::Error> {
         tracing::debug!("creating an RdmaQueuePair from config {}", config);
         unsafe {
-            // standard ibverbs QP
+            // Resolve Auto to a concrete QP type based on device capabilities
+            let resolved_qp_type = resolve_qp_type(config.qp_type);
+
             let qp = rdmaxcel_sys::create_qp(
                 context,
                 pd,
@@ -530,6 +533,7 @@
                 config.max_recv_wr.try_into().unwrap(),
                 config.max_send_sge.try_into().unwrap(),
                 config.max_recv_sge.try_into().unwrap(),
+                resolved_qp_type,
             );

             if qp.is_null() {

monarch_rdma/src/rdma_manager_actor.rs

Lines changed: 14 additions & 6 deletions
@@ -48,6 +48,7 @@ use crate::ibverbs_primitives::IbverbsConfig;
 use crate::ibverbs_primitives::RdmaMemoryRegionView;
 use crate::ibverbs_primitives::RdmaQpInfo;
 use crate::ibverbs_primitives::ibverbs_supported;
+use crate::ibverbs_primitives::resolve_qp_type;
 use crate::rdma_components::RdmaBuffer;
 use crate::rdma_components::RdmaDomain;
 use crate::rdma_components::RdmaQueuePair;
@@ -120,7 +121,7 @@ pub enum RdmaManagerMessage {
         reply: OncePortRef<RdmaQpInfo>,
     },
     ReleaseQueuePair {
-        /// `other` - The ActorId to release queue pair for
+        /// `other` - The ActorId to release queue pair for
         other: ActorRef<RdmaManagerActor>,
         self_device: String,
         other_device: String,
@@ -150,6 +151,8 @@ pub struct RdmaManagerActor {
     // True if both C10 CUDA allocator is enabled AND expandable segments are enabled
     pt_cuda_alloc: bool,

+    mlx5dv_enabled: bool,
+
     // Map of unique RdmaMemoryRegionView to ibv_mr*. In case of cuda w/ pytorch its -1
     // since its managed independently. Only used for registration/deregistration purposes
     mr_map: HashMap<usize, usize>,
@@ -248,7 +251,7 @@ impl Drop for RdmaManagerActor {
         }

         // 4. Deregister all CUDA segments (if using PyTorch CUDA allocator)
-        if self.pt_cuda_alloc {
+        if self.cuda_pt_alloc_enabled() {
             unsafe {
                 let result = rdmaxcel_sys::deregister_segments();
                 if result != 0 {
@@ -265,6 +268,11 @@
 }

 impl RdmaManagerActor {
+    /// Whether to register all memory regions allocated by the PyTorch CUDA allocator
+    /// True if both `pt_cuda_alloc` and `mlx5dv_enabled` are true
+    fn cuda_pt_alloc_enabled(&self) -> bool {
+        self.pt_cuda_alloc && self.mlx5dv_enabled
+    }
     /// Get or create a domain and loopback QP for the specified RDMA device
     fn get_or_create_device_domain(
         &mut self,
@@ -420,10 +428,7 @@
         let mut mr: *mut rdmaxcel_sys::ibv_mr = std::ptr::null_mut();
         let mrv;

-        // Copy pt_cuda_alloc to avoid borrowing issues
-        let pt_cuda_alloc = self.pt_cuda_alloc;
-
-        if is_cuda && pt_cuda_alloc {
+        if is_cuda && self.cuda_pt_alloc_enabled() {
             // Get registered segments and check if our memory range is covered
             let mut maybe_mrv = self.find_cuda_segment_for_address(addr, size);
             // not found, lets re-sync with caching allocator and retry
@@ -529,6 +534,8 @@ impl Actor for RdmaManagerActor {

         let pt_cuda_alloc = crate::rdma_components::pt_cuda_allocator_compatibility();

+        let mlx5dv_enabled = resolve_qp_type(config.qp_type) == rdmaxcel_sys::RDMA_QP_TYPE_MLX5DV;
+
         // check config and hardware support align
         if config.use_gpu_direct {
             match validate_execution_context().await {
@@ -557,6 +564,7 @@
             device_domains: HashMap::new(),
             config,
             pt_cuda_alloc,
+            mlx5dv_enabled,
             mr_map: HashMap::new(),
             mrv_id: 0,
             pci_to_device,

monarch_rdma/src/rdma_manager_actor_tests.rs

Lines changed: 120 additions & 0 deletions
@@ -620,4 +620,124 @@ mod tests {
         env.cleanup().await?;
         Ok(())
     }
+
+    #[timed_test::async_timed_test(timeout_secs = 60)]
+    async fn test_rdma_read_into_standard_qp() -> Result<(), anyhow::Error> {
+        const BSIZE: usize = 32;
+        // Skip test if RDMA devices are not available
+        let devices = get_all_devices();
+        if devices.is_empty() {
+            println!("Skipping test: RDMA devices not available");
+            return Ok(());
+        }
+
+        let env = RdmaManagerTestEnv::setup_with_qp_type(
+            BSIZE,
+            "cpu:0",
+            "cpu:0",
+            crate::ibverbs_primitives::RdmaQpType::Standard,
+        )
+        .await?;
+
+        let rdma_handle_1 = env.rdma_handle_1.clone();
+        rdma_handle_1
+            .read_into(env.client_1, env.rdma_handle_2.clone(), 2)
+            .await?;
+
+        env.verify_buffers(BSIZE).await?;
+        env.cleanup().await?;
+        Ok(())
+    }
+
+    #[timed_test::async_timed_test(timeout_secs = 60)]
+    async fn test_rdma_write_from_standard_qp() -> Result<(), anyhow::Error> {
+        const BSIZE: usize = 32;
+        // Skip test if RDMA devices are not available
+        let devices = get_all_devices();
+        if devices.is_empty() {
+            println!("Skipping test: RDMA devices not available");
+            return Ok(());
+        }
+
+        let env = RdmaManagerTestEnv::setup_with_qp_type(
+            BSIZE,
+            "cpu:0",
+            "cpu:0",
+            crate::ibverbs_primitives::RdmaQpType::Standard,
+        )
+        .await?;
+
+        let rdma_handle_1 = env.rdma_handle_1.clone();
+        rdma_handle_1
+            .write_from(env.client_1, env.rdma_handle_2.clone(), 2)
+            .await?;
+
+        env.verify_buffers(BSIZE).await?;
+        env.cleanup().await?;
+        Ok(())
+    }
+
+    #[timed_test::async_timed_test(timeout_secs = 60)]
+    async fn test_rdma_read_into_standard_qp_cuda() -> Result<(), anyhow::Error> {
+        if is_cpu_only_mode() {
+            println!("Skipping CUDA test in CPU-only mode");
+            return Ok(());
+        }
+        const BSIZE: usize = 16 * 1024 * 1024;
+        // Skip test if RDMA devices are not available
+        let devices = get_all_devices();
+        if devices.is_empty() {
+            println!("Skipping test: RDMA devices not available");
+            return Ok(());
+        }
+
+        let env = RdmaManagerTestEnv::setup_with_qp_type(
+            BSIZE,
+            "cuda:0",
+            "cuda:1",
+            crate::ibverbs_primitives::RdmaQpType::Standard,
+        )
+        .await?;
+
+        let rdma_handle_1 = env.rdma_handle_1.clone();
+        rdma_handle_1
+            .read_into(env.client_1, env.rdma_handle_2.clone(), 5)
+            .await?;
+
+        env.verify_buffers(BSIZE).await?;
+        env.cleanup().await?;
+        Ok(())
+    }
+
+    #[timed_test::async_timed_test(timeout_secs = 60)]
+    async fn test_rdma_write_from_standard_qp_cuda() -> Result<(), anyhow::Error> {
+        if is_cpu_only_mode() {
+            println!("Skipping CUDA test in CPU-only mode");
+            return Ok(());
+        }
+        const BSIZE: usize = 16 * 1024 * 1024;
+        // Skip test if RDMA devices are not available
+        let devices = get_all_devices();
+        if devices.is_empty() {
+            println!("Skipping test: RDMA devices not available");
+            return Ok(());
+        }
+
+        let env = RdmaManagerTestEnv::setup_with_qp_type(
+            BSIZE,
+            "cuda:0",
+            "cuda:1",
+            crate::ibverbs_primitives::RdmaQpType::Standard,
+        )
+        .await?;
+
+        let rdma_handle_1 = env.rdma_handle_1.clone();
+        rdma_handle_1
+            .write_from(env.client_1, env.rdma_handle_2.clone(), 5)
+            .await?;
+
+        env.verify_buffers(BSIZE).await?;
+        env.cleanup().await?;
+        Ok(())
+    }
 }

monarch_rdma/src/test_utils.rs

Lines changed: 32 additions & 2 deletions
@@ -294,7 +294,7 @@ pub mod test_utils {
     }

     impl RdmaManagerTestEnv<'_> {
-        /// Sets up the RDMA test environment.
+        /// Sets up the RDMA test environment with a specified QP type.
         ///
         /// This function initializes the RDMA test environment by setting up two actor meshes
         /// with their respective RDMA configurations. It also prepares two buffers for testing
@@ -305,15 +305,21 @@ pub mod test_utils {
         /// * `buffer_size` - The size of the buffers to be used in the test.
         /// * `accel1` - Accelerator for first actor (e.g., "cpu:0", "cuda:0")
        /// * `accel2` - Accelerator for second actor (e.g., "cpu:0", "cuda:1")
-        pub async fn setup(
+        /// * `qp_type` - The queue pair type to use (Auto, Standard, or Mlx5dv)
+        pub async fn setup_with_qp_type(
             buffer_size: usize,
             accel1: &str,
             accel2: &str,
+            qp_type: crate::ibverbs_primitives::RdmaQpType,
         ) -> Result<Self, anyhow::Error> {
             // Use device selection logic to find optimal RDMA devices
             let mut config1 = IbverbsConfig::targeting(accel1);
             let mut config2 = IbverbsConfig::targeting(accel2);

+            // Set the QP type
+            config1.qp_type = qp_type;
+            config2.qp_type = qp_type;
+
             let parsed_accel1 = parse_accel(accel1, &mut config1).await;
             let parsed_accel2 = parse_accel(accel2, &mut config2).await;

@@ -537,6 +543,30 @@ pub mod test_utils {
             Ok(())
         }

+        /// Sets up the RDMA test environment with auto-detected QP type.
+        ///
+        /// This is a convenience wrapper around `setup_with_qp_type` that uses
+        /// `RdmaQpType::Auto` to automatically select the appropriate QP type.
+        ///
+        /// # Arguments
+        ///
+        /// * `buffer_size` - The size of the buffers to be used in the test.
+        /// * `accel1` - Accelerator for first actor (e.g., "cpu:0", "cuda:0")
+        /// * `accel2` - Accelerator for second actor (e.g., "cpu:0", "cuda:1")
+        pub async fn setup(
+            buffer_size: usize,
+            accel1: &str,
+            accel2: &str,
+        ) -> Result<Self, anyhow::Error> {
+            Self::setup_with_qp_type(
+                buffer_size,
+                accel1,
+                accel2,
+                crate::ibverbs_primitives::RdmaQpType::Auto,
+            )
+            .await
+        }
+
         pub async fn verify_buffers(&self, size: usize) -> Result<(), anyhow::Error> {
             let mut buf_vec = Vec::new();
             for (virtual_addr, cuda_context) in [