2828//! - `IbvWc`: Wrapper around ibverbs work completion structure, used to track the status of RDMA operations.
2929use std:: ffi:: CStr ;
3030use std:: fmt;
31+ use std:: sync:: OnceLock ;
3132
3233use hyperactor:: Named ;
3334use serde:: Deserialize ;
@@ -168,7 +169,7 @@ impl IbverbsConfig {
168169 ///
169170 /// Device targets use a unified "type:id" format:
170171 /// - "cpu:N" -> finds RDMA device closest to NUMA node N
171- /// - "cuda:N" -> finds RDMA device closest to CUDA device N
172+ /// - "cuda:N" -> finds RDMA device closest to CUDA device N
172173 /// - "nic:mlx5_N" -> returns the specified NIC directly
173174 ///
174175 /// Shortcuts:
@@ -630,16 +631,81 @@ pub fn get_all_devices() -> Vec<RdmaDevice> {
630631 devices
631632}
632633
634+ /// Cached result of mlx5dv support check.
635+ static MLX5DV_SUPPORTED_CACHE : OnceLock < bool > = OnceLock :: new ( ) ;
636+
637+ /// Checks if mlx5dv (Mellanox device-specific verbs extension) is supported.
638+ ///
639+ /// This function attempts to open the first available RDMA device and check if
640+ /// mlx5dv extensions can be initialized. The mlx5dv extensions are required for
641+ /// advanced features like GPU Direct RDMA and direct queue pair manipulation.
642+ ///
643+ /// The result is cached after the first call, making subsequent calls essentially free.
644+ ///
645+ /// # Returns
646+ ///
647+ /// `true` if mlx5dv extensions are supported, `false` otherwise.
648+ pub fn mlx5dv_supported ( ) -> bool {
649+ * MLX5DV_SUPPORTED_CACHE . get_or_init ( mlx5dv_supported_impl)
650+ }
651+
652+ fn mlx5dv_supported_impl ( ) -> bool {
653+ // SAFETY: We are calling C functions from libibverbs and libmlx5.
654+ unsafe {
655+ let mut num_devices = 0 ;
656+ let device_list = rdmaxcel_sys:: ibv_get_device_list ( & mut num_devices) ;
657+
658+ // Compute result in a block, ensuring cleanup happens afterward
659+ let result = {
660+ if device_list. is_null ( ) || num_devices == 0 {
661+ false
662+ } else {
663+ // Try to open the first device and check mlx5dv support
664+ let device = * device_list;
665+ let mut mlx5dv_supported = false ;
666+
667+ if !device. is_null ( ) {
668+ let context = rdmaxcel_sys:: ibv_open_device ( device) ;
669+ if !context. is_null ( ) {
670+ // Try to query device capabilities with mlx5dv
671+ let mut attrs_out = rdmaxcel_sys:: mlx5dv_context:: default ( ) ;
672+
673+ // mlx5dv_query_device returns 0 on success
674+ if rdmaxcel_sys:: mlx5dv_query_device ( context, & mut attrs_out) == 0 {
675+ mlx5dv_supported = true ;
676+ }
677+
678+ rdmaxcel_sys:: ibv_close_device ( context) ;
679+ }
680+ }
681+ mlx5dv_supported
682+ }
683+ } ;
684+
685+ rdmaxcel_sys:: ibv_free_device_list ( device_list) ;
686+ result
687+ }
688+ }
689+
690+ /// Cached result of ibverbs support check.
691+ static IBVERBS_SUPPORTED_CACHE : OnceLock < bool > = OnceLock :: new ( ) ;
692+
633693/// Checks if ibverbs devices can be retrieved successfully.
634694///
635695/// This function attempts to retrieve the list of RDMA devices using the
636696/// `ibv_get_device_list` function from the ibverbs library. It returns `true`
637697/// if devices are found, and `false` otherwise.
638698///
699+ /// The result is cached after the first call, making subsequent calls essentially free.
700+ ///
639701/// # Returns
640702///
641703/// `true` if devices are successfully retrieved, `false` otherwise.
642704pub fn ibverbs_supported ( ) -> bool {
705+ * IBVERBS_SUPPORTED_CACHE . get_or_init ( ibverbs_supported_impl)
706+ }
707+
708+ fn ibverbs_supported_impl ( ) -> bool {
643709 // SAFETY: We are calling a C function from libibverbs.
644710 unsafe {
645711 let mut num_devices = 0 ;
@@ -651,6 +717,25 @@ pub fn ibverbs_supported() -> bool {
651717 }
652718}
653719
720+ /// Checks if RDMA is fully supported on this system.
721+ ///
722+ /// This is the canonical function to check if RDMA can be used. It verifies both:
723+ /// 1. Basic ibverbs device availability (`ibverbs_supported()`)
724+ /// 2. mlx5dv device-specific extensions (`mlx5dv_supported()`)
725+ ///
726+ /// mlx5dv extensions are required for this library's advanced features including
727+ /// GPU Direct RDMA and direct queue pair manipulation. Systems with non-Mellanox
728+ /// RDMA devices will have `ibverbs_supported() == true` but `rdma_supported() == false`.
729+ ///
730+ /// The result is cached after the first call, making subsequent calls essentially free.
731+ ///
732+ /// # Returns
733+ ///
734+ /// `true` if both ibverbs devices and mlx5dv extensions are available, `false` otherwise.
735+ pub fn rdma_supported ( ) -> bool {
736+ ibverbs_supported ( ) && mlx5dv_supported ( )
737+ }
738+
654739/// Represents a view of a memory region that can be registered with an RDMA device.
655740///
656741/// This is a 'view' of a registered Memory Region, allowing multiple views into a single
@@ -1016,4 +1101,31 @@ mod tests {
10161101 let formatted = format_gid ( & gid) ;
10171102 assert_eq ! ( formatted, "1234:5678:9abc:def0:1122:3344:5566:7788" ) ;
10181103 }
1104+
#[test]
fn test_mlx5dv_supported_basic() {
    // Smoke test: the answer depends on the host's hardware, so nothing is
    // asserted about the value. The call only has to complete without panicking.
    let supported = mlx5dv_supported();
    println!("mlx5dv_supported: {}", supported);
}
1111+
#[test]
fn test_rdma_supported_combines_checks() {
    // Sample each check once, in the same order every run.
    let has_ibverbs = ibverbs_supported();
    let has_mlx5dv = mlx5dv_supported();
    let has_rdma = rdma_supported();

    // The combined check must be the logical AND of the two individual checks.
    let expected = has_ibverbs && has_mlx5dv;
    assert_eq!(
        has_rdma, expected,
        "rdma_supported should equal (ibverbs_supported && mlx5dv_supported)"
    );

    // Print the observed values to help when diagnosing a failing host.
    println!(
        "ibverbs_supported: {}, mlx5dv_supported: {}, rdma_supported: {}",
        has_ibverbs, has_mlx5dv, has_rdma
    );
}
10191131}
0 commit comments