UCT/CUDA/CUDA_COPY: Enabled memory attributes query after switching CUDA GPU. #10388

Open · wants to merge 13 commits into base: master
6 changes: 0 additions & 6 deletions buildlib/az-helpers.sh
@@ -197,12 +197,6 @@ try_load_cuda_env() {
     then
         az_module_load dev/gdrcopy2.4.1_cuda12.5.1 && have_gdrcopy=yes
     fi
-
-    # Set CUDA_VISIBLE_DEVICES
-    if [ -n "${worker}" ]
-    then
-        export CUDA_VISIBLE_DEVICES=$((worker % num_gpus))
-    fi
 }
 
 load_cuda_env() {
82 changes: 71 additions & 11 deletions src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -571,6 +571,68 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
return 1; /* return 1 byte to avoid division by zero */
}

/**
* Get information on memory allocations.
*
* @param [in] address Pointer to the memory allocation to query
* @param [in] length Size of the allocation
* @param [in] ctx CUDA context on which a pointer was allocated.
* NULL in case of VMM
* @param [out] base_address_p Returned base address
* @param [out] alloc_length_p Returned size of the memory allocation
*
* @return Error code as defined by @ref ucs_status_t.
*/
static ucs_status_t
uct_cuda_copy_md_get_address_range(const void *address, size_t length,
CUcontext ctx, void **base_address_p,
size_t *alloc_length_p)
{
ucs_log_level_t log_level = (ctx == NULL) ? UCS_LOG_LEVEL_DEBUG :
UCS_LOG_LEVEL_ERROR;
ucs_status_t status;
CUdeviceptr base;
size_t size;
ucs_status_t status_ctx_pop;
CUcontext popped_ctx;

if (ctx != NULL) {
/* GetAddressRange requires context to be set. On DGXA100 it takes
* 0.03us to push and pop the context associated with address. */
status = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPushCurrent(ctx));
if (status != UCS_OK) {
return status;
}
}

status = UCT_CUDADRV_FUNC(cuMemGetAddressRange(&base, &size,
(CUdeviceptr)address),
log_level);
if (ctx != NULL) {
status_ctx_pop = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPopCurrent(&popped_ctx));
if (status != UCS_OK) {
/* cuMemGetAddressRange failed after pushing non-NULL context */
return UCS_ERR_INVALID_ADDR;
}

if (status_ctx_pop != UCS_OK) {
return status_ctx_pop;
}
}

if (status == UCS_OK) {
*base_address_p = (void*)base;
*alloc_length_p = size;
} else {
/* Use default values when cuMemGetAddressRange failed without pushing
* non-NULL context */
*base_address_p = (void*)address;
*alloc_length_p = length;
}

return UCS_OK;
}

static ucs_status_t
uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
size_t length, ucs_memory_info_t *mem_info)
@@ -582,11 +644,11 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
     CUcontext cuda_mem_ctx = NULL;
     CUpointer_attribute attr_type[UCT_CUDA_MEM_QUERY_NUM_ATTRS];
     void *attr_data[UCT_CUDA_MEM_QUERY_NUM_ATTRS];
-    CUdeviceptr base_address;
+    void *base_address;
     size_t alloc_length;
     size_t total_bytes;
     int32_t pref_loc;
-    unsigned is_vmm;
+    int is_vmm;
     CUresult cu_err;
     ucs_status_t status;
 
@@ -681,16 +743,14 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
         goto out_default_range;
     }
 
-    cu_err = cuMemGetAddressRange(&base_address, &alloc_length,
-                                  (CUdeviceptr)address);
-    if (cu_err != CUDA_SUCCESS) {
-        ucs_error("cuMemGetAddressRange(%p) error: %s", address,
-                  uct_cuda_base_cu_get_error_string(cu_err));
-        return UCS_ERR_INVALID_ADDR;
+    status = uct_cuda_copy_md_get_address_range(address, length, cuda_mem_ctx,
+                                                &base_address, &alloc_length);
+    if (status != UCS_OK) {
+        return status;
     }
 
-    ucs_trace("query address %p: 0x%llx..0x%llx length %zu", address,
-              base_address, base_address + alloc_length, alloc_length);
+    ucs_trace("query address %p: %p..%p length %zu", address, base_address,
+              UCS_PTR_BYTE_OFFSET(base_address, alloc_length), alloc_length);
 
     if (md->config.alloc_whole_reg == UCS_CONFIG_AUTO) {
         total_bytes = uct_cuda_copy_md_get_total_device_mem(cuda_device);
@@ -701,7 +761,7 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
         ucs_assert(md->config.alloc_whole_reg == UCS_CONFIG_ON);
     }
 
-    mem_info->base_address = (void*)base_address;
+    mem_info->base_address = base_address;
     mem_info->alloc_length = alloc_length;
     return UCS_OK;
 
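For reference, the push/pop flow that the new uct_cuda_copy_md_get_address_range() helper wraps can be reproduced with public CUDA driver-API calls. The sketch below is illustrative only and is not part of this PR; it assumes at least two visible CUDA devices and omits most error handling.

/*
 * Illustrative sketch (not part of this PR): query the address range of an
 * allocation owned by another device's context by temporarily pushing the
 * owning context, mirroring what the new helper does.
 */
#include <cuda.h>
#include <stdio.h>

int main(void)
{
    CUdevice dev0, dev1;
    CUcontext ctx0, ctx1, owner, popped;
    CUdeviceptr ptr, base;
    size_t size;
    CUresult res;

    cuInit(0);
    cuDeviceGet(&dev0, 0);
    cuDeviceGet(&dev1, 1);
    cuDevicePrimaryCtxRetain(&ctx0, dev0);
    cuDevicePrimaryCtxRetain(&ctx1, dev1);

    /* Allocate on device 0 */
    cuCtxSetCurrent(ctx0);
    cuMemAlloc(&ptr, 4096);

    /* The application later switches to device 1, e.g. via cudaSetDevice() */
    cuCtxSetCurrent(ctx1);

    /* Look up the context that owns the pointer, push it around
     * cuMemGetAddressRange(), then restore the previous current context
     * (the same pattern the new helper uses) */
    cuPointerGetAttribute(&owner, CU_POINTER_ATTRIBUTE_CONTEXT, ptr);
    cuCtxPushCurrent(owner);
    res = cuMemGetAddressRange(&base, &size, ptr);
    cuCtxPopCurrent(&popped);

    printf("range query: res=%d base=%p size=%zu\n",
           (int)res, (void*)base, size);

    /* Cleanup */
    cuCtxSetCurrent(ctx0);
    cuMemFree(ptr);
    cuDevicePrimaryCtxRelease(dev0);
    cuDevicePrimaryCtxRelease(dev1);
    return 0;
}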
1 change: 1 addition & 0 deletions test/gtest/Makefile.am
@@ -260,6 +260,7 @@ if HAVE_CUDA
 gtest_SOURCES += \
     common/cuda_context.cc \
     ucm/cuda_hooks.cc \
+    uct/cuda/test_cuda_copy_md.cc \
     uct/cuda/test_cuda_ipc_md.cc
 gtest_CPPFLAGS += \
     $(CUDA_CPPFLAGS)
60 changes: 60 additions & 0 deletions test/gtest/uct/cuda/test_cuda_copy_md.cc
@@ -0,0 +1,60 @@
/**
* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/

#include <uct/test_md.h>

#include <cuda_runtime.h>

class test_cuda_copy_md_multi_gpu : public test_md {
public:
uct_allocated_memory_t mem_alloc(size_t size) const;
};

uct_allocated_memory_t test_cuda_copy_md_multi_gpu::mem_alloc(size_t size) const
{
uct_alloc_method_t method = UCT_ALLOC_METHOD_MD;
uct_md_h md = m_md.get();

uct_mem_alloc_params_t params;
params.field_mask = UCT_MEM_ALLOC_PARAM_FIELD_MEM_TYPE |
UCT_MEM_ALLOC_PARAM_FIELD_MDS |
UCT_MEM_ALLOC_PARAM_FIELD_NAME;
params.mem_type = UCS_MEMORY_TYPE_CUDA;
params.mds.mds = &md;
params.mds.count = 1;
params.name = "test_cuda_copy_md_multi_gpu";

uct_allocated_memory_t mem;
EXPECT_EQ(uct_mem_alloc(size, &method, 1, &params, &mem), UCS_OK);
return mem;
}

UCS_TEST_P(test_cuda_copy_md_multi_gpu, mem_query) {
int num_devices;
ASSERT_EQ(cudaGetDeviceCount(&num_devices), cudaSuccess);

if (num_devices < 2) {
UCS_TEST_SKIP_R("less than two cuda devices available");
}

int device;
ASSERT_EQ(cudaGetDevice(&device), cudaSuccess);
ASSERT_EQ(cudaSetDevice((device + 1) % num_devices), cudaSuccess);

const size_t size = 16;
auto mem = mem_alloc(size);

EXPECT_EQ(cudaSetDevice(device), cudaSuccess);

uct_md_mem_attr_t mem_attr = {};
mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE;
EXPECT_EQ(uct_md_mem_query(m_md.get(), mem.address, size, &mem_attr),
UCS_OK);
EXPECT_EQ(mem_attr.mem_type, UCS_MEMORY_TYPE_CUDA);
EXPECT_EQ(uct_mem_free(&mem), UCS_OK);
}

_UCT_MD_INSTANTIATE_TEST_CASE(test_cuda_copy_md_multi_gpu, cuda_cpy);
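A quick way to run just the new case, assuming UCX was configured with --with-cuda and the gtest binary was built under test/gtest (the test skips itself on machines with fewer than two CUDA devices), would be something like:

./test/gtest/gtest --gtest_filter='*test_cuda_copy_md_multi_gpu*'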