diff --git a/buildlib/az-helpers.sh b/buildlib/az-helpers.sh
index d53d876ffd0..fc7aeead0a8 100644
--- a/buildlib/az-helpers.sh
+++ b/buildlib/az-helpers.sh
@@ -197,12 +197,6 @@ try_load_cuda_env() {
     then
         az_module_load dev/gdrcopy2.4.1_cuda12.5.1 && have_gdrcopy=yes
     fi
-
-    # Set CUDA_VISIBLE_DEVICES
-    if [ -n "${worker}" ]
-    then
-        export CUDA_VISIBLE_DEVICES=$((worker % num_gpus))
-    fi
 }
 
 load_cuda_env() {
diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
index df46630726a..454c4597308 100644
--- a/src/uct/cuda/cuda_copy/cuda_copy_md.c
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -571,6 +571,68 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
     return 1; /* return 1 byte to avoid division by zero */
 }
 
+/**
+ * Look up the memory allocation that contains a given address.
+ *
+ * @param [in]  address        Address to look up
+ * @param [in]  length         Length of the queried region; reported as the
+ *                             allocation size if lookup fails with NULL ctx
+ * @param [in]  ctx            CUDA context on which the pointer was allocated,
+ *                             NULL in case of VMM
+ * @param [out] base_address_p Returned base address (@a address on fallback)
+ * @param [out] alloc_length_p Returned size of the memory allocation
+ *
+ * @return Error code as defined by @ref ucs_status_t.
+ */
+static ucs_status_t
+uct_cuda_copy_md_get_address_range(const void *address, size_t length,
+                                   CUcontext ctx, void **base_address_p,
+                                   size_t *alloc_length_p)
+{
+    const int has_ctx = (ctx != NULL);
+    ucs_log_level_t log_level;
+    ucs_status_t query_status, pop_status;
+    CUcontext popped_ctx;
+    CUdeviceptr base;
+    size_t size;
+
+    if (!has_ctx) {
+        /* Without a context a failed lookup is tolerated, log it quietly */
+        log_level = UCS_LOG_LEVEL_DEBUG;
+    } else {
+        /* cuMemGetAddressRange needs a current context; on DGXA100 pushing and
+         * popping the context associated with the address takes 0.03us. */
+        log_level    = UCS_LOG_LEVEL_ERROR;
+        query_status = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPushCurrent(ctx));
+        if (query_status != UCS_OK) {
+            return query_status;
+        }
+    }
+
+    query_status = UCT_CUDADRV_FUNC(cuMemGetAddressRange(&base, &size,
+                                                         (CUdeviceptr)address),
+                                    log_level);
+    if (has_ctx) {
+        pop_status = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPopCurrent(&popped_ctx));
+        if (query_status != UCS_OK) {
+            /* Lookup failure under a pushed context is a hard error */
+            return UCS_ERR_INVALID_ADDR;
+        }
+        if (pop_status != UCS_OK) {
+            return pop_status;
+        }
+    } else if (query_status != UCS_OK) {
+        /* Lookup failed without a context: fall back to the caller's range */
+        *base_address_p = (void*)address;
+        *alloc_length_p = length;
+        return UCS_OK;
+    }
+
+    *base_address_p = (void*)base;
+    *alloc_length_p = size;
+    return UCS_OK;
+}
+
 static ucs_status_t
 uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
                                   size_t length, ucs_memory_info_t *mem_info)
@@ -582,11 +644,11 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
     CUcontext cuda_mem_ctx     = NULL;
     CUpointer_attribute attr_type[UCT_CUDA_MEM_QUERY_NUM_ATTRS];
     void *attr_data[UCT_CUDA_MEM_QUERY_NUM_ATTRS];
-    CUdeviceptr base_address;
+    void *base_address;
     size_t alloc_length;
     size_t total_bytes;
     int32_t pref_loc;
-    unsigned is_vmm;
+    int is_vmm;
     CUresult cu_err;
     ucs_status_t status;
 
@@ -681,16 +743,14 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
         goto out_default_range;
     }
 
-    cu_err = cuMemGetAddressRange(&base_address, &alloc_length,
-                                  (CUdeviceptr)address);
-    if (cu_err != CUDA_SUCCESS) {
-        ucs_error("cuMemGetAddressRange(%p) error: %s", address,
-                  uct_cuda_base_cu_get_error_string(cu_err));
-        return UCS_ERR_INVALID_ADDR;
+    status = uct_cuda_copy_md_get_address_range(address, length, cuda_mem_ctx,
+                                                &base_address, &alloc_length);
+    if (status != UCS_OK) {
+        return status;
     }
 
-    ucs_trace("query address %p: 0x%llx..0x%llx length %zu", address,
-              base_address, base_address + alloc_length, alloc_length);
+    ucs_trace("query address %p: %p..%p length %zu", address, base_address,
+              UCS_PTR_BYTE_OFFSET(base_address, alloc_length), alloc_length);
 
     if (md->config.alloc_whole_reg == UCS_CONFIG_AUTO) {
         total_bytes = uct_cuda_copy_md_get_total_device_mem(cuda_device);
@@ -701,7 +761,7 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
         ucs_assert(md->config.alloc_whole_reg == UCS_CONFIG_ON);
     }
 
-    mem_info->base_address = (void*)base_address;
+    mem_info->base_address = base_address;
     mem_info->alloc_length = alloc_length;
     return UCS_OK;
 
diff --git a/test/gtest/Makefile.am b/test/gtest/Makefile.am
index a2f265fcf07..68a1ed353bc 100644
--- a/test/gtest/Makefile.am
+++ b/test/gtest/Makefile.am
@@ -260,6 +260,7 @@ if HAVE_CUDA
 gtest_SOURCES += \
 	common/cuda_context.cc \
 	ucm/cuda_hooks.cc \
+	uct/cuda/test_cuda_copy_md.cc \
 	uct/cuda/test_cuda_ipc_md.cc
 gtest_CPPFLAGS += \
 	$(CUDA_CPPFLAGS)
diff --git a/test/gtest/uct/cuda/test_cuda_copy_md.cc b/test/gtest/uct/cuda/test_cuda_copy_md.cc
new file mode 100644
index 00000000000..b230038c401
--- /dev/null
+++ b/test/gtest/uct/cuda/test_cuda_copy_md.cc
@@ -0,0 +1,60 @@
+/**
+ * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include <uct/test_md.h>
+
+#include <cuda_runtime.h>
+
+/* Fixture with a helper that allocates CUDA memory through the tested MD */
+class test_cuda_copy_md_multi_gpu : public test_md {
+public:
+    uct_allocated_memory_t mem_alloc(size_t size) const;
+};
+
+uct_allocated_memory_t test_cuda_copy_md_multi_gpu::mem_alloc(size_t size) const
+{
+    uct_alloc_method_t method = UCT_ALLOC_METHOD_MD;
+    uct_md_h md               = m_md.get();
+
+    uct_mem_alloc_params_t params;
+    params.field_mask = UCT_MEM_ALLOC_PARAM_FIELD_MEM_TYPE |
+                        UCT_MEM_ALLOC_PARAM_FIELD_MDS |
+                        UCT_MEM_ALLOC_PARAM_FIELD_NAME;
+    params.mem_type   = UCS_MEMORY_TYPE_CUDA;
+    params.mds.mds    = &md;
+    params.mds.count  = 1;
+    params.name       = "test_cuda_copy_md_multi_gpu";
+    /* Zero-init: a failed allocation must not return indeterminate data */
+    uct_allocated_memory_t mem = {};
+    EXPECT_EQ(uct_mem_alloc(size, &method, 1, &params, &mem), UCS_OK);
+    return mem;
+}
+
+UCS_TEST_P(test_cuda_copy_md_multi_gpu, mem_query) {
+    int num_devices;
+    ASSERT_EQ(cudaGetDeviceCount(&num_devices), cudaSuccess);
+    /* A second device is needed to allocate away from the current one */
+    if (num_devices < 2) {
+        UCS_TEST_SKIP_R("less than two cuda devices available");
+    }
+    /* Make a different device current for the allocation */
+    int device;
+    ASSERT_EQ(cudaGetDevice(&device), cudaSuccess);
+    ASSERT_EQ(cudaSetDevice((device + 1) % num_devices), cudaSuccess);
+
+    const size_t size = 16;
+    auto mem          = mem_alloc(size);
+    /* Switch back, so the query runs with the original device current */
+    EXPECT_EQ(cudaSetDevice(device), cudaSuccess);
+
+    uct_md_mem_attr_t mem_attr = {};
+    mem_attr.field_mask        = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE;
+    EXPECT_EQ(uct_md_mem_query(m_md.get(), mem.address, size, &mem_attr),
+              UCS_OK);
+    EXPECT_EQ(mem_attr.mem_type, UCS_MEMORY_TYPE_CUDA);
+    EXPECT_EQ(uct_mem_free(&mem), UCS_OK);
+}
+/* Instantiate the fixture for cuda_cpy - presumably the cuda_copy MD name */
+_UCT_MD_INSTANTIATE_TEST_CASE(test_cuda_copy_md_multi_gpu, cuda_cpy);