From 1adcb5e3cefe59b892e3c3fe8aa6d15451f76da4 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Tue, 17 Dec 2024 20:02:16 +0200 Subject: [PATCH 01/12] TEST/GTEST: Added cuda gpu switching testing. --- test/gtest/common/mem_buffer.cc | 6 ++-- test/gtest/common/mem_buffer.h | 2 +- test/gtest/ucp/test_ucp_mmap.cc | 52 +++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 4 deletions(-) diff --git a/test/gtest/common/mem_buffer.cc b/test/gtest/common/mem_buffer.cc index 5b5222b768c..7bc8afb713a 100644 --- a/test/gtest/common/mem_buffer.cc +++ b/test/gtest/common/mem_buffer.cc @@ -169,7 +169,7 @@ bool mem_buffer::is_mem_type_supported(ucs_memory_type_t mem_type) mem_types.end(); } -void mem_buffer::set_device_context() +void mem_buffer::set_device_context(int device) { static __thread bool device_set = false; @@ -179,7 +179,7 @@ void mem_buffer::set_device_context() #if HAVE_CUDA if (is_cuda_supported()) { - cudaSetDevice(0); + cudaSetDevice(device); /* need to call free as context maybe lazily initialized when calling * cudaSetDevice(0) but calling cudaFree(0) should guarantee context * creation upon return */ @@ -189,7 +189,7 @@ void mem_buffer::set_device_context() #if HAVE_ROCM if (is_rocm_supported()) { - hipSetDevice(0); + hipSetDevice(device); } #endif diff --git a/test/gtest/common/mem_buffer.h b/test/gtest/common/mem_buffer.h index 4b1c285b2b8..9c45c8466e2 100644 --- a/test/gtest/common/mem_buffer.h +++ b/test/gtest/common/mem_buffer.h @@ -86,7 +86,7 @@ class mem_buffer { static bool is_gpu_supported(); /* set device context if compiled with GPU support */ - static void set_device_context(); + static void set_device_context(int device = 0); /* returns whether ROCM device supports managed memory */ static bool is_rocm_managed_supported(); diff --git a/test/gtest/ucp/test_ucp_mmap.cc b/test/gtest/ucp/test_ucp_mmap.cc index 21b6cc3d2da..f9027258713 100644 --- a/test/gtest/ucp/test_ucp_mmap.cc +++ b/test/gtest/ucp/test_ucp_mmap.cc @@ -17,6 +17,10 @@ extern "C" { #include } +#if HAVE_CUDA +#include +#endif + #include #include @@ -1248,3 +1252,51 @@ UCS_TEST_P(test_ucp_mmap_export, export_import) { } UCP_INSTANTIATE_TEST_CASE_GPU_AWARE(test_ucp_mmap_export) + +#if HAVE_CUDA +class test_ucp_mmap_mgpu : public ucs::test { +}; + +UCS_TEST_F(test_ucp_mmap_mgpu, switch_gpu) { + if (!mem_buffer::is_mem_type_supported(UCS_MEMORY_TYPE_CUDA)) { + UCS_TEST_SKIP_R("cuda is not supported"); + } + + int num_devices; + ASSERT_EQ(cudaGetDeviceCount(&num_devices), cudaSuccess); + + if (num_devices < 2) { + UCS_TEST_SKIP_R("less than two cuda devices available"); + } + + ucs::handle config; + UCS_TEST_CREATE_HANDLE(ucp_config_t*, config, ucp_config_release, + ucp_config_read, NULL, NULL); + + ucs::handle context; + ucp_params_t params; + params.field_mask = UCP_PARAM_FIELD_FEATURES; + params.features = UCP_FEATURE_TAG; + UCS_TEST_CREATE_HANDLE(ucp_context_h, context, ucp_cleanup, ucp_init, + ¶ms, config.get()); + + int device; + ASSERT_EQ(cudaGetDevice(&device), cudaSuccess); + ASSERT_EQ(cudaSetDevice((device + 1) % num_devices), cudaSuccess); + + const size_t size = 16; + mem_buffer buffer(size, UCS_MEMORY_TYPE_CUDA); + + ASSERT_EQ(cudaSetDevice(device), cudaSuccess); + + ucp_mem_map_params_t mem_map_params; + mem_map_params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | + UCP_MEM_MAP_PARAM_FIELD_LENGTH; + mem_map_params.address = buffer.ptr(); + mem_map_params.length = size; + + ucp_mem_h ucp_mem; + ASSERT_EQ(ucp_mem_map(context.get(), &mem_map_params, &ucp_mem), UCS_OK); + 
EXPECT_EQ(ucp_mem_unmap(context.get(), ucp_mem), UCS_OK); +} +#endif From 18daf411b29a4872e9e025045e4ae3a2a71adf47 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Fri, 7 Feb 2025 19:14:53 +0200 Subject: [PATCH 02/12] UCT/CUDA/CUDA_COPY: push cuda context before getting address range. --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index a185dde3779..946b2669245 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -643,8 +643,13 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, goto out_default_range; } + /* GetAddressRange requires context to be set. On DGXA100 it takes 0.03 us + * to push and pop the context associated with address (which should be + * non-NULL if we are at this point). */ + cuCtxPushCurrent(cuda_mem_ctx); cu_err = cuMemGetAddressRange(&base_address, &alloc_length, (CUdeviceptr)address); + cuCtxPopCurrent(&cuda_mem_ctx); if (cu_err != CUDA_SUCCESS) { ucs_error("cuMemGetAddressRange(%p) error: %s", address, uct_cuda_base_cu_get_error_string(cu_err)); From 276df75c3f54ba2e49f36d8d07808c92ba9286b8 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Tue, 11 Feb 2025 21:16:04 +0200 Subject: [PATCH 03/12] UCT/CUDA/CUDA_COPY: WA for vmm. --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 88 ++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 16 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 946b2669245..982cb6d4ecf 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -533,6 +533,69 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) #endif } +/** + * Get information on memory allocations. + * + * @param [in] address Pointer to the memory allocation to query + * @param [in] length Size of the allocation + * @param [in] ctx CUDA context on which a pointer was allocated. + * NULL in case of VMM + * @param [out] base_address_p Returned base address + * @param [out] alloc_length_p Returned size of the memory allocation + * + * @return Error code as defined by @ref ucs_status_t. + */ +static ucs_status_t +uct_cuda_copy_md_get_address_range(const void *address, size_t length, + CUcontext ctx, void **base_address_p, + size_t *alloc_length_p) +{ + ucs_log_level_t log_level = (ctx != NULL) ? UCS_LOG_LEVEL_DEBUG : + UCS_LOG_LEVEL_ERROR; + ucs_status_t status; + CUdeviceptr base; + size_t size; + ucs_status_t status_ctx_pop; + CUcontext popped_ctx; + + if (ctx != NULL) { + /* GetAddressRange requires context to be set. On DGXA100 it takes + * 0.03us to push and pop the context associated with address (which + * should be non-NULL if we are at this point). 
*/ + status = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPushCurrent(ctx)); + if (status != UCS_OK) { + return status; + } + } + + status = UCT_CUDADRV_FUNC(cuMemGetAddressRange(&base, &size, + (CUdeviceptr)address), + log_level); + if (ctx != NULL) { + status_ctx_pop = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPopCurrent(&popped_ctx)); + if (status_ctx_pop != UCS_OK) { + return status_ctx_pop; + } + } + + if ((status != UCS_OK) && (ctx != NULL)) { + /* cuMemGetAddressRange failed after pushing non-NULL context */ + return UCS_ERR_INVALID_ADDR; + } + + if (status == UCS_OK) { + *base_address_p = (void*)base; + *alloc_length_p = size; + } else { + /* Use default values when cuMemGetAddressRange failed without pushing + * non-NULL context */ + *base_address_p = address; + *alloc_length_p = length; + } + + return UCS_OK; +} + static ucs_status_t uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, size_t length, ucs_memory_info_t *mem_info) @@ -544,11 +607,11 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, CUcontext cuda_mem_ctx = NULL; CUpointer_attribute attr_type[UCT_CUDA_MEM_QUERY_NUM_ATTRS]; void *attr_data[UCT_CUDA_MEM_QUERY_NUM_ATTRS]; - CUdeviceptr base_address; + void *base_address; size_t alloc_length; size_t total_bytes; int32_t pref_loc; - unsigned is_vmm; + int is_vmm; CUresult cu_err; ucs_status_t status; @@ -643,21 +706,14 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, goto out_default_range; } - /* GetAddressRange requires context to be set. On DGXA100 it takes 0.03 us - * to push and pop the context associated with address (which should be - * non-NULL if we are at this point). */ - cuCtxPushCurrent(cuda_mem_ctx); - cu_err = cuMemGetAddressRange(&base_address, &alloc_length, - (CUdeviceptr)address); - cuCtxPopCurrent(&cuda_mem_ctx); - if (cu_err != CUDA_SUCCESS) { - ucs_error("cuMemGetAddressRange(%p) error: %s", address, - uct_cuda_base_cu_get_error_string(cu_err)); - return UCS_ERR_INVALID_ADDR; + status = uct_cuda_copy_md_get_address_range(address, length, cuda_mem_ctx, + &base_address, &alloc_length); + if (status != UCS_OK) { + return status; } - ucs_trace("query address %p: 0x%llx..0x%llx length %zu", address, - base_address, base_address + alloc_length, alloc_length); + ucs_trace("query address %p: %p..%p length %zu", address, base_address, + UCS_PTR_BYTE_OFFSET(base_address, alloc_length), alloc_length); if (md->config.alloc_whole_reg == UCS_CONFIG_AUTO) { total_bytes = uct_cuda_copy_md_get_total_device_mem(cuda_device); @@ -668,7 +724,7 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, ucs_assert(md->config.alloc_whole_reg == UCS_CONFIG_ON); } - mem_info->base_address = (void*)base_address; + mem_info->base_address = base_address; mem_info->alloc_length = alloc_length; return UCS_OK; From 6d38f73048838c90fee9ffa8e468ace517262c51 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Tue, 11 Feb 2025 21:32:34 +0200 Subject: [PATCH 04/12] UCT/CUDA/CUDA_COPY: Fixed compilation warning. 
--- src/uct/cuda/cuda_copy/cuda_copy_md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index a18a5b27f1e..3c087520673 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -627,7 +627,7 @@ uct_cuda_copy_md_get_address_range(const void *address, size_t length, } else { /* Use default values when cuMemGetAddressRange failed without pushing * non-NULL context */ - *base_address_p = address; + *base_address_p = (void*)address; *alloc_length_p = length; } From 1a9871fc4741c3a1325c3c53d263bc57ea770aab Mon Sep 17 00:00:00 2001 From: rakhmets Date: Wed, 12 Feb 2025 17:02:41 +0200 Subject: [PATCH 05/12] UCT/CUDA/CUDA_COPY: Addressed review comments. --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 3c087520673..454c4597308 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -588,7 +588,7 @@ uct_cuda_copy_md_get_address_range(const void *address, size_t length, CUcontext ctx, void **base_address_p, size_t *alloc_length_p) { - ucs_log_level_t log_level = (ctx != NULL) ? UCS_LOG_LEVEL_DEBUG : + ucs_log_level_t log_level = (ctx == NULL) ? UCS_LOG_LEVEL_DEBUG : UCS_LOG_LEVEL_ERROR; ucs_status_t status; CUdeviceptr base; @@ -598,8 +598,7 @@ uct_cuda_copy_md_get_address_range(const void *address, size_t length, if (ctx != NULL) { /* GetAddressRange requires context to be set. On DGXA100 it takes - * 0.03us to push and pop the context associated with address (which - * should be non-NULL if we are at this point). */ + * 0.03us to push and pop the context associated with address. */ status = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPushCurrent(ctx)); if (status != UCS_OK) { return status; @@ -611,16 +610,16 @@ uct_cuda_copy_md_get_address_range(const void *address, size_t length, log_level); if (ctx != NULL) { status_ctx_pop = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPopCurrent(&popped_ctx)); + if (status != UCS_OK) { + /* cuMemGetAddressRange failed after pushing non-NULL context */ + return UCS_ERR_INVALID_ADDR; + } + if (status_ctx_pop != UCS_OK) { return status_ctx_pop; } } - if ((status != UCS_OK) && (ctx != NULL)) { - /* cuMemGetAddressRange failed after pushing non-NULL context */ - return UCS_ERR_INVALID_ADDR; - } - if (status == UCS_OK) { *base_address_p = (void*)base; *alloc_length_p = size; From 0858b3d56b98903d2a86f6a88d49c3fcf44fe3bf Mon Sep 17 00:00:00 2001 From: rakhmets Date: Thu, 13 Feb 2025 16:35:33 +0200 Subject: [PATCH 06/12] TEST/GTEST/UCT/CUDA: Added UCT test. 
--- test/gtest/Makefile.am | 1 + test/gtest/uct/cuda/test_cuda_copy_md.cc | 41 ++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 test/gtest/uct/cuda/test_cuda_copy_md.cc diff --git a/test/gtest/Makefile.am b/test/gtest/Makefile.am index a2f265fcf07..68a1ed353bc 100644 --- a/test/gtest/Makefile.am +++ b/test/gtest/Makefile.am @@ -260,6 +260,7 @@ if HAVE_CUDA gtest_SOURCES += \ common/cuda_context.cc \ ucm/cuda_hooks.cc \ + uct/cuda/test_cuda_copy_md.cc \ uct/cuda/test_cuda_ipc_md.cc gtest_CPPFLAGS += \ $(CUDA_CPPFLAGS) diff --git a/test/gtest/uct/cuda/test_cuda_copy_md.cc b/test/gtest/uct/cuda/test_cuda_copy_md.cc new file mode 100644 index 00000000000..e22b329c833 --- /dev/null +++ b/test/gtest/uct/cuda/test_cuda_copy_md.cc @@ -0,0 +1,41 @@ +/** + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#include + +#include + +class test_cuda_copy_md : public test_md { +}; + +UCS_TEST_P(test_cuda_copy_md, switch_gpu) { + if (!mem_buffer::is_mem_type_supported(UCS_MEMORY_TYPE_CUDA)) { + UCS_TEST_SKIP_R("cuda is not supported"); + } + + int num_devices; + ASSERT_EQ(cudaGetDeviceCount(&num_devices), cudaSuccess); + + if (num_devices < 2) { + UCS_TEST_SKIP_R("less than two cuda devices available"); + } + + int device; + ASSERT_EQ(cudaGetDevice(&device), cudaSuccess); + ASSERT_EQ(cudaSetDevice((device + 1) % num_devices), cudaSuccess); + + const size_t size = 16; + mem_buffer buffer(size, UCS_MEMORY_TYPE_CUDA); + + ASSERT_EQ(cudaSetDevice(device), cudaSuccess); + + ucs_memory_type_t mem_type; + ASSERT_EQ(uct_md_detect_memory_type(m_md, buffer.ptr(), size, &mem_type), + UCS_OK); + EXPECT_EQ(mem_type, UCS_MEMORY_TYPE_CUDA); +} + +_UCT_MD_INSTANTIATE_TEST_CASE(test_cuda_copy_md, cuda_cpy); From 0d57b51f4be90d6eae89c1447885b7f846a9f601 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Thu, 13 Feb 2025 17:55:53 +0200 Subject: [PATCH 07/12] GTEST/UCT/CUDA: Updated test. 
--- test/gtest/uct/cuda/test_cuda_copy_md.cc | 44 ++++++++++++++++-------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/test/gtest/uct/cuda/test_cuda_copy_md.cc b/test/gtest/uct/cuda/test_cuda_copy_md.cc index e22b329c833..e3a5c185fcf 100644 --- a/test/gtest/uct/cuda/test_cuda_copy_md.cc +++ b/test/gtest/uct/cuda/test_cuda_copy_md.cc @@ -6,16 +6,17 @@ #include +#include #include -class test_cuda_copy_md : public test_md { -}; +extern "C" { +#include +} -UCS_TEST_P(test_cuda_copy_md, switch_gpu) { - if (!mem_buffer::is_mem_type_supported(UCS_MEMORY_TYPE_CUDA)) { - UCS_TEST_SKIP_R("cuda is not supported"); - } +class test_cuda_copy_md_multi_gpu : public test_md { +}; +UCS_TEST_P(test_cuda_copy_md_multi_gpu, query_mem_type) { int num_devices; ASSERT_EQ(cudaGetDeviceCount(&num_devices), cudaSuccess); @@ -27,15 +28,28 @@ UCS_TEST_P(test_cuda_copy_md, switch_gpu) { ASSERT_EQ(cudaGetDevice(&device), cudaSuccess); ASSERT_EQ(cudaSetDevice((device + 1) % num_devices), cudaSuccess); - const size_t size = 16; - mem_buffer buffer(size, UCS_MEMORY_TYPE_CUDA); - - ASSERT_EQ(cudaSetDevice(device), cudaSuccess); - - ucs_memory_type_t mem_type; - ASSERT_EQ(uct_md_detect_memory_type(m_md, buffer.ptr(), size, &mem_type), + const int size = 16; + uct_alloc_method_t method = UCT_ALLOC_METHOD_MD; + uct_md_h md = m_md.get(); + uct_mem_alloc_params_t params; + params.field_mask = UCT_MEM_ALLOC_PARAM_FIELD_MEM_TYPE | + UCT_MEM_ALLOC_PARAM_FIELD_MDS | + UCT_MEM_ALLOC_PARAM_FIELD_NAME; + params.name = "test_cuda_copy_md_multi_gpu"; + params.mem_type = UCS_MEMORY_TYPE_CUDA; + params.mds.mds = &md; + params.mds.count = 1; + uct_allocated_memory_t mem; + ASSERT_EQ(uct_mem_alloc(size, &method, 1, ¶ms, &mem), UCS_OK); + + EXPECT_EQ(cudaSetDevice(device), cudaSuccess); + + uct_md_mem_attr_t mem_attr = {}; + mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE; + EXPECT_EQ(uct_md_mem_query(md, mem.address, size, &mem_attr), UCS_OK); - EXPECT_EQ(mem_type, UCS_MEMORY_TYPE_CUDA); + EXPECT_EQ(mem_attr.mem_type, UCS_MEMORY_TYPE_CUDA); + EXPECT_EQ(uct_mem_free(&mem), UCS_OK); } -_UCT_MD_INSTANTIATE_TEST_CASE(test_cuda_copy_md, cuda_cpy); +_UCT_MD_INSTANTIATE_TEST_CASE(test_cuda_copy_md_multi_gpu, cuda_cpy); From 7141389e1bc64392c295a9d8a018e39c701377f0 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Thu, 13 Feb 2025 18:07:38 +0200 Subject: [PATCH 08/12] GTEST/UCT/CUDA: Updated test. 
--- test/gtest/uct/cuda/test_cuda_copy_md.cc | 45 +++++++++++++----------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/test/gtest/uct/cuda/test_cuda_copy_md.cc b/test/gtest/uct/cuda/test_cuda_copy_md.cc index e3a5c185fcf..3f80d6328cd 100644 --- a/test/gtest/uct/cuda/test_cuda_copy_md.cc +++ b/test/gtest/uct/cuda/test_cuda_copy_md.cc @@ -6,17 +6,33 @@ #include -#include #include -extern "C" { -#include -} - class test_cuda_copy_md_multi_gpu : public test_md { +public: + uct_allocated_memory_t mem_alloc(size_t size) const; }; -UCS_TEST_P(test_cuda_copy_md_multi_gpu, query_mem_type) { +uct_allocated_memory_t test_cuda_copy_md_multi_gpu::mem_alloc(size_t size) const +{ + uct_alloc_method_t method = UCT_ALLOC_METHOD_MD; + uct_md_h md = m_md.get(); + + uct_mem_alloc_params_t params; + params.field_mask = UCT_MEM_ALLOC_PARAM_FIELD_MEM_TYPE | + UCT_MEM_ALLOC_PARAM_FIELD_MDS | + UCT_MEM_ALLOC_PARAM_FIELD_NAME; + params.mem_type = UCS_MEMORY_TYPE_CUDA; + params.mds.mds = &md; + params.mds.count = 1; + params.name = "test_cuda_copy_md_multi_gpu"; + + uct_allocated_memory_t mem; + EXPECT_EQ(uct_mem_alloc(size, &method, 1, ¶ms, &mem), UCS_OK); + return mem; +} + +UCS_TEST_P(test_cuda_copy_md_multi_gpu, mem_query) { int num_devices; ASSERT_EQ(cudaGetDeviceCount(&num_devices), cudaSuccess); @@ -28,25 +44,14 @@ UCS_TEST_P(test_cuda_copy_md_multi_gpu, query_mem_type) { ASSERT_EQ(cudaGetDevice(&device), cudaSuccess); ASSERT_EQ(cudaSetDevice((device + 1) % num_devices), cudaSuccess); - const int size = 16; - uct_alloc_method_t method = UCT_ALLOC_METHOD_MD; - uct_md_h md = m_md.get(); - uct_mem_alloc_params_t params; - params.field_mask = UCT_MEM_ALLOC_PARAM_FIELD_MEM_TYPE | - UCT_MEM_ALLOC_PARAM_FIELD_MDS | - UCT_MEM_ALLOC_PARAM_FIELD_NAME; - params.name = "test_cuda_copy_md_multi_gpu"; - params.mem_type = UCS_MEMORY_TYPE_CUDA; - params.mds.mds = &md; - params.mds.count = 1; - uct_allocated_memory_t mem; - ASSERT_EQ(uct_mem_alloc(size, &method, 1, ¶ms, &mem), UCS_OK); + const int size = 16; + auto mem = mem_alloc(size); EXPECT_EQ(cudaSetDevice(device), cudaSuccess); uct_md_mem_attr_t mem_attr = {}; mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE; - EXPECT_EQ(uct_md_mem_query(md, mem.address, size, &mem_attr), + EXPECT_EQ(uct_md_mem_query(m_md.get(), mem.address, size, &mem_attr), UCS_OK); EXPECT_EQ(mem_attr.mem_type, UCS_MEMORY_TYPE_CUDA); EXPECT_EQ(uct_mem_free(&mem), UCS_OK); From d709e1118635b30d81de10f7c75712e4a3519729 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Thu, 13 Feb 2025 18:16:40 +0200 Subject: [PATCH 09/12] TEST/GTEST: Added cuda gpu switching testing. Reverted. This reverts commit 1adcb5e3cefe59b892e3c3fe8aa6d15451f76da4. 
--- test/gtest/common/mem_buffer.cc | 6 ++-- test/gtest/common/mem_buffer.h | 2 +- test/gtest/ucp/test_ucp_mmap.cc | 52 --------------------------------- 3 files changed, 4 insertions(+), 56 deletions(-) diff --git a/test/gtest/common/mem_buffer.cc b/test/gtest/common/mem_buffer.cc index 7bc8afb713a..5b5222b768c 100644 --- a/test/gtest/common/mem_buffer.cc +++ b/test/gtest/common/mem_buffer.cc @@ -169,7 +169,7 @@ bool mem_buffer::is_mem_type_supported(ucs_memory_type_t mem_type) mem_types.end(); } -void mem_buffer::set_device_context(int device) +void mem_buffer::set_device_context() { static __thread bool device_set = false; @@ -179,7 +179,7 @@ void mem_buffer::set_device_context(int device) #if HAVE_CUDA if (is_cuda_supported()) { - cudaSetDevice(device); + cudaSetDevice(0); /* need to call free as context maybe lazily initialized when calling * cudaSetDevice(0) but calling cudaFree(0) should guarantee context * creation upon return */ @@ -189,7 +189,7 @@ void mem_buffer::set_device_context(int device) #if HAVE_ROCM if (is_rocm_supported()) { - hipSetDevice(device); + hipSetDevice(0); } #endif diff --git a/test/gtest/common/mem_buffer.h b/test/gtest/common/mem_buffer.h index 9c45c8466e2..4b1c285b2b8 100644 --- a/test/gtest/common/mem_buffer.h +++ b/test/gtest/common/mem_buffer.h @@ -86,7 +86,7 @@ class mem_buffer { static bool is_gpu_supported(); /* set device context if compiled with GPU support */ - static void set_device_context(int device = 0); + static void set_device_context(); /* returns whether ROCM device supports managed memory */ static bool is_rocm_managed_supported(); diff --git a/test/gtest/ucp/test_ucp_mmap.cc b/test/gtest/ucp/test_ucp_mmap.cc index f9027258713..21b6cc3d2da 100644 --- a/test/gtest/ucp/test_ucp_mmap.cc +++ b/test/gtest/ucp/test_ucp_mmap.cc @@ -17,10 +17,6 @@ extern "C" { #include } -#if HAVE_CUDA -#include -#endif - #include #include @@ -1252,51 +1248,3 @@ UCS_TEST_P(test_ucp_mmap_export, export_import) { } UCP_INSTANTIATE_TEST_CASE_GPU_AWARE(test_ucp_mmap_export) - -#if HAVE_CUDA -class test_ucp_mmap_mgpu : public ucs::test { -}; - -UCS_TEST_F(test_ucp_mmap_mgpu, switch_gpu) { - if (!mem_buffer::is_mem_type_supported(UCS_MEMORY_TYPE_CUDA)) { - UCS_TEST_SKIP_R("cuda is not supported"); - } - - int num_devices; - ASSERT_EQ(cudaGetDeviceCount(&num_devices), cudaSuccess); - - if (num_devices < 2) { - UCS_TEST_SKIP_R("less than two cuda devices available"); - } - - ucs::handle config; - UCS_TEST_CREATE_HANDLE(ucp_config_t*, config, ucp_config_release, - ucp_config_read, NULL, NULL); - - ucs::handle context; - ucp_params_t params; - params.field_mask = UCP_PARAM_FIELD_FEATURES; - params.features = UCP_FEATURE_TAG; - UCS_TEST_CREATE_HANDLE(ucp_context_h, context, ucp_cleanup, ucp_init, - ¶ms, config.get()); - - int device; - ASSERT_EQ(cudaGetDevice(&device), cudaSuccess); - ASSERT_EQ(cudaSetDevice((device + 1) % num_devices), cudaSuccess); - - const size_t size = 16; - mem_buffer buffer(size, UCS_MEMORY_TYPE_CUDA); - - ASSERT_EQ(cudaSetDevice(device), cudaSuccess); - - ucp_mem_map_params_t mem_map_params; - mem_map_params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | - UCP_MEM_MAP_PARAM_FIELD_LENGTH; - mem_map_params.address = buffer.ptr(); - mem_map_params.length = size; - - ucp_mem_h ucp_mem; - ASSERT_EQ(ucp_mem_map(context.get(), &mem_map_params, &ucp_mem), UCS_OK); - EXPECT_EQ(ucp_mem_unmap(context.get(), ucp_mem), UCS_OK); -} -#endif From e5f9f9e9c461a5160695481daa194749a5a22068 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Thu, 13 Feb 2025 18:38:51 +0200 
Subject: [PATCH 10/12] GTEST/UCT/CUDA: Fixed code format issues. --- test/gtest/uct/cuda/test_cuda_copy_md.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/gtest/uct/cuda/test_cuda_copy_md.cc b/test/gtest/uct/cuda/test_cuda_copy_md.cc index 3f80d6328cd..35ddd031bdd 100644 --- a/test/gtest/uct/cuda/test_cuda_copy_md.cc +++ b/test/gtest/uct/cuda/test_cuda_copy_md.cc @@ -20,7 +20,7 @@ uct_allocated_memory_t test_cuda_copy_md_multi_gpu::mem_alloc(size_t size) const uct_mem_alloc_params_t params; params.field_mask = UCT_MEM_ALLOC_PARAM_FIELD_MEM_TYPE | - UCT_MEM_ALLOC_PARAM_FIELD_MDS | + UCT_MEM_ALLOC_PARAM_FIELD_MDS | UCT_MEM_ALLOC_PARAM_FIELD_NAME; params.mem_type = UCS_MEMORY_TYPE_CUDA; params.mds.mds = &md; @@ -50,7 +50,7 @@ UCS_TEST_P(test_cuda_copy_md_multi_gpu, mem_query) { EXPECT_EQ(cudaSetDevice(device), cudaSuccess); uct_md_mem_attr_t mem_attr = {}; - mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE; + mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE; EXPECT_EQ(uct_md_mem_query(m_md.get(), mem.address, size, &mem_attr), UCS_OK); EXPECT_EQ(mem_attr.mem_type, UCS_MEMORY_TYPE_CUDA); From 20bdcbb41ec339c85af927a03cab9b65b279809d Mon Sep 17 00:00:00 2001 From: rakhmets Date: Fri, 14 Feb 2025 13:22:04 +0200 Subject: [PATCH 11/12] GTEST/UCT/CUDA: Fixed type. --- test/gtest/uct/cuda/test_cuda_copy_md.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/gtest/uct/cuda/test_cuda_copy_md.cc b/test/gtest/uct/cuda/test_cuda_copy_md.cc index 35ddd031bdd..b230038c401 100644 --- a/test/gtest/uct/cuda/test_cuda_copy_md.cc +++ b/test/gtest/uct/cuda/test_cuda_copy_md.cc @@ -44,8 +44,8 @@ UCS_TEST_P(test_cuda_copy_md_multi_gpu, mem_query) { ASSERT_EQ(cudaGetDevice(&device), cudaSuccess); ASSERT_EQ(cudaSetDevice((device + 1) % num_devices), cudaSuccess); - const int size = 16; - auto mem = mem_alloc(size); + const size_t size = 16; + auto mem = mem_alloc(size); EXPECT_EQ(cudaSetDevice(device), cudaSuccess); From 578d42a1e9f7afb56adaa36b26f5f10310c5ded7 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Fri, 14 Feb 2025 18:45:32 +0200 Subject: [PATCH 12/12] BUILDLIB: Removed setting visible device from load_cuda_env. --- buildlib/az-helpers.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/buildlib/az-helpers.sh b/buildlib/az-helpers.sh index d53d876ffd0..fc7aeead0a8 100644 --- a/buildlib/az-helpers.sh +++ b/buildlib/az-helpers.sh @@ -197,12 +197,6 @@ try_load_cuda_env() { then az_module_load dev/gdrcopy2.4.1_cuda12.5.1 && have_gdrcopy=yes fi - - # Set CUDA_VISIBLE_DEVICES - if [ -n "${worker}" ] - then - export CUDA_VISIBLE_DEVICES=$((worker % num_gpus)) - fi } load_cuda_env() {
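
Note on the pattern the series is built around (not part of the patches themselves): patches 02-05 all deal with the fact that cuMemGetAddressRange() only works when the CUDA context that owns the queried pointer is current, so uct_cuda_copy_md pushes that context before the call and pops it afterwards. The standalone sketch below illustrates that push/query/pop pattern with the raw CUDA driver API. It is an illustration only: the device ordinal, the allocation size, the use of cuPointerGetAttribute(CU_POINTER_ATTRIBUTE_CONTEXT) to find the owning context, and the minimal error handling are assumptions for the sketch and may differ in detail from what cuda_copy_md.c does internally.

/* Sketch: query the base/length of a device allocation from a thread that
 * has no current CUDA context, by temporarily pushing the context that owns
 * the pointer (error handling of the setup calls is omitted for brevity). */
#include <cuda.h>
#include <stdio.h>

int main(void)
{
    CUcontext ctx, mem_ctx, popped;
    CUdevice dev;
    CUdeviceptr ptr, base;
    size_t size;

    cuInit(0);
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);
    cuMemAlloc(&ptr, 4096);

    /* Find the context that owns the allocation. */
    cuPointerGetAttribute(&mem_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, ptr);

    /* Leave the creating context, so the calling thread has no current
     * context - the situation the md code has to handle. */
    cuCtxPopCurrent(&popped);

    /* cuMemGetAddressRange() needs the owning context to be current. */
    cuCtxPushCurrent(mem_ctx);
    if (cuMemGetAddressRange(&base, &size, ptr) == CUDA_SUCCESS) {
        printf("base 0x%llx length %zu\n", (unsigned long long)base, size);
    }
    cuCtxPopCurrent(&popped);

    cuCtxPushCurrent(ctx);
    cuMemFree(ptr);
    cuCtxDestroy(ctx);
    return 0;
}

In the series itself the owning context can be NULL for VMM allocations, which is why uct_cuda_copy_md_get_address_range() only pushes and pops when the context is non-NULL and, after patch 05, logs a range-query failure at debug level in the NULL-context case while falling back to the caller-supplied address and length.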