From 1adcb5e3cefe59b892e3c3fe8aa6d15451f76da4 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Tue, 17 Dec 2024 20:02:16 +0200 Subject: [PATCH 01/12] TEST/GTEST: Added cuda gpu switching testing. --- test/gtest/common/mem_buffer.cc | 6 ++-- test/gtest/common/mem_buffer.h | 2 +- test/gtest/ucp/test_ucp_mmap.cc | 52 +++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 4 deletions(-) diff --git a/test/gtest/common/mem_buffer.cc b/test/gtest/common/mem_buffer.cc index 5b5222b768c..7bc8afb713a 100644 --- a/test/gtest/common/mem_buffer.cc +++ b/test/gtest/common/mem_buffer.cc @@ -169,7 +169,7 @@ bool mem_buffer::is_mem_type_supported(ucs_memory_type_t mem_type) mem_types.end(); } -void mem_buffer::set_device_context() +void mem_buffer::set_device_context(int device) { static __thread bool device_set = false; @@ -179,7 +179,7 @@ void mem_buffer::set_device_context() #if HAVE_CUDA if (is_cuda_supported()) { - cudaSetDevice(0); + cudaSetDevice(device); /* need to call free as context maybe lazily initialized when calling * cudaSetDevice(0) but calling cudaFree(0) should guarantee context * creation upon return */ @@ -189,7 +189,7 @@ void mem_buffer::set_device_context() #if HAVE_ROCM if (is_rocm_supported()) { - hipSetDevice(0); + hipSetDevice(device); } #endif diff --git a/test/gtest/common/mem_buffer.h b/test/gtest/common/mem_buffer.h index 4b1c285b2b8..9c45c8466e2 100644 --- a/test/gtest/common/mem_buffer.h +++ b/test/gtest/common/mem_buffer.h @@ -86,7 +86,7 @@ class mem_buffer { static bool is_gpu_supported(); /* set device context if compiled with GPU support */ - static void set_device_context(); + static void set_device_context(int device = 0); /* returns whether ROCM device supports managed memory */ static bool is_rocm_managed_supported(); diff --git a/test/gtest/ucp/test_ucp_mmap.cc b/test/gtest/ucp/test_ucp_mmap.cc index 21b6cc3d2da..f9027258713 100644 --- a/test/gtest/ucp/test_ucp_mmap.cc +++ b/test/gtest/ucp/test_ucp_mmap.cc @@ -17,6 +17,10 @@ extern "C" { #include } +#if HAVE_CUDA +#include +#endif + #include #include @@ -1248,3 +1252,51 @@ UCS_TEST_P(test_ucp_mmap_export, export_import) { } UCP_INSTANTIATE_TEST_CASE_GPU_AWARE(test_ucp_mmap_export) + +#if HAVE_CUDA +class test_ucp_mmap_mgpu : public ucs::test { +}; + +UCS_TEST_F(test_ucp_mmap_mgpu, switch_gpu) { + if (!mem_buffer::is_mem_type_supported(UCS_MEMORY_TYPE_CUDA)) { + UCS_TEST_SKIP_R("cuda is not supported"); + } + + int num_devices; + ASSERT_EQ(cudaGetDeviceCount(&num_devices), cudaSuccess); + + if (num_devices < 2) { + UCS_TEST_SKIP_R("less than two cuda devices available"); + } + + ucs::handle config; + UCS_TEST_CREATE_HANDLE(ucp_config_t*, config, ucp_config_release, + ucp_config_read, NULL, NULL); + + ucs::handle context; + ucp_params_t params; + params.field_mask = UCP_PARAM_FIELD_FEATURES; + params.features = UCP_FEATURE_TAG; + UCS_TEST_CREATE_HANDLE(ucp_context_h, context, ucp_cleanup, ucp_init, + ¶ms, config.get()); + + int device; + ASSERT_EQ(cudaGetDevice(&device), cudaSuccess); + ASSERT_EQ(cudaSetDevice((device + 1) % num_devices), cudaSuccess); + + const size_t size = 16; + mem_buffer buffer(size, UCS_MEMORY_TYPE_CUDA); + + ASSERT_EQ(cudaSetDevice(device), cudaSuccess); + + ucp_mem_map_params_t mem_map_params; + mem_map_params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | + UCP_MEM_MAP_PARAM_FIELD_LENGTH; + mem_map_params.address = buffer.ptr(); + mem_map_params.length = size; + + ucp_mem_h ucp_mem; + ASSERT_EQ(ucp_mem_map(context.get(), &mem_map_params, &ucp_mem), UCS_OK); + 
EXPECT_EQ(ucp_mem_unmap(context.get(), ucp_mem), UCS_OK); +} +#endif From 18daf411b29a4872e9e025045e4ae3a2a71adf47 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Fri, 7 Feb 2025 19:14:53 +0200 Subject: [PATCH 02/12] UCT/CUDA/CUDA_COPY: push cuda context before getting address range. --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index a185dde3779..946b2669245 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -643,8 +643,13 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, goto out_default_range; } + /* GetAddressRange requires context to be set. On DGXA100 it takes 0.03 us + * to push and pop the context associated with address (which should be + * non-NULL if we are at this point). */ + cuCtxPushCurrent(cuda_mem_ctx); cu_err = cuMemGetAddressRange(&base_address, &alloc_length, (CUdeviceptr)address); + cuCtxPopCurrent(&cuda_mem_ctx); if (cu_err != CUDA_SUCCESS) { ucs_error("cuMemGetAddressRange(%p) error: %s", address, uct_cuda_base_cu_get_error_string(cu_err)); From 276df75c3f54ba2e49f36d8d07808c92ba9286b8 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Tue, 11 Feb 2025 21:16:04 +0200 Subject: [PATCH 03/12] UCT/CUDA/CUDA_COPY: WA for vmm. --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 88 ++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 16 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 946b2669245..982cb6d4ecf 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -533,6 +533,69 @@ uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md, const void *address) #endif } +/** + * Get information on memory allocations. + * + * @param [in] address Pointer to the memory allocation to query + * @param [in] length Size of the allocation + * @param [in] ctx CUDA context on which a pointer was allocated. + * NULL in case of VMM + * @param [out] base_address_p Returned base address + * @param [out] alloc_length_p Returned size of the memory allocation + * + * @return Error code as defined by @ref ucs_status_t. + */ +static ucs_status_t +uct_cuda_copy_md_get_address_range(const void *address, size_t length, + CUcontext ctx, void **base_address_p, + size_t *alloc_length_p) +{ + ucs_log_level_t log_level = (ctx != NULL) ? UCS_LOG_LEVEL_DEBUG : + UCS_LOG_LEVEL_ERROR; + ucs_status_t status; + CUdeviceptr base; + size_t size; + ucs_status_t status_ctx_pop; + CUcontext popped_ctx; + + if (ctx != NULL) { + /* GetAddressRange requires context to be set. On DGXA100 it takes + * 0.03us to push and pop the context associated with address (which + * should be non-NULL if we are at this point). 
*/ + status = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPushCurrent(ctx)); + if (status != UCS_OK) { + return status; + } + } + + status = UCT_CUDADRV_FUNC(cuMemGetAddressRange(&base, &size, + (CUdeviceptr)address), + log_level); + if (ctx != NULL) { + status_ctx_pop = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPopCurrent(&popped_ctx)); + if (status_ctx_pop != UCS_OK) { + return status_ctx_pop; + } + } + + if ((status != UCS_OK) && (ctx != NULL)) { + /* cuMemGetAddressRange failed after pushing non-NULL context */ + return UCS_ERR_INVALID_ADDR; + } + + if (status == UCS_OK) { + *base_address_p = (void*)base; + *alloc_length_p = size; + } else { + /* Use default values when cuMemGetAddressRange failed without pushing + * non-NULL context */ + *base_address_p = address; + *alloc_length_p = length; + } + + return UCS_OK; +} + static ucs_status_t uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, size_t length, ucs_memory_info_t *mem_info) @@ -544,11 +607,11 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, CUcontext cuda_mem_ctx = NULL; CUpointer_attribute attr_type[UCT_CUDA_MEM_QUERY_NUM_ATTRS]; void *attr_data[UCT_CUDA_MEM_QUERY_NUM_ATTRS]; - CUdeviceptr base_address; + void *base_address; size_t alloc_length; size_t total_bytes; int32_t pref_loc; - unsigned is_vmm; + int is_vmm; CUresult cu_err; ucs_status_t status; @@ -643,21 +706,14 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, goto out_default_range; } - /* GetAddressRange requires context to be set. On DGXA100 it takes 0.03 us - * to push and pop the context associated with address (which should be - * non-NULL if we are at this point). */ - cuCtxPushCurrent(cuda_mem_ctx); - cu_err = cuMemGetAddressRange(&base_address, &alloc_length, - (CUdeviceptr)address); - cuCtxPopCurrent(&cuda_mem_ctx); - if (cu_err != CUDA_SUCCESS) { - ucs_error("cuMemGetAddressRange(%p) error: %s", address, - uct_cuda_base_cu_get_error_string(cu_err)); - return UCS_ERR_INVALID_ADDR; + status = uct_cuda_copy_md_get_address_range(address, length, cuda_mem_ctx, + &base_address, &alloc_length); + if (status != UCS_OK) { + return status; } - ucs_trace("query address %p: 0x%llx..0x%llx length %zu", address, - base_address, base_address + alloc_length, alloc_length); + ucs_trace("query address %p: %p..%p length %zu", address, base_address, + UCS_PTR_BYTE_OFFSET(base_address, alloc_length), alloc_length); if (md->config.alloc_whole_reg == UCS_CONFIG_AUTO) { total_bytes = uct_cuda_copy_md_get_total_device_mem(cuda_device); @@ -668,7 +724,7 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, ucs_assert(md->config.alloc_whole_reg == UCS_CONFIG_ON); } - mem_info->base_address = (void*)base_address; + mem_info->base_address = base_address; mem_info->alloc_length = alloc_length; return UCS_OK; From 6d38f73048838c90fee9ffa8e468ace517262c51 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Tue, 11 Feb 2025 21:32:34 +0200 Subject: [PATCH 04/12] UCT/CUDA/CUDA_COPY: Fixed compilation warning. 
--- src/uct/cuda/cuda_copy/cuda_copy_md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index a18a5b27f1e..3c087520673 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -627,7 +627,7 @@ uct_cuda_copy_md_get_address_range(const void *address, size_t length, } else { /* Use default values when cuMemGetAddressRange failed without pushing * non-NULL context */ - *base_address_p = address; + *base_address_p = (void*)address; *alloc_length_p = length; } From 1a9871fc4741c3a1325c3c53d263bc57ea770aab Mon Sep 17 00:00:00 2001 From: rakhmets Date: Wed, 12 Feb 2025 17:02:41 +0200 Subject: [PATCH 05/12] UCT/CUDA/CUDA_COPY: Addressed review comments. --- src/uct/cuda/cuda_copy/cuda_copy_md.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 3c087520673..454c4597308 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -588,7 +588,7 @@ uct_cuda_copy_md_get_address_range(const void *address, size_t length, CUcontext ctx, void **base_address_p, size_t *alloc_length_p) { - ucs_log_level_t log_level = (ctx != NULL) ? UCS_LOG_LEVEL_DEBUG : + ucs_log_level_t log_level = (ctx == NULL) ? UCS_LOG_LEVEL_DEBUG : UCS_LOG_LEVEL_ERROR; ucs_status_t status; CUdeviceptr base; @@ -598,8 +598,7 @@ uct_cuda_copy_md_get_address_range(const void *address, size_t length, if (ctx != NULL) { /* GetAddressRange requires context to be set. On DGXA100 it takes - * 0.03us to push and pop the context associated with address (which - * should be non-NULL if we are at this point). */ + * 0.03us to push and pop the context associated with address. */ status = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPushCurrent(ctx)); if (status != UCS_OK) { return status; @@ -611,16 +610,16 @@ uct_cuda_copy_md_get_address_range(const void *address, size_t length, log_level); if (ctx != NULL) { status_ctx_pop = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPopCurrent(&popped_ctx)); + if (status != UCS_OK) { + /* cuMemGetAddressRange failed after pushing non-NULL context */ + return UCS_ERR_INVALID_ADDR; + } + if (status_ctx_pop != UCS_OK) { return status_ctx_pop; } } - if ((status != UCS_OK) && (ctx != NULL)) { - /* cuMemGetAddressRange failed after pushing non-NULL context */ - return UCS_ERR_INVALID_ADDR; - } - if (status == UCS_OK) { *base_address_p = (void*)base; *alloc_length_p = size; From 0858b3d56b98903d2a86f6a88d49c3fcf44fe3bf Mon Sep 17 00:00:00 2001 From: rakhmets Date: Thu, 13 Feb 2025 16:35:33 +0200 Subject: [PATCH 06/12] TEST/GTEST/UCT/CUDA: Added UCT test. 
--- test/gtest/Makefile.am | 1 + test/gtest/uct/cuda/test_cuda_copy_md.cc | 41 ++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 test/gtest/uct/cuda/test_cuda_copy_md.cc diff --git a/test/gtest/Makefile.am b/test/gtest/Makefile.am index a2f265fcf07..68a1ed353bc 100644 --- a/test/gtest/Makefile.am +++ b/test/gtest/Makefile.am @@ -260,6 +260,7 @@ if HAVE_CUDA gtest_SOURCES += \ common/cuda_context.cc \ ucm/cuda_hooks.cc \ + uct/cuda/test_cuda_copy_md.cc \ uct/cuda/test_cuda_ipc_md.cc gtest_CPPFLAGS += \ $(CUDA_CPPFLAGS) diff --git a/test/gtest/uct/cuda/test_cuda_copy_md.cc b/test/gtest/uct/cuda/test_cuda_copy_md.cc new file mode 100644 index 00000000000..e22b329c833 --- /dev/null +++ b/test/gtest/uct/cuda/test_cuda_copy_md.cc @@ -0,0 +1,41 @@ +/** + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#include + +#include + +class test_cuda_copy_md : public test_md { +}; + +UCS_TEST_P(test_cuda_copy_md, switch_gpu) { + if (!mem_buffer::is_mem_type_supported(UCS_MEMORY_TYPE_CUDA)) { + UCS_TEST_SKIP_R("cuda is not supported"); + } + + int num_devices; + ASSERT_EQ(cudaGetDeviceCount(&num_devices), cudaSuccess); + + if (num_devices < 2) { + UCS_TEST_SKIP_R("less than two cuda devices available"); + } + + int device; + ASSERT_EQ(cudaGetDevice(&device), cudaSuccess); + ASSERT_EQ(cudaSetDevice((device + 1) % num_devices), cudaSuccess); + + const size_t size = 16; + mem_buffer buffer(size, UCS_MEMORY_TYPE_CUDA); + + ASSERT_EQ(cudaSetDevice(device), cudaSuccess); + + ucs_memory_type_t mem_type; + ASSERT_EQ(uct_md_detect_memory_type(m_md, buffer.ptr(), size, &mem_type), + UCS_OK); + EXPECT_EQ(mem_type, UCS_MEMORY_TYPE_CUDA); +} + +_UCT_MD_INSTANTIATE_TEST_CASE(test_cuda_copy_md, cuda_cpy); From 0d57b51f4be90d6eae89c1447885b7f846a9f601 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Thu, 13 Feb 2025 17:55:53 +0200 Subject: [PATCH 07/12] GTEST/UCT/CUDA: Updated test. 
--- test/gtest/uct/cuda/test_cuda_copy_md.cc | 44 ++++++++++++++++-------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/test/gtest/uct/cuda/test_cuda_copy_md.cc b/test/gtest/uct/cuda/test_cuda_copy_md.cc index e22b329c833..e3a5c185fcf 100644 --- a/test/gtest/uct/cuda/test_cuda_copy_md.cc +++ b/test/gtest/uct/cuda/test_cuda_copy_md.cc @@ -6,16 +6,17 @@ #include +#include #include -class test_cuda_copy_md : public test_md { -}; +extern "C" { +#include +} -UCS_TEST_P(test_cuda_copy_md, switch_gpu) { - if (!mem_buffer::is_mem_type_supported(UCS_MEMORY_TYPE_CUDA)) { - UCS_TEST_SKIP_R("cuda is not supported"); - } +class test_cuda_copy_md_multi_gpu : public test_md { +}; +UCS_TEST_P(test_cuda_copy_md_multi_gpu, query_mem_type) { int num_devices; ASSERT_EQ(cudaGetDeviceCount(&num_devices), cudaSuccess); @@ -27,15 +28,28 @@ UCS_TEST_P(test_cuda_copy_md, switch_gpu) { ASSERT_EQ(cudaGetDevice(&device), cudaSuccess); ASSERT_EQ(cudaSetDevice((device + 1) % num_devices), cudaSuccess); - const size_t size = 16; - mem_buffer buffer(size, UCS_MEMORY_TYPE_CUDA); - - ASSERT_EQ(cudaSetDevice(device), cudaSuccess); - - ucs_memory_type_t mem_type; - ASSERT_EQ(uct_md_detect_memory_type(m_md, buffer.ptr(), size, &mem_type), + const int size = 16; + uct_alloc_method_t method = UCT_ALLOC_METHOD_MD; + uct_md_h md = m_md.get(); + uct_mem_alloc_params_t params; + params.field_mask = UCT_MEM_ALLOC_PARAM_FIELD_MEM_TYPE | + UCT_MEM_ALLOC_PARAM_FIELD_MDS | + UCT_MEM_ALLOC_PARAM_FIELD_NAME; + params.name = "test_cuda_copy_md_multi_gpu"; + params.mem_type = UCS_MEMORY_TYPE_CUDA; + params.mds.mds = &md; + params.mds.count = 1; + uct_allocated_memory_t mem; + ASSERT_EQ(uct_mem_alloc(size, &method, 1, ¶ms, &mem), UCS_OK); + + EXPECT_EQ(cudaSetDevice(device), cudaSuccess); + + uct_md_mem_attr_t mem_attr = {}; + mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE; + EXPECT_EQ(uct_md_mem_query(md, mem.address, size, &mem_attr), UCS_OK); - EXPECT_EQ(mem_type, UCS_MEMORY_TYPE_CUDA); + EXPECT_EQ(mem_attr.mem_type, UCS_MEMORY_TYPE_CUDA); + EXPECT_EQ(uct_mem_free(&mem), UCS_OK); } -_UCT_MD_INSTANTIATE_TEST_CASE(test_cuda_copy_md, cuda_cpy); +_UCT_MD_INSTANTIATE_TEST_CASE(test_cuda_copy_md_multi_gpu, cuda_cpy); From 7141389e1bc64392c295a9d8a018e39c701377f0 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Thu, 13 Feb 2025 18:07:38 +0200 Subject: [PATCH 08/12] GTEST/UCT/CUDA: Updated test. 
--- test/gtest/uct/cuda/test_cuda_copy_md.cc | 45 +++++++++++++----------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/test/gtest/uct/cuda/test_cuda_copy_md.cc b/test/gtest/uct/cuda/test_cuda_copy_md.cc index e3a5c185fcf..3f80d6328cd 100644 --- a/test/gtest/uct/cuda/test_cuda_copy_md.cc +++ b/test/gtest/uct/cuda/test_cuda_copy_md.cc @@ -6,17 +6,33 @@ #include -#include #include -extern "C" { -#include -} - class test_cuda_copy_md_multi_gpu : public test_md { +public: + uct_allocated_memory_t mem_alloc(size_t size) const; }; -UCS_TEST_P(test_cuda_copy_md_multi_gpu, query_mem_type) { +uct_allocated_memory_t test_cuda_copy_md_multi_gpu::mem_alloc(size_t size) const +{ + uct_alloc_method_t method = UCT_ALLOC_METHOD_MD; + uct_md_h md = m_md.get(); + + uct_mem_alloc_params_t params; + params.field_mask = UCT_MEM_ALLOC_PARAM_FIELD_MEM_TYPE | + UCT_MEM_ALLOC_PARAM_FIELD_MDS | + UCT_MEM_ALLOC_PARAM_FIELD_NAME; + params.mem_type = UCS_MEMORY_TYPE_CUDA; + params.mds.mds = &md; + params.mds.count = 1; + params.name = "test_cuda_copy_md_multi_gpu"; + + uct_allocated_memory_t mem; + EXPECT_EQ(uct_mem_alloc(size, &method, 1, ¶ms, &mem), UCS_OK); + return mem; +} + +UCS_TEST_P(test_cuda_copy_md_multi_gpu, mem_query) { int num_devices; ASSERT_EQ(cudaGetDeviceCount(&num_devices), cudaSuccess); @@ -28,25 +44,14 @@ UCS_TEST_P(test_cuda_copy_md_multi_gpu, query_mem_type) { ASSERT_EQ(cudaGetDevice(&device), cudaSuccess); ASSERT_EQ(cudaSetDevice((device + 1) % num_devices), cudaSuccess); - const int size = 16; - uct_alloc_method_t method = UCT_ALLOC_METHOD_MD; - uct_md_h md = m_md.get(); - uct_mem_alloc_params_t params; - params.field_mask = UCT_MEM_ALLOC_PARAM_FIELD_MEM_TYPE | - UCT_MEM_ALLOC_PARAM_FIELD_MDS | - UCT_MEM_ALLOC_PARAM_FIELD_NAME; - params.name = "test_cuda_copy_md_multi_gpu"; - params.mem_type = UCS_MEMORY_TYPE_CUDA; - params.mds.mds = &md; - params.mds.count = 1; - uct_allocated_memory_t mem; - ASSERT_EQ(uct_mem_alloc(size, &method, 1, ¶ms, &mem), UCS_OK); + const int size = 16; + auto mem = mem_alloc(size); EXPECT_EQ(cudaSetDevice(device), cudaSuccess); uct_md_mem_attr_t mem_attr = {}; mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE; - EXPECT_EQ(uct_md_mem_query(md, mem.address, size, &mem_attr), + EXPECT_EQ(uct_md_mem_query(m_md.get(), mem.address, size, &mem_attr), UCS_OK); EXPECT_EQ(mem_attr.mem_type, UCS_MEMORY_TYPE_CUDA); EXPECT_EQ(uct_mem_free(&mem), UCS_OK); From d709e1118635b30d81de10f7c75712e4a3519729 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Thu, 13 Feb 2025 18:16:40 +0200 Subject: [PATCH 09/12] TEST/GTEST: Added cuda gpu switching testing. Reverted. This reverts commit 1adcb5e3cefe59b892e3c3fe8aa6d15451f76da4. 
--- test/gtest/common/mem_buffer.cc | 6 ++-- test/gtest/common/mem_buffer.h | 2 +- test/gtest/ucp/test_ucp_mmap.cc | 52 --------------------------------- 3 files changed, 4 insertions(+), 56 deletions(-) diff --git a/test/gtest/common/mem_buffer.cc b/test/gtest/common/mem_buffer.cc index 7bc8afb713a..5b5222b768c 100644 --- a/test/gtest/common/mem_buffer.cc +++ b/test/gtest/common/mem_buffer.cc @@ -169,7 +169,7 @@ bool mem_buffer::is_mem_type_supported(ucs_memory_type_t mem_type) mem_types.end(); } -void mem_buffer::set_device_context(int device) +void mem_buffer::set_device_context() { static __thread bool device_set = false; @@ -179,7 +179,7 @@ void mem_buffer::set_device_context(int device) #if HAVE_CUDA if (is_cuda_supported()) { - cudaSetDevice(device); + cudaSetDevice(0); /* need to call free as context maybe lazily initialized when calling * cudaSetDevice(0) but calling cudaFree(0) should guarantee context * creation upon return */ @@ -189,7 +189,7 @@ void mem_buffer::set_device_context(int device) #if HAVE_ROCM if (is_rocm_supported()) { - hipSetDevice(device); + hipSetDevice(0); } #endif diff --git a/test/gtest/common/mem_buffer.h b/test/gtest/common/mem_buffer.h index 9c45c8466e2..4b1c285b2b8 100644 --- a/test/gtest/common/mem_buffer.h +++ b/test/gtest/common/mem_buffer.h @@ -86,7 +86,7 @@ class mem_buffer { static bool is_gpu_supported(); /* set device context if compiled with GPU support */ - static void set_device_context(int device = 0); + static void set_device_context(); /* returns whether ROCM device supports managed memory */ static bool is_rocm_managed_supported(); diff --git a/test/gtest/ucp/test_ucp_mmap.cc b/test/gtest/ucp/test_ucp_mmap.cc index f9027258713..21b6cc3d2da 100644 --- a/test/gtest/ucp/test_ucp_mmap.cc +++ b/test/gtest/ucp/test_ucp_mmap.cc @@ -17,10 +17,6 @@ extern "C" { #include } -#if HAVE_CUDA -#include -#endif - #include #include @@ -1252,51 +1248,3 @@ UCS_TEST_P(test_ucp_mmap_export, export_import) { } UCP_INSTANTIATE_TEST_CASE_GPU_AWARE(test_ucp_mmap_export) - -#if HAVE_CUDA -class test_ucp_mmap_mgpu : public ucs::test { -}; - -UCS_TEST_F(test_ucp_mmap_mgpu, switch_gpu) { - if (!mem_buffer::is_mem_type_supported(UCS_MEMORY_TYPE_CUDA)) { - UCS_TEST_SKIP_R("cuda is not supported"); - } - - int num_devices; - ASSERT_EQ(cudaGetDeviceCount(&num_devices), cudaSuccess); - - if (num_devices < 2) { - UCS_TEST_SKIP_R("less than two cuda devices available"); - } - - ucs::handle config; - UCS_TEST_CREATE_HANDLE(ucp_config_t*, config, ucp_config_release, - ucp_config_read, NULL, NULL); - - ucs::handle context; - ucp_params_t params; - params.field_mask = UCP_PARAM_FIELD_FEATURES; - params.features = UCP_FEATURE_TAG; - UCS_TEST_CREATE_HANDLE(ucp_context_h, context, ucp_cleanup, ucp_init, - ¶ms, config.get()); - - int device; - ASSERT_EQ(cudaGetDevice(&device), cudaSuccess); - ASSERT_EQ(cudaSetDevice((device + 1) % num_devices), cudaSuccess); - - const size_t size = 16; - mem_buffer buffer(size, UCS_MEMORY_TYPE_CUDA); - - ASSERT_EQ(cudaSetDevice(device), cudaSuccess); - - ucp_mem_map_params_t mem_map_params; - mem_map_params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | - UCP_MEM_MAP_PARAM_FIELD_LENGTH; - mem_map_params.address = buffer.ptr(); - mem_map_params.length = size; - - ucp_mem_h ucp_mem; - ASSERT_EQ(ucp_mem_map(context.get(), &mem_map_params, &ucp_mem), UCS_OK); - EXPECT_EQ(ucp_mem_unmap(context.get(), ucp_mem), UCS_OK); -} -#endif From e5f9f9e9c461a5160695481daa194749a5a22068 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Thu, 13 Feb 2025 18:38:51 +0200 
Subject: [PATCH 10/12] GTEST/UCT/CUDA: Fixed code format issues. --- test/gtest/uct/cuda/test_cuda_copy_md.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/gtest/uct/cuda/test_cuda_copy_md.cc b/test/gtest/uct/cuda/test_cuda_copy_md.cc index 3f80d6328cd..35ddd031bdd 100644 --- a/test/gtest/uct/cuda/test_cuda_copy_md.cc +++ b/test/gtest/uct/cuda/test_cuda_copy_md.cc @@ -20,7 +20,7 @@ uct_allocated_memory_t test_cuda_copy_md_multi_gpu::mem_alloc(size_t size) const uct_mem_alloc_params_t params; params.field_mask = UCT_MEM_ALLOC_PARAM_FIELD_MEM_TYPE | - UCT_MEM_ALLOC_PARAM_FIELD_MDS | + UCT_MEM_ALLOC_PARAM_FIELD_MDS | UCT_MEM_ALLOC_PARAM_FIELD_NAME; params.mem_type = UCS_MEMORY_TYPE_CUDA; params.mds.mds = &md; @@ -50,7 +50,7 @@ UCS_TEST_P(test_cuda_copy_md_multi_gpu, mem_query) { EXPECT_EQ(cudaSetDevice(device), cudaSuccess); uct_md_mem_attr_t mem_attr = {}; - mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE; + mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE; EXPECT_EQ(uct_md_mem_query(m_md.get(), mem.address, size, &mem_attr), UCS_OK); EXPECT_EQ(mem_attr.mem_type, UCS_MEMORY_TYPE_CUDA); From 20bdcbb41ec339c85af927a03cab9b65b279809d Mon Sep 17 00:00:00 2001 From: rakhmets Date: Fri, 14 Feb 2025 13:22:04 +0200 Subject: [PATCH 11/12] GTEST/UCT/CUDA: Fixed type. --- test/gtest/uct/cuda/test_cuda_copy_md.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/gtest/uct/cuda/test_cuda_copy_md.cc b/test/gtest/uct/cuda/test_cuda_copy_md.cc index 35ddd031bdd..b230038c401 100644 --- a/test/gtest/uct/cuda/test_cuda_copy_md.cc +++ b/test/gtest/uct/cuda/test_cuda_copy_md.cc @@ -44,8 +44,8 @@ UCS_TEST_P(test_cuda_copy_md_multi_gpu, mem_query) { ASSERT_EQ(cudaGetDevice(&device), cudaSuccess); ASSERT_EQ(cudaSetDevice((device + 1) % num_devices), cudaSuccess); - const int size = 16; - auto mem = mem_alloc(size); + const size_t size = 16; + auto mem = mem_alloc(size); EXPECT_EQ(cudaSetDevice(device), cudaSuccess); From 578d42a1e9f7afb56adaa36b26f5f10310c5ded7 Mon Sep 17 00:00:00 2001 From: rakhmets Date: Fri, 14 Feb 2025 18:45:32 +0200 Subject: [PATCH 12/12] BUILDLIB: Removed setting visible device from load_cuda_env. --- buildlib/az-helpers.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/buildlib/az-helpers.sh b/buildlib/az-helpers.sh index d53d876ffd0..fc7aeead0a8 100644 --- a/buildlib/az-helpers.sh +++ b/buildlib/az-helpers.sh @@ -197,12 +197,6 @@ try_load_cuda_env() { then az_module_load dev/gdrcopy2.4.1_cuda12.5.1 && have_gdrcopy=yes fi - - # Set CUDA_VISIBLE_DEVICES - if [ -n "${worker}" ] - then - export CUDA_VISIBLE_DEVICES=$((worker % num_gpus)) - fi } load_cuda_env() {
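
Note on the pattern the series is built around (not part of the patches themselves): patches 02-05 all deal with the fact that cuMemGetAddressRange() only works when the CUDA context that owns the queried pointer is current, so uct_cuda_copy_md pushes that context before the call and pops it afterwards. The standalone sketch below illustrates that push/query/pop pattern with the raw CUDA driver API. It is an illustration only: the device ordinal, the allocation size, the use of cuPointerGetAttribute(CU_POINTER_ATTRIBUTE_CONTEXT) to find the owning context, and the minimal error handling are assumptions for the sketch and may differ in detail from what cuda_copy_md.c does internally.

/* Sketch: query the base/length of a device allocation from a thread that
 * has no current CUDA context, by temporarily pushing the context that owns
 * the pointer (error handling of the setup calls is omitted for brevity). */
#include <cuda.h>
#include <stdio.h>

int main(void)
{
    CUcontext ctx, mem_ctx, popped;
    CUdevice dev;
    CUdeviceptr ptr, base;
    size_t size;

    cuInit(0);
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);
    cuMemAlloc(&ptr, 4096);

    /* Find the context that owns the allocation. */
    cuPointerGetAttribute(&mem_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, ptr);

    /* Leave the creating context, so the calling thread has no current
     * context - the situation the md code has to handle. */
    cuCtxPopCurrent(&popped);

    /* cuMemGetAddressRange() needs the owning context to be current. */
    cuCtxPushCurrent(mem_ctx);
    if (cuMemGetAddressRange(&base, &size, ptr) == CUDA_SUCCESS) {
        printf("base 0x%llx length %zu\n", (unsigned long long)base, size);
    }
    cuCtxPopCurrent(&popped);

    cuCtxPushCurrent(ctx);
    cuMemFree(ptr);
    cuCtxDestroy(ctx);
    return 0;
}

In the series itself the owning context can be NULL for VMM allocations, which is why uct_cuda_copy_md_get_address_range() only pushes and pops when the context is non-NULL and, after patch 05, logs a range-query failure at debug level in the NULL-context case while falling back to the caller-supplied address and length.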