diff --git a/buildlib/az-helpers.sh b/buildlib/az-helpers.sh index d53d876ffd0..fc7aeead0a8 100644 --- a/buildlib/az-helpers.sh +++ b/buildlib/az-helpers.sh @@ -197,12 +197,6 @@ try_load_cuda_env() { then az_module_load dev/gdrcopy2.4.1_cuda12.5.1 && have_gdrcopy=yes fi - - # Set CUDA_VISIBLE_DEVICES - if [ -n "${worker}" ] - then - export CUDA_VISIBLE_DEVICES=$((worker % num_gpus)) - fi } load_cuda_env() { diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index df46630726a..454c4597308 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -571,6 +571,68 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device) return 1; /* return 1 byte to avoid division by zero */ } +/** + * Get the base address and size of the allocation that contains an address. + * + * @param [in] address Address inside the memory allocation to query + * @param [in] length Fallback size if the query fails with NULL ctx + * @param [in] ctx CUDA context on which a pointer was allocated. + * NULL in case of VMM + * @param [out] base_address_p Returned base address (address on fallback) + * @param [out] alloc_length_p Returned allocation size (length on fallback) + * + * @return UCS_OK on success or NULL-ctx fallback, otherwise an error code. + */ +static ucs_status_t +uct_cuda_copy_md_get_address_range(const void *address, size_t length, + CUcontext ctx, void **base_address_p, + size_t *alloc_length_p) +{ + ucs_log_level_t log_level = (ctx == NULL) ? UCS_LOG_LEVEL_DEBUG : + UCS_LOG_LEVEL_ERROR; + ucs_status_t status; + CUdeviceptr base; + size_t size; + ucs_status_t status_ctx_pop; + CUcontext popped_ctx; + + if (ctx != NULL) { + /* GetAddressRange requires context to be set. On DGXA100 it takes + * 0.03us to push and pop the context associated with address.
*/ + status = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPushCurrent(ctx)); + if (status != UCS_OK) { + return status; + } + } + + status = UCT_CUDADRV_FUNC(cuMemGetAddressRange(&base, &size, + (CUdeviceptr)address), + log_level); + if (ctx != NULL) { + status_ctx_pop = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPopCurrent(&popped_ctx)); + if (status != UCS_OK) { + /* The range query failed although the allocation context was pushed */ + return UCS_ERR_INVALID_ADDR; + } + + if (status_ctx_pop != UCS_OK) { + return status_ctx_pop; + } + } + + if (status == UCS_OK) { + *base_address_p = (void*)base; + *alloc_length_p = size; + } else { + /* cuMemGetAddressRange failed with NULL ctx (e.g. VMM): fall back to + * the queried address and length */ + *base_address_p = (void*)address; + *alloc_length_p = length; + } + + return UCS_OK; +} + static ucs_status_t uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, size_t length, ucs_memory_info_t *mem_info) @@ -582,11 +644,11 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, CUcontext cuda_mem_ctx = NULL; CUpointer_attribute attr_type[UCT_CUDA_MEM_QUERY_NUM_ATTRS]; void *attr_data[UCT_CUDA_MEM_QUERY_NUM_ATTRS]; - CUdeviceptr base_address; + void *base_address; size_t alloc_length; size_t total_bytes; int32_t pref_loc; - unsigned is_vmm; + int is_vmm; CUresult cu_err; ucs_status_t status; @@ -681,16 +743,14 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, goto out_default_range; } - cu_err = cuMemGetAddressRange(&base_address, &alloc_length, - (CUdeviceptr)address); - if (cu_err != CUDA_SUCCESS) { - ucs_error("cuMemGetAddressRange(%p) error: %s", address, - uct_cuda_base_cu_get_error_string(cu_err)); - return UCS_ERR_INVALID_ADDR; + status = uct_cuda_copy_md_get_address_range(address, length, cuda_mem_ctx, + &base_address, &alloc_length); + if (status != UCS_OK) { + return status; } - ucs_trace("query address %p: 0x%llx..0x%llx length %zu", address, - base_address, base_address +
alloc_length, alloc_length); + ucs_trace("query address %p: %p..%p length %zu", address, base_address, + UCS_PTR_BYTE_OFFSET(base_address, alloc_length), alloc_length); if (md->config.alloc_whole_reg == UCS_CONFIG_AUTO) { total_bytes = uct_cuda_copy_md_get_total_device_mem(cuda_device); @@ -701,7 +761,7 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address, ucs_assert(md->config.alloc_whole_reg == UCS_CONFIG_ON); } - mem_info->base_address = (void*)base_address; + mem_info->base_address = base_address; mem_info->alloc_length = alloc_length; return UCS_OK; diff --git a/test/gtest/Makefile.am b/test/gtest/Makefile.am index a2f265fcf07..68a1ed353bc 100644 --- a/test/gtest/Makefile.am +++ b/test/gtest/Makefile.am @@ -260,6 +260,7 @@ if HAVE_CUDA gtest_SOURCES += \ common/cuda_context.cc \ ucm/cuda_hooks.cc \ + uct/cuda/test_cuda_copy_md.cc \ uct/cuda/test_cuda_ipc_md.cc gtest_CPPFLAGS += \ $(CUDA_CPPFLAGS) diff --git a/test/gtest/uct/cuda/test_cuda_copy_md.cc b/test/gtest/uct/cuda/test_cuda_copy_md.cc new file mode 100644 index 00000000000..b230038c401 --- /dev/null +++ b/test/gtest/uct/cuda/test_cuda_copy_md.cc @@ -0,0 +1,60 @@ +/** + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */
+
+#include <uct/test_md.h>
+#include <cuda_runtime.h>
+
+/* Fixture for cuda_copy MD tests that need at least two CUDA devices */
+class test_cuda_copy_md_multi_gpu : public test_md {
+public:
+    uct_allocated_memory_t mem_alloc(size_t size) const;
+};
+
+/* Allocate CUDA memory from the tested MD (mem is zeroed beforehand) */
+uct_allocated_memory_t test_cuda_copy_md_multi_gpu::mem_alloc(size_t size) const
+{
+    uct_alloc_method_t method = UCT_ALLOC_METHOD_MD;
+    uct_md_h md = m_md.get();
+    uct_mem_alloc_params_t params;
+    params.field_mask = UCT_MEM_ALLOC_PARAM_FIELD_MEM_TYPE |
+                        UCT_MEM_ALLOC_PARAM_FIELD_MDS |
+                        UCT_MEM_ALLOC_PARAM_FIELD_NAME;
+    params.mem_type = UCS_MEMORY_TYPE_CUDA;
+    params.mds.mds = &md;
+    params.mds.count = 1;
+    params.name = "test_cuda_copy_md_multi_gpu";
+
+    uct_allocated_memory_t mem = {};
+    EXPECT_EQ(uct_mem_alloc(size, &method, 1, &params, &mem), UCS_OK);
+    return mem;
+}
+
+/* Query memory allocated while a different CUDA device was current */
+UCS_TEST_P(test_cuda_copy_md_multi_gpu, mem_query) {
+    int num_devices;
+    ASSERT_EQ(cudaGetDeviceCount(&num_devices), cudaSuccess);
+
+    if (num_devices < 2) {
+        UCS_TEST_SKIP_R("less than two cuda devices available");
+    }
+
+    int device;
+    ASSERT_EQ(cudaGetDevice(&device), cudaSuccess);
+    ASSERT_EQ(cudaSetDevice((device + 1) % num_devices), cudaSuccess);
+    const size_t size = 16;
+    auto mem = mem_alloc(size);
+    EXPECT_EQ(cudaSetDevice(device), cudaSuccess);
+    ASSERT_TRUE(mem.address != NULL);
+
+    uct_md_mem_attr_t mem_attr = {};
+    mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE;
+    EXPECT_EQ(uct_md_mem_query(m_md.get(), mem.address, size, &mem_attr),
+              UCS_OK);
+    EXPECT_EQ(mem_attr.mem_type, UCS_MEMORY_TYPE_CUDA);
+    EXPECT_EQ(uct_mem_free(&mem), UCS_OK);
+}
+
+_UCT_MD_INSTANTIATE_TEST_CASE(test_cuda_copy_md_multi_gpu, cuda_cpy);