UCT/CUDA/CUDA_COPY: Enabled memory attributes query after switching CUDA GPU. #10388

Open · wants to merge 13 commits into base: master
6 changes: 0 additions & 6 deletions buildlib/az-helpers.sh
@@ -197,12 +197,6 @@ try_load_cuda_env() {
     then
         az_module_load dev/gdrcopy2.4.1_cuda12.5.1 && have_gdrcopy=yes
     fi
-
-    # Set CUDA_VISIBLE_DEVICES
-    if [ -n "${worker}" ]
-    then
-        export CUDA_VISIBLE_DEVICES=$((worker % num_gpus))
-    fi
 }
 
 load_cuda_env() {
82 changes: 71 additions & 11 deletions src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -571,6 +571,68 @@ static size_t uct_cuda_copy_md_get_total_device_mem(CUdevice cuda_device)
return 1; /* return 1 byte to avoid division by zero */
}

/**
* Get information on memory allocations.
*
* @param [in] address Pointer to the memory allocation to query
* @param [in] length Size of the allocation
* @param [in] ctx CUDA context on which a pointer was allocated.
* NULL in case of VMM
* @param [out] base_address_p Returned base address
* @param [out] alloc_length_p Returned size of the memory allocation
*
* @return Error code as defined by @ref ucs_status_t.
*/
static ucs_status_t
uct_cuda_copy_md_get_address_range(const void *address, size_t length,
CUcontext ctx, void **base_address_p,
size_t *alloc_length_p)
{
ucs_log_level_t log_level = (ctx == NULL) ? UCS_LOG_LEVEL_DEBUG :
UCS_LOG_LEVEL_ERROR;
ucs_status_t status;
CUdeviceptr base;
size_t size;
ucs_status_t status_ctx_pop;
CUcontext popped_ctx;

if (ctx != NULL) {
/* GetAddressRange requires context to be set. On DGXA100 it takes
* 0.03us to push and pop the context associated with address. */
status = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPushCurrent(ctx));
if (status != UCS_OK) {
return status;
}
}

status = UCT_CUDADRV_FUNC(cuMemGetAddressRange(&base, &size,
(CUdeviceptr)address),
log_level);
if (ctx != NULL) {
status_ctx_pop = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPopCurrent(&popped_ctx));
if (status != UCS_OK) {
/* cuMemGetAddressRange failed after pushing non-NULL context */
return UCS_ERR_INVALID_ADDR;
}

if (status_ctx_pop != UCS_OK) {
return status_ctx_pop;
}
}

if (status == UCS_OK) {
*base_address_p = (void*)base;
*alloc_length_p = size;
} else {
/* Use default values when cuMemGetAddressRange failed without pushing
* non-NULL context */
*base_address_p = (void*)address;
*alloc_length_p = length;
}

return UCS_OK;
}

static ucs_status_t
uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
size_t length, ucs_memory_info_t *mem_info)
@@ -582,11 +644,11 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
     CUcontext cuda_mem_ctx = NULL;
     CUpointer_attribute attr_type[UCT_CUDA_MEM_QUERY_NUM_ATTRS];
     void *attr_data[UCT_CUDA_MEM_QUERY_NUM_ATTRS];
-    CUdeviceptr base_address;
+    void *base_address;
     size_t alloc_length;
     size_t total_bytes;
     int32_t pref_loc;
-    unsigned is_vmm;
+    int is_vmm;
     CUresult cu_err;
     ucs_status_t status;
 
@@ -681,16 +743,14 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
         goto out_default_range;
     }
 
-    cu_err = cuMemGetAddressRange(&base_address, &alloc_length,
-                                  (CUdeviceptr)address);
-    if (cu_err != CUDA_SUCCESS) {
-        ucs_error("cuMemGetAddressRange(%p) error: %s", address,
-                  uct_cuda_base_cu_get_error_string(cu_err));
-        return UCS_ERR_INVALID_ADDR;
+    status = uct_cuda_copy_md_get_address_range(address, length, cuda_mem_ctx,
+                                                &base_address, &alloc_length);
+    if (status != UCS_OK) {
+        return status;
     }
 
-    ucs_trace("query address %p: 0x%llx..0x%llx length %zu", address,
-              base_address, base_address + alloc_length, alloc_length);
+    ucs_trace("query address %p: %p..%p length %zu", address, base_address,
+              UCS_PTR_BYTE_OFFSET(base_address, alloc_length), alloc_length);
 
     if (md->config.alloc_whole_reg == UCS_CONFIG_AUTO) {
         total_bytes = uct_cuda_copy_md_get_total_device_mem(cuda_device);
@@ -701,7 +761,7 @@ uct_cuda_copy_md_query_attributes(uct_cuda_copy_md_t *md, const void *address,
         ucs_assert(md->config.alloc_whole_reg == UCS_CONFIG_ON);
     }
 
-    mem_info->base_address = (void*)base_address;
+    mem_info->base_address = base_address;
     mem_info->alloc_length = alloc_length;
     return UCS_OK;
 
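For reference, the push/pop flow that the new uct_cuda_copy_md_get_address_range() helper wraps can be reproduced with public CUDA driver-API calls. The sketch below is illustrative only and is not part of this PR; it assumes at least two visible CUDA devices and omits most error handling.

/*
 * Illustrative sketch (not part of this PR): query the address range of an
 * allocation owned by another device's context by temporarily pushing the
 * owning context, mirroring what the new helper does.
 */
#include <cuda.h>
#include <stdio.h>

int main(void)
{
    CUdevice dev0, dev1;
    CUcontext ctx0, ctx1, owner, popped;
    CUdeviceptr ptr, base;
    size_t size;
    CUresult res;

    cuInit(0);
    cuDeviceGet(&dev0, 0);
    cuDeviceGet(&dev1, 1);
    cuDevicePrimaryCtxRetain(&ctx0, dev0);
    cuDevicePrimaryCtxRetain(&ctx1, dev1);

    /* Allocate on device 0 */
    cuCtxSetCurrent(ctx0);
    cuMemAlloc(&ptr, 4096);

    /* The application later switches to device 1, e.g. via cudaSetDevice() */
    cuCtxSetCurrent(ctx1);

    /* Look up the context that owns the pointer, push it around
     * cuMemGetAddressRange(), then restore the previous current context
     * (the same pattern the new helper uses) */
    cuPointerGetAttribute(&owner, CU_POINTER_ATTRIBUTE_CONTEXT, ptr);
    cuCtxPushCurrent(owner);
    res = cuMemGetAddressRange(&base, &size, ptr);
    cuCtxPopCurrent(&popped);

    printf("range query: res=%d base=%p size=%zu\n",
           (int)res, (void*)base, size);

    /* Cleanup */
    cuCtxSetCurrent(ctx0);
    cuMemFree(ptr);
    cuDevicePrimaryCtxRelease(dev0);
    cuDevicePrimaryCtxRelease(dev1);
    return 0;
}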
1 change: 1 addition & 0 deletions test/gtest/Makefile.am
@@ -260,6 +260,7 @@ if HAVE_CUDA
 gtest_SOURCES += \
     common/cuda_context.cc \
     ucm/cuda_hooks.cc \
+    uct/cuda/test_cuda_copy_md.cc \
     uct/cuda/test_cuda_ipc_md.cc
 gtest_CPPFLAGS += \
     $(CUDA_CPPFLAGS)
60 changes: 60 additions & 0 deletions test/gtest/uct/cuda/test_cuda_copy_md.cc
@@ -0,0 +1,60 @@
/**
* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/

#include <uct/test_md.h>

#include <cuda_runtime.h>

class test_cuda_copy_md_multi_gpu : public test_md {
public:
uct_allocated_memory_t mem_alloc(size_t size) const;
};

uct_allocated_memory_t test_cuda_copy_md_multi_gpu::mem_alloc(size_t size) const
{
uct_alloc_method_t method = UCT_ALLOC_METHOD_MD;
uct_md_h md = m_md.get();

uct_mem_alloc_params_t params;
params.field_mask = UCT_MEM_ALLOC_PARAM_FIELD_MEM_TYPE |
UCT_MEM_ALLOC_PARAM_FIELD_MDS |
UCT_MEM_ALLOC_PARAM_FIELD_NAME;
params.mem_type = UCS_MEMORY_TYPE_CUDA;
params.mds.mds = &md;
params.mds.count = 1;
params.name = "test_cuda_copy_md_multi_gpu";

uct_allocated_memory_t mem;
EXPECT_EQ(uct_mem_alloc(size, &method, 1, &params, &mem), UCS_OK);
return mem;
}

UCS_TEST_P(test_cuda_copy_md_multi_gpu, mem_query) {
int num_devices;
ASSERT_EQ(cudaGetDeviceCount(&num_devices), cudaSuccess);

if (num_devices < 2) {
UCS_TEST_SKIP_R("less than two cuda devices available");
}

int device;
ASSERT_EQ(cudaGetDevice(&device), cudaSuccess);
ASSERT_EQ(cudaSetDevice((device + 1) % num_devices), cudaSuccess);

const size_t size = 16;
auto mem = mem_alloc(size);

EXPECT_EQ(cudaSetDevice(device), cudaSuccess);

uct_md_mem_attr_t mem_attr = {};
mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE;
EXPECT_EQ(uct_md_mem_query(m_md.get(), mem.address, size, &mem_attr),
UCS_OK);
EXPECT_EQ(mem_attr.mem_type, UCS_MEMORY_TYPE_CUDA);
EXPECT_EQ(uct_mem_free(&mem), UCS_OK);
}

_UCT_MD_INSTANTIATE_TEST_CASE(test_cuda_copy_md_multi_gpu, cuda_cpy);
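A quick way to run just the new case, assuming UCX was configured with --with-cuda and the gtest binary was built under test/gtest (the test skips itself on machines with fewer than two CUDA devices), would be something like:

./test/gtest/gtest --gtest_filter='*test_cuda_copy_md_multi_gpu*'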