Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 46 additions & 4 deletions src/runtime_environment/device/cuda/cuda_backend.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
* See LICENSE.txt for details
*/
#include "cuda_backend.hpp"
#include <gauxc/exceptions.hpp>

namespace GauXC {

Expand All @@ -28,7 +29,41 @@ CUDABackend::CUDABackend() {

}

CUDABackend::~CUDABackend() noexcept = default;
#ifdef GAUXC_HAS_MPI
/**
 * @brief MPI-aware backend construction.
 *
 * Splits @p c into a node-local (shared-memory) communicator, assigns this
 * rank a GPU round-robin by node-local rank, and creates the master CUDA
 * stream / cuBLAS handle (and MAGMA queue when enabled).
 *
 * @param c Communicator spanning all ranks using this backend. The derived
 *          node-local communicator is owned by this object and released in
 *          the destructor.
 *
 * @throws via GAUXC_CUDA_ERROR / GAUXC_GENERIC_EXCEPTION on CUDA failure or
 *         when no CUDA devices are visible.
 */
CUDABackend::CUDABackend(MPI_Comm c)
{
  comm = c;

  // Query the device count *before* creating any MPI resources: if an
  // exception is thrown here the destructor never runs (the object is only
  // partially constructed), so nothing may be left to clean up yet.
  int ndev;
  auto stat = cudaGetDeviceCount(&ndev);
  GAUXC_CUDA_ERROR("CUDA backend init failed", stat);
  if(ndev <= 0) GAUXC_GENERIC_EXCEPTION("No CUDA devices found");

  // Node-local communicator: how many ranks share this node, and which one
  // are we? Ranks are mapped to GPUs round-robin by node-local rank.
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
                      MPI_INFO_NULL, &local_comm);
  MPI_Comm_size(local_comm, &local_size);
  MPI_Comm_rank(local_comm, &local_rank);

  gpuid = local_rank % ndev;
  stat = cudaSetDevice(gpuid);
  if(stat != cudaSuccess) {
    // Free the split communicator before throwing - the destructor will not
    // run for a partially constructed object, so it would otherwise leak.
    MPI_Comm_free(&local_comm);
    local_comm = MPI_COMM_NULL;
  }
  GAUXC_CUDA_ERROR("cudaSetDevice failed", stat);

  // Create CUDA stream and cuBLAS handle and make them talk to each other
  master_stream = std::make_shared< util::cuda_stream >();
  master_handle = std::make_shared< util::cublas_handle >();

  cublasSetStream( *master_handle, *master_stream );

#ifdef GAUXC_HAS_MAGMA
  // Setup MAGMA queue with CUDA stream / cuBLAS handle
  master_magma_queue_ = std::make_shared< util::magma_queue >(0, *master_stream, *master_handle);
#endif
}
#endif

/// Release backend-owned MPI resources (no-op in non-MPI builds).
CUDABackend::~CUDABackend() noexcept {
#ifdef GAUXC_HAS_MPI
  // Free the node-local communicator created by the MPI constructor. The
  // guard is required: the non-MPI constructor leaves local_comm at its
  // default MPI_COMM_NULL, which must not be passed to MPI_Comm_free.
  if( MPI_COMM_NULL != local_comm ) {
    MPI_Comm_free( &local_comm );
  }
#endif
}

CUDABackend::device_buffer_t CUDABackend::allocate_device_buffer(int64_t sz) {
void* ptr;
Expand All @@ -41,6 +76,14 @@ size_t CUDABackend::get_available_mem() {
size_t cuda_avail, cuda_total;
auto stat = cudaMemGetInfo( &cuda_avail, &cuda_total );
GAUXC_CUDA_ERROR( "MemInfo Failed", stat );
#ifdef GAUXC_HAS_MPI
int ndev;
stat = cudaGetDeviceCount(&ndev);
GAUXC_CUDA_ERROR("MemInfo Failed while getting number of devices", stat);
double factor = 1.0 / ((local_size - 1) / ndev + 1);
factor = (factor > 1.0 ? 1.0 : factor);
cuda_avail = size_t(cuda_avail * factor);
Comment on lines +80 to +85
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The memory-splitting logic uses ceil(local_size / ndev) (via ((local_size - 1) / ndev + 1)) for every rank, which does not equal “processes sharing the same GPU” under round-robin assignment when local_size % ndev != 0 (some GPUs have fewer ranks). This contradicts the PR description and can under-allocate usable memory on lightly loaded GPUs. To match the stated behavior, compute the exact number of local ranks mapped to this gpuid (e.g., MPI_Comm_split(local_comm, gpuid, local_rank, &gpu_comm) and use MPI_Comm_size(gpu_comm, &n_on_gpu)) and divide by that.

Copilot uses AI. Check for mistakes.
#endif
return cuda_avail;
}

Expand Down Expand Up @@ -137,8 +180,7 @@ void CUDABackend::check_error_(std::string msg) {
GAUXC_CUDA_ERROR("CUDA Failed ["+msg+"]", stat );
}


/// Factory for the default device backend on this platform (CUDA build).
/// In MPI builds the communicator is forwarded so the backend can bind each
/// node-local rank to a GPU; GAUXC_MPI_CODE elides it otherwise.
std::unique_ptr<DeviceBackend> make_device_backend(GAUXC_MPI_CODE(MPI_Comm c)) {
  auto backend = std::make_unique<CUDABackend>(GAUXC_MPI_CODE(c));
  return backend;
}
}
9 changes: 9 additions & 0 deletions src/runtime_environment/device/cuda/cuda_backend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,15 @@ struct CUDABackend : public DeviceBackend {

std::vector<std::shared_ptr<util::cuda_stream>> blas_streams;
std::vector<std::shared_ptr<util::cublas_handle>> blas_handles;

#ifdef GAUXC_HAS_MPI
MPI_Comm comm = MPI_COMM_NULL;
MPI_Comm local_comm = MPI_COMM_NULL;
int gpuid = 0;
int local_rank = 0;
int local_size = 1;
CUDABackend(MPI_Comm comm);
#endif
};

}
4 changes: 2 additions & 2 deletions src/runtime_environment/device/device_backend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "device_queue.hpp"
#include "device_blas_handle.hpp"
#include <gauxc/gauxc_config.hpp>
#include <gauxc/util/mpi.hpp>

#ifdef GAUXC_HAS_MAGMA
#include "device_specific/magma_util.hpp"
Expand Down Expand Up @@ -99,6 +100,5 @@ class DeviceBackend {


/// Generate the default device backend for this platform
std::unique_ptr<DeviceBackend> make_device_backend();

std::unique_ptr<DeviceBackend> make_device_backend(GAUXC_MPI_CODE(MPI_Comm c));
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class DeviceRuntimeEnvironmentImpl : public RuntimeEnvironmentImpl {
size_t sz) : parent_type(GAUXC_MPI_CODE(c)),
i_own_this_memory_(false), device_memory_(p),
device_memory_size_(sz),
device_backend_{make_device_backend()} {}
device_backend_{make_device_backend(GAUXC_MPI_CODE(c))} {}


explicit DeviceRuntimeEnvironmentImpl(GAUXC_MPI_CODE(MPI_Comm c,)
Expand All @@ -44,6 +44,7 @@ class DeviceRuntimeEnvironmentImpl : public RuntimeEnvironmentImpl {

// Allocate Device Memory
auto avail = device_backend_->get_available_mem();
GAUXC_MPI_CODE(MPI_Barrier(c);)
avail = std::min( avail, detail::memory_cap() );

std::tie( device_memory_, device_memory_size_ ) =
Expand Down
4 changes: 3 additions & 1 deletion src/runtime_environment/device/hip/hip_backend.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,9 @@ void HIPBackend::check_error_(std::string msg) {
GAUXC_HIP_ERROR("HIP Failed ["+msg+"]", stat );
}

/// Factory for the default device backend on this platform (HIP build).
/// The communicator parameter exists for interface parity with the CUDA
/// backend; HIP does not use it, and the cast keeps -Wunused-parameter quiet
/// in MPI-enabled builds.
std::unique_ptr<DeviceBackend> make_device_backend(GAUXC_MPI_CODE(MPI_Comm c)) {
  GAUXC_MPI_CODE( (void)c; )
  return std::make_unique<HIPBackend>();
}
}