-
Notifications
You must be signed in to change notification settings - Fork 35
Improve CUDA resource management for MPI jobs #185
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,6 +10,7 @@ | |
| * See LICENSE.txt for details | ||
| */ | ||
| #include "cuda_backend.hpp" | ||
| #include <gauxc/exceptions.hpp> | ||
|
|
||
| namespace GauXC { | ||
|
|
||
|
|
@@ -28,7 +29,41 @@ CUDABackend::CUDABackend() { | |
|
|
||
| } | ||
|
|
||
| CUDABackend::~CUDABackend() noexcept = default; | ||
| #ifdef GAUXC_HAS_MPI | ||
| CUDABackend::CUDABackend(MPI_Comm c) | ||
| { | ||
| comm = c; | ||
| MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, | ||
| MPI_INFO_NULL, &local_comm); | ||
| MPI_Comm_size(local_comm, &local_size); | ||
| MPI_Comm_rank(local_comm, &local_rank); | ||
| int ndev; | ||
| auto stat = cudaGetDeviceCount(&ndev); | ||
| GAUXC_CUDA_ERROR("CUDA backend init failed", stat); | ||
| if(ndev <= 0) GAUXC_GENERIC_EXCEPTION("No CUDA devices found"); | ||
| gpuid = local_rank % ndev; | ||
| stat = cudaSetDevice(gpuid); | ||
| GAUXC_CUDA_ERROR("cudaSetDevice failed", stat); | ||
|
|
||
|
Comment on lines
+40
to
+47
|
||
| // Create CUDA Stream and CUBLAS Handles and make them talk to eachother | ||
| master_stream = std::make_shared< util::cuda_stream >(); | ||
| master_handle = std::make_shared< util::cublas_handle >(); | ||
|
|
||
| cublasSetStream( *master_handle, *master_stream ); | ||
|
|
||
| #ifdef GAUXC_HAS_MAGMA | ||
| // Setup MAGMA queue with CUDA stream / cuBLAS handle | ||
| master_magma_queue_ = std::make_shared< util::magma_queue >(0, *master_stream, *master_handle); | ||
| #endif | ||
| } | ||
| #endif | ||
|
|
||
| CUDABackend::~CUDABackend() noexcept { | ||
| #ifdef GAUXC_HAS_MPI | ||
| if(local_comm != MPI_COMM_NULL) | ||
| MPI_Comm_free(&local_comm); | ||
| #endif | ||
| } | ||
|
|
||
| CUDABackend::device_buffer_t CUDABackend::allocate_device_buffer(int64_t sz) { | ||
| void* ptr; | ||
|
|
@@ -41,6 +76,14 @@ size_t CUDABackend::get_available_mem() { | |
| size_t cuda_avail, cuda_total; | ||
| auto stat = cudaMemGetInfo( &cuda_avail, &cuda_total ); | ||
| GAUXC_CUDA_ERROR( "MemInfo Failed", stat ); | ||
| #ifdef GAUXC_HAS_MPI | ||
| int ndev; | ||
| stat = cudaGetDeviceCount(&ndev); | ||
| GAUXC_CUDA_ERROR("MemInfo Failed while getting number of devices", stat); | ||
| double factor = 1.0 / ((local_size - 1) / ndev + 1); | ||
| factor = (factor > 1.0 ? 1.0 : factor); | ||
| cuda_avail = size_t(cuda_avail * factor); | ||
|
Comment on lines
+80
to
+85
|
||
| #endif | ||
| return cuda_avail; | ||
| } | ||
|
|
||
|
|
@@ -137,8 +180,7 @@ void CUDABackend::check_error_(std::string msg) { | |
| GAUXC_CUDA_ERROR("CUDA Failed ["+msg+"]", stat ); | ||
| } | ||
|
|
||
|
|
||
| std::unique_ptr<DeviceBackend> make_device_backend() { | ||
| return std::make_unique<CUDABackend>(); | ||
| std::unique_ptr<DeviceBackend> make_device_backend(GAUXC_MPI_CODE(MPI_Comm c)) { | ||
| return std::make_unique<CUDABackend>(GAUXC_MPI_CODE(c)); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -128,7 +128,9 @@ void HIPBackend::check_error_(std::string msg) { | |||||||||||
| GAUXC_HIP_ERROR("HIP Failed ["+msg+"]", stat ); | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| std::unique_ptr<DeviceBackend> make_device_backend() { | ||||||||||||
| std::unique_ptr<DeviceBackend> make_device_backend(GAUXC_MPI_CODE(MPI_Comm c)) | ||||||||||||
| { | ||||||||||||
|
||||||||||||
| { | |
| { | |
| #ifdef GAUXC_HAS_MPI | |
| (void) c; | |
| #endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`MPI_Comm_split_type` creates `local_comm`, but the communicator is never freed. Since `CUDABackend::~CUDABackend()` is defaulted, this will leak MPI communicators over the lifetime of the process (and can become problematic if backends are created/destroyed multiple times). Consider calling `MPI_Comm_free(&local_comm)` in the destructor when `local_comm != MPI_COMM_NULL` (and similarly guard/free any other duplicated/split comms you introduce).