Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 46 additions & 4 deletions src/runtime_environment/device/cuda/cuda_backend.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
* See LICENSE.txt for details
*/
#include "cuda_backend.hpp"
#include <gauxc/exceptions.hpp>

namespace GauXC {

Expand All @@ -28,7 +29,41 @@ CUDABackend::CUDABackend() {

}

CUDABackend::~CUDABackend() noexcept = default;
#ifdef GAUXC_HAS_MPI
/**
 * @brief MPI-aware backend construction.
 *
 * Splits @p c into a node-local (shared-memory) communicator, assigns this
 * rank a GPU round-robin by node-local rank, and creates the master CUDA
 * stream / cuBLAS handle (and MAGMA queue when enabled).
 *
 * @param c Communicator spanning all ranks using this backend. The derived
 *          node-local communicator is owned by this object and released in
 *          the destructor.
 *
 * @throws via GAUXC_CUDA_ERROR / GAUXC_GENERIC_EXCEPTION on CUDA failure or
 *         when no CUDA devices are visible.
 */
CUDABackend::CUDABackend(MPI_Comm c)
{
  comm = c;

  // Query the device count *before* creating any MPI resources: if an
  // exception is thrown here the destructor never runs (the object is only
  // partially constructed), so nothing may be left to clean up yet.
  int ndev;
  auto stat = cudaGetDeviceCount(&ndev);
  GAUXC_CUDA_ERROR("CUDA backend init failed", stat);
  if(ndev <= 0) GAUXC_GENERIC_EXCEPTION("No CUDA devices found");

  // Node-local communicator: how many ranks share this node, and which one
  // are we? Ranks are mapped to GPUs round-robin by node-local rank.
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
                      MPI_INFO_NULL, &local_comm);
  MPI_Comm_size(local_comm, &local_size);
  MPI_Comm_rank(local_comm, &local_rank);

  gpuid = local_rank % ndev;
  stat = cudaSetDevice(gpuid);
  if(stat != cudaSuccess) {
    // Free the split communicator before throwing - the destructor will not
    // run for a partially constructed object, so it would otherwise leak.
    MPI_Comm_free(&local_comm);
    local_comm = MPI_COMM_NULL;
  }
  GAUXC_CUDA_ERROR("cudaSetDevice failed", stat);

  // Create CUDA stream and cuBLAS handle and make them talk to each other
  master_stream = std::make_shared< util::cuda_stream >();
  master_handle = std::make_shared< util::cublas_handle >();

  cublasSetStream( *master_handle, *master_stream );

#ifdef GAUXC_HAS_MAGMA
  // Setup MAGMA queue with CUDA stream / cuBLAS handle
  master_magma_queue_ = std::make_shared< util::magma_queue >(0, *master_stream, *master_handle);
#endif
}
#endif

/// Release backend-owned MPI resources (no-op in non-MPI builds).
CUDABackend::~CUDABackend() noexcept {
#ifdef GAUXC_HAS_MPI
  // Free the node-local communicator created by the MPI constructor. The
  // guard is required: the non-MPI constructor leaves local_comm at its
  // default MPI_COMM_NULL, which must not be passed to MPI_Comm_free.
  if( MPI_COMM_NULL != local_comm ) {
    MPI_Comm_free( &local_comm );
  }
#endif
}

CUDABackend::device_buffer_t CUDABackend::allocate_device_buffer(int64_t sz) {
void* ptr;
Expand All @@ -41,6 +76,14 @@ size_t CUDABackend::get_available_mem() {
size_t cuda_avail, cuda_total;
auto stat = cudaMemGetInfo( &cuda_avail, &cuda_total );
GAUXC_CUDA_ERROR( "MemInfo Failed", stat );
#ifdef GAUXC_HAS_MPI
int ndev;
stat = cudaGetDeviceCount(&ndev);
GAUXC_CUDA_ERROR("MemInfo Failed while getting number of devices", stat);
double factor = 1.0 / ((local_size - 1) / ndev + 1);
factor = (factor > 1.0 ? 1.0 : factor);
cuda_avail = size_t(cuda_avail * factor);
Comment on lines +80 to +85
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The memory-splitting logic uses ceil(local_size / ndev) (via ((local_size - 1) / ndev + 1)) for every rank, which does not equal “processes sharing the same GPU” under round-robin assignment when local_size % ndev != 0 (some GPUs have fewer ranks). This contradicts the PR description and can under-allocate usable memory on lightly loaded GPUs. To match the stated behavior, compute the exact number of local ranks mapped to this gpuid (e.g., MPI_Comm_split(local_comm, gpuid, local_rank, &gpu_comm) and use MPI_Comm_size(gpu_comm, &n_on_gpu)) and divide by that.

Copilot uses AI. Check for mistakes.
#endif
return cuda_avail;
}

Expand Down Expand Up @@ -137,8 +180,7 @@ void CUDABackend::check_error_(std::string msg) {
GAUXC_CUDA_ERROR("CUDA Failed ["+msg+"]", stat );
}


/// Factory for the default device backend on this platform (CUDA build).
/// In MPI builds the communicator is forwarded so the backend can bind each
/// node-local rank to a GPU; GAUXC_MPI_CODE elides it otherwise.
std::unique_ptr<DeviceBackend> make_device_backend(GAUXC_MPI_CODE(MPI_Comm c)) {
  auto backend = std::make_unique<CUDABackend>(GAUXC_MPI_CODE(c));
  return backend;
}
}
9 changes: 9 additions & 0 deletions src/runtime_environment/device/cuda/cuda_backend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,15 @@ struct CUDABackend : public DeviceBackend {

std::vector<std::shared_ptr<util::cuda_stream>> blas_streams;
std::vector<std::shared_ptr<util::cublas_handle>> blas_handles;

#ifdef GAUXC_HAS_MPI
MPI_Comm comm = MPI_COMM_NULL;
MPI_Comm local_comm = MPI_COMM_NULL;
int gpuid = 0;
int local_rank = 0;
int local_size = 1;
CUDABackend(MPI_Comm comm);
#endif
};

}
4 changes: 2 additions & 2 deletions src/runtime_environment/device/device_backend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "device_queue.hpp"
#include "device_blas_handle.hpp"
#include <gauxc/gauxc_config.hpp>
#include <gauxc/util/mpi.hpp>

#ifdef GAUXC_HAS_MAGMA
#include "device_specific/magma_util.hpp"
Expand Down Expand Up @@ -99,6 +100,5 @@ class DeviceBackend {


/// Generate the default device backend for this platform
std::unique_ptr<DeviceBackend> make_device_backend();

std::unique_ptr<DeviceBackend> make_device_backend(GAUXC_MPI_CODE(MPI_Comm c));
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class DeviceRuntimeEnvironmentImpl : public RuntimeEnvironmentImpl {
size_t sz) : parent_type(GAUXC_MPI_CODE(c)),
i_own_this_memory_(false), device_memory_(p),
device_memory_size_(sz),
device_backend_{make_device_backend()} {}
device_backend_{make_device_backend(GAUXC_MPI_CODE(c))} {}


explicit DeviceRuntimeEnvironmentImpl(GAUXC_MPI_CODE(MPI_Comm c,)
Expand All @@ -44,6 +44,7 @@ class DeviceRuntimeEnvironmentImpl : public RuntimeEnvironmentImpl {

// Allocate Device Memory
auto avail = device_backend_->get_available_mem();
GAUXC_MPI_CODE(MPI_Barrier(c);)
avail = std::min( avail, detail::memory_cap() );

std::tie( device_memory_, device_memory_size_ ) =
Expand Down
4 changes: 3 additions & 1 deletion src/runtime_environment/device/hip/hip_backend.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,9 @@ void HIPBackend::check_error_(std::string msg) {
GAUXC_HIP_ERROR("HIP Failed ["+msg+"]", stat );
}

/// Factory for the default device backend on this platform (HIP build).
/// The communicator parameter exists for interface parity with the CUDA
/// backend; HIP does not use it, and the cast keeps -Wunused-parameter quiet
/// in MPI-enabled builds.
std::unique_ptr<DeviceBackend> make_device_backend(GAUXC_MPI_CODE(MPI_Comm c)) {
  GAUXC_MPI_CODE( (void)c; )
  return std::make_unique<HIPBackend>();
}
}