Commit 1a59c5e

Tweak CArena Defragmentation Strategy (#4531)
The previous strategy, added in #4451, has a flaw. Suppose a CArena's initial size is small and we have n vectors, each of size x. Now we resize these vectors one by one to size x+y, where y << x. Because the old strategy folded all currently unused hunks into each new system allocation, every resized vector ends up in a new allocation of size 2*x+y. We have doubled the memory usage in the end, because the unused space stranded inside each hunk cannot be combined with the others. In the new strategy, we only attempt to combine allocations when the combined amount is not less than the requested allocation size. We also check the result of the allocation call now. If it fails, we try to free more memory and allocate again.
1 parent: 69ad456
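To make the arithmetic in the commit message concrete, here is a small stand-alone C++ sketch with made-up numbers (n = 8 vectors of x = 100 MB, each grown by y = 1 MB; none of these values come from the PR):

#include <cstdio>

// Illustrative numbers (not from the PR): n vectors of size x, each grown
// to x + y, with a CArena whose initial hunk is too small to matter.
int main ()
{
    const long long MB = 1024*1024;
    const long long n = 8, x = 100*MB, y = 1*MB;

    // Old strategy: each reallocation folds the previously freed hunk (x)
    // into the new system allocation, so every vector ends up in a hunk of
    // roughly 2*x + y with x bytes stranded inside it.
    long long old_footprint = n * (2*x + y);

    // New strategy: the freeable x is less than the requested x + y, so the
    // hunks are not combined; each vector just gets a hunk of x + y.
    long long new_footprint = n * (x + y);

    std::printf("old: %lld MB, new: %lld MB\n", old_footprint/MB, new_footprint/MB);
    return 0;
}

Under the old strategy the x bytes stranded in each 2*x+y hunk can never be merged across hunks; under the new strategy no combining happens here, and the freed hunk simply stays on the free list for reuse.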

File tree: 3 files changed, +82 -17 lines


Src/Base/AMReX_Arena.cpp

Lines changed: 57 additions & 13 deletions
@@ -162,6 +162,10 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-function-const)
     if (arena_info.use_cpu_memory)
     {
         p = std::malloc(nbytes);
+        if (!p) {
+            freeUnused_protected();
+            p = std::malloc(nbytes);
+        }
 #ifndef _WIN32
 #if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic push
@@ -175,28 +179,52 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-function-const)
     }
     else if (arena_info.device_use_hostalloc)
     {
-        AMREX_HIP_OR_CUDA_OR_SYCL(
-            AMREX_HIP_SAFE_CALL (hipHostMalloc(&p, nbytes, hipHostMallocMapped|hipHostMallocNonCoherent));,
-            AMREX_CUDA_SAFE_CALL(cudaHostAlloc(&p, nbytes, cudaHostAllocMapped));,
-            p = sycl::malloc_host(nbytes, Gpu::Device::syclContext()));
+#if defined(AMREX_USE_HIP)
+        auto ret = hipHostMalloc(&p, nbytes, hipHostMallocMapped|hipHostMallocNonCoherent);
+        if (ret != hipSuccess) { p = nullptr; }
+#elif defined(AMREX_USE_CUDA)
+        auto ret = cudaHostAlloc(&p, nbytes, cudaHostAllocMapped);
+        if (ret != cudaSuccess) { p = nullptr; }
+#else
+        p = sycl::malloc_host(nbytes, Gpu::Device::syclContext());
+#endif
+        if (!p) {
+            freeUnused_protected();
+            AMREX_HIP_OR_CUDA_OR_SYCL(
+                AMREX_HIP_SAFE_CALL (hipHostMalloc(&p, nbytes, hipHostMallocMapped|hipHostMallocNonCoherent));,
+                AMREX_CUDA_SAFE_CALL(cudaHostAlloc(&p, nbytes, cudaHostAllocMapped));,
+                p = sycl::malloc_host(nbytes, Gpu::Device::syclContext()));
+        }
     }
     else
     {
         std::size_t free_mem_avail = Gpu::Device::freeMemAvailable();
         if (nbytes >= free_mem_avail) {
             free_mem_avail += freeUnused_protected(); // For CArena, the mutex has already been acquired
-            if (abort_on_out_of_gpu_memory && nbytes >= free_mem_avail) {
+            if (abort_on_out_of_gpu_memory && nbytes >= free_mem_avail && arena_info.device_use_managed_memory) {
                 amrex::Abort("Out of gpu memory. Free: " + std::to_string(free_mem_avail)
                              + " Asked: " + std::to_string(nbytes));
             }
         }

         if (arena_info.device_use_managed_memory)
         {
-            AMREX_HIP_OR_CUDA_OR_SYCL
-                (AMREX_HIP_SAFE_CALL(hipMallocManaged(&p, nbytes));,
-                 AMREX_CUDA_SAFE_CALL(cudaMallocManaged(&p, nbytes));,
-                 p = sycl::malloc_shared(nbytes, Gpu::Device::syclDevice(), Gpu::Device::syclContext()));
+#if defined(AMREX_USE_HIP)
+            auto ret = hipMallocManaged(&p, nbytes);
+            if (ret != hipSuccess) { p = nullptr; }
+#elif defined(AMREX_USE_CUDA)
+            auto ret = cudaMallocManaged(&p, nbytes);
+            if (ret != cudaSuccess) { p = nullptr; }
+#else
+            p = sycl::malloc_shared(nbytes, Gpu::Device::syclDevice(), Gpu::Device::syclContext());
+#endif
+            if (!p) {
+                freeUnused_protected();
+                AMREX_HIP_OR_CUDA_OR_SYCL
+                    (AMREX_HIP_SAFE_CALL(hipMallocManaged(&p, nbytes));,
+                     AMREX_CUDA_SAFE_CALL(cudaMallocManaged(&p, nbytes));,
+                     p = sycl::malloc_shared(nbytes, Gpu::Device::syclDevice(), Gpu::Device::syclContext()));
+            }
 #ifdef AMREX_USE_HIP
             // Otherwise atomicAdd won't work because we instruct the compiler to do unsafe atomics
             AMREX_HIP_SAFE_CALL(hipMemAdvise(p, nbytes, hipMemAdviseSetCoarseGrain,
@@ -214,14 +242,30 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-function-const)
         }
         else
         {
-            AMREX_HIP_OR_CUDA_OR_SYCL
-                (AMREX_HIP_SAFE_CALL ( hipMalloc(&p, nbytes));,
-                 AMREX_CUDA_SAFE_CALL(cudaMalloc(&p, nbytes));,
-                 p = sycl::malloc_device(nbytes, Gpu::Device::syclDevice(), Gpu::Device::syclContext()));
+#if defined(AMREX_USE_HIP)
+            auto ret = hipMalloc(&p, nbytes);
+            if (ret != hipSuccess) { p = nullptr; }
+#elif defined(AMREX_USE_CUDA)
+            auto ret = cudaMalloc(&p, nbytes);
+            if (ret != cudaSuccess) { p = nullptr; }
+#else
+            p = sycl::malloc_device(nbytes, Gpu::Device::syclDevice(), Gpu::Device::syclContext());
+#endif
+            if (!p) {
+                freeUnused_protected();
+                AMREX_HIP_OR_CUDA_OR_SYCL
+                    (AMREX_HIP_SAFE_CALL ( hipMalloc(&p, nbytes));,
+                     AMREX_CUDA_SAFE_CALL(cudaMalloc(&p, nbytes));,
+                     p = sycl::malloc_device(nbytes, Gpu::Device::syclDevice(), Gpu::Device::syclContext()));
+            }
         }
     }
 #else
     p = std::malloc(nbytes);
+    if (!p) {
+        freeUnused_protected();
+        p = std::malloc(nbytes);
+    }
 #ifndef _WIN32
 #if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic push
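Every branch above follows the same shape: the first attempt goes through an error-returning call rather than an aborting SAFE_CALL macro, and a null result triggers freeUnused_protected() followed by one retry through the SAFE_CALL path, so a second failure still aborts. A minimal stand-alone sketch of that shape, with hypothetical names (try_alloc and release_unused stand in for the real allocator hooks):

#include <cstdlib>

// Hypothetical stand-ins for the real hooks: try_alloc models an
// error-returning allocator (malloc, cudaMalloc, ...); release_unused
// models CArena::freeUnused_protected.
void* try_alloc (std::size_t nbytes) { return std::malloc(nbytes); }
std::size_t release_unused () { return 0; /* bytes returned to the system */ }

void* alloc_with_retry (std::size_t nbytes)
{
    void* p = try_alloc(nbytes);   // first attempt: null on failure, no abort
    if (!p) {
        release_unused();          // hand cached-but-unused hunks back
        p = try_alloc(nbytes);     // second attempt; in the commit this retry
                                   // goes through SAFE_CALL, so failure aborts
    }
    return p;
}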

Src/Base/AMReX_CArena.H

Lines changed: 3 additions & 0 deletions
@@ -65,6 +65,9 @@ public:
 
     std::size_t freeUnused () final;
 
+    //! Return the amount of memory that can be freed
+    [[nodiscard]] std::size_t freeableMemory () const;
+
     /**
      * \brief Does the device have enough free memory for allocating this
      * much memory? For CPU builds, this always returns true. This is not a
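The new freeableMemory is a const query: it reports how many bytes a later freeUnused() could hand back to the system, without releasing anything itself. The commit only calls it internally while holding the arena mutex; a caller-side use like the following is purely a hypothetical sketch and would need the same locking care:

#include <AMReX_CArena.H>
#include <cstddef>

// Hypothetical usage: trim the arena only when doing so would recover
// enough memory to be worth the system calls.
void maybe_trim (amrex::CArena& arena, std::size_t needed)
{
    if (arena.freeableMemory() >= needed) {
        arena.freeUnused();   // actually releases the wholly unused hunks
    }
}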

Src/Base/AMReX_CArena.cpp

Lines changed: 22 additions & 4 deletions
@@ -65,11 +65,16 @@ CArena::alloc_protected (std::size_t nbytes)
     if (free_it == m_freelist.end())
     {
         // Both freeUnused_protected and allocate_system may invalidate free_it.
-        // All unused memory allocations are combined with the new one to reduce fragmentation.
-        const auto freed_bytes = (freeunused_called || !arena_info.defragmentation)
-            ? std::size_t(0) : freeUnused_protected();
 
-        const std::size_t N = std::max(m_hunk, freed_bytes + nbytes);
+        std::size_t N = std::max(m_hunk, nbytes);
+
+        if ((!freeunused_called) && arena_info.defragmentation) {
+            auto freeable_nbytes = freeableMemory();
+            if (freeable_nbytes >= N) {
+                freeUnused_protected();
+                N = freeable_nbytes;
+            }
+        }
 
         vp = allocate_system(N);
 
@@ -355,6 +360,19 @@ CArena::freeUnused ()
     return freeUnused_protected();
 }
 
+std::size_t
+CArena::freeableMemory () const
+{
+    std::size_t nbytes = 0;
+    for (auto const& [p, sz] : m_alloc) {
+        auto it = m_freelist.find(Node(p,nullptr,0));
+        if (it != m_freelist.end() && it->owner() == p && it->size() == sz) {
+            nbytes += sz;
+        }
+    }
+    return nbytes;
+}
+
 std::size_t
 CArena::freeUnused_protected ()
 {
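freeableMemory counts a system hunk as freeable only when the free list still holds it as a single intact node: it->owner() == p ensures the node belongs to that hunk, and it->size() == sz ensures nothing in the hunk is live, since any live sub-allocation would have split or shrunk the node. A toy model of that bookkeeping, with simplified types standing in for the real Node and member containers:

#include <cstddef>
#include <map>
#include <set>

// Simplified stand-ins for CArena's bookkeeping: 'alloc' maps each system
// hunk's base pointer to its size; the free list holds free nodes ordered
// by address, each remembering which hunk (owner) it came from.
struct FreeNode {
    void* block; void* owner; std::size_t size;
    bool operator< (FreeNode const& rhs) const { return block < rhs.block; }
};

std::size_t freeable_memory (std::map<void*, std::size_t> const& alloc,
                             std::set<FreeNode> const& freelist)
{
    std::size_t nbytes = 0;
    for (auto const& [p, sz] : alloc) {
        // Look up the free node starting at the hunk's base address.
        auto it = freelist.find(FreeNode{p, nullptr, 0});
        // Count the hunk only if that single node covers it entirely.
        if (it != freelist.end() && it->owner == p && it->size == sz) {
            nbytes += sz;
        }
    }
    return nbytes;
}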
