Commit 1a59c5e

Tweak CArena Defragmentation Strategy (#4531)
The previous strategy, added in #4451, has a flaw. Suppose a CArena's initial size is small and we have n vectors, each of size x. Now we resize these vectors one by one to size x+y, where y << x. Because the old strategy folded all currently unused hunks into each new system allocation, every resized vector ends up in a new allocation of size 2*x+y. We have doubled the memory usage in the end, because the unused space stranded inside each hunk cannot be combined with the others. In the new strategy, we only attempt to combine allocations when the combined amount is not less than the requested allocation size. We also check the result of the allocation call now. If it fails, we try to free more memory and allocate again.
1 parent: 69ad456
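To make the arithmetic in the commit message concrete, here is a small stand-alone C++ sketch with made-up numbers (n = 8 vectors of x = 100 MB, each grown by y = 1 MB; none of these values come from the PR):

#include <cstdio>

// Illustrative numbers (not from the PR): n vectors of size x, each grown
// to x + y, with a CArena whose initial hunk is too small to matter.
int main ()
{
    const long long MB = 1024*1024;
    const long long n = 8, x = 100*MB, y = 1*MB;

    // Old strategy: each reallocation folds the previously freed hunk (x)
    // into the new system allocation, so every vector ends up in a hunk of
    // roughly 2*x + y with x bytes stranded inside it.
    long long old_footprint = n * (2*x + y);

    // New strategy: the freeable x is less than the requested x + y, so the
    // hunks are not combined; each vector just gets a hunk of x + y.
    long long new_footprint = n * (x + y);

    std::printf("old: %lld MB, new: %lld MB\n", old_footprint/MB, new_footprint/MB);
    return 0;
}

Under the old strategy the x bytes stranded in each 2*x+y hunk can never be merged across hunks; under the new strategy no combining happens here, and the freed hunk simply stays on the free list for reuse.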

File tree: 3 files changed, +82 -17 lines


Src/Base/AMReX_Arena.cpp

Lines changed: 57 additions & 13 deletions
@@ -162,6 +162,10 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-function-const)
     if (arena_info.use_cpu_memory)
     {
         p = std::malloc(nbytes);
+        if (!p) {
+            freeUnused_protected();
+            p = std::malloc(nbytes);
+        }
 #ifndef _WIN32
 #if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic push
@@ -175,28 +179,52 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-function-const)
     }
     else if (arena_info.device_use_hostalloc)
     {
-        AMREX_HIP_OR_CUDA_OR_SYCL(
-            AMREX_HIP_SAFE_CALL (hipHostMalloc(&p, nbytes, hipHostMallocMapped|hipHostMallocNonCoherent));,
-            AMREX_CUDA_SAFE_CALL(cudaHostAlloc(&p, nbytes, cudaHostAllocMapped));,
-            p = sycl::malloc_host(nbytes, Gpu::Device::syclContext()));
+#if defined(AMREX_USE_HIP)
+        auto ret = hipHostMalloc(&p, nbytes, hipHostMallocMapped|hipHostMallocNonCoherent);
+        if (ret != hipSuccess) { p = nullptr; }
+#elif defined(AMREX_USE_CUDA)
+        auto ret = cudaHostAlloc(&p, nbytes, cudaHostAllocMapped);
+        if (ret != cudaSuccess) { p = nullptr; }
+#else
+        p = sycl::malloc_host(nbytes, Gpu::Device::syclContext());
+#endif
+        if (!p) {
+            freeUnused_protected();
+            AMREX_HIP_OR_CUDA_OR_SYCL(
+                AMREX_HIP_SAFE_CALL (hipHostMalloc(&p, nbytes, hipHostMallocMapped|hipHostMallocNonCoherent));,
+                AMREX_CUDA_SAFE_CALL(cudaHostAlloc(&p, nbytes, cudaHostAllocMapped));,
+                p = sycl::malloc_host(nbytes, Gpu::Device::syclContext()));
+        }
     }
     else
     {
         std::size_t free_mem_avail = Gpu::Device::freeMemAvailable();
         if (nbytes >= free_mem_avail) {
             free_mem_avail += freeUnused_protected(); // For CArena, the mutex has already been acquired
-            if (abort_on_out_of_gpu_memory && nbytes >= free_mem_avail) {
+            if (abort_on_out_of_gpu_memory && nbytes >= free_mem_avail && arena_info.device_use_managed_memory) {
                 amrex::Abort("Out of gpu memory. Free: " + std::to_string(free_mem_avail)
                              + " Asked: " + std::to_string(nbytes));
             }
         }

         if (arena_info.device_use_managed_memory)
         {
-            AMREX_HIP_OR_CUDA_OR_SYCL
-                (AMREX_HIP_SAFE_CALL(hipMallocManaged(&p, nbytes));,
-                 AMREX_CUDA_SAFE_CALL(cudaMallocManaged(&p, nbytes));,
-                 p = sycl::malloc_shared(nbytes, Gpu::Device::syclDevice(), Gpu::Device::syclContext()));
+#if defined(AMREX_USE_HIP)
+            auto ret = hipMallocManaged(&p, nbytes);
+            if (ret != hipSuccess) { p = nullptr; }
+#elif defined(AMREX_USE_CUDA)
+            auto ret = cudaMallocManaged(&p, nbytes);
+            if (ret != cudaSuccess) { p = nullptr; }
+#else
+            p = sycl::malloc_shared(nbytes, Gpu::Device::syclDevice(), Gpu::Device::syclContext());
+#endif
+            if (!p) {
+                freeUnused_protected();
+                AMREX_HIP_OR_CUDA_OR_SYCL
+                    (AMREX_HIP_SAFE_CALL(hipMallocManaged(&p, nbytes));,
+                     AMREX_CUDA_SAFE_CALL(cudaMallocManaged(&p, nbytes));,
+                     p = sycl::malloc_shared(nbytes, Gpu::Device::syclDevice(), Gpu::Device::syclContext()));
+            }
 #ifdef AMREX_USE_HIP
             // Otherwise atomicAdd won't work because we instruct the compiler to do unsafe atomics
             AMREX_HIP_SAFE_CALL(hipMemAdvise(p, nbytes, hipMemAdviseSetCoarseGrain,
@@ -214,14 +242,30 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-function-const)
         }
         else
         {
-            AMREX_HIP_OR_CUDA_OR_SYCL
-                (AMREX_HIP_SAFE_CALL ( hipMalloc(&p, nbytes));,
-                 AMREX_CUDA_SAFE_CALL(cudaMalloc(&p, nbytes));,
-                 p = sycl::malloc_device(nbytes, Gpu::Device::syclDevice(), Gpu::Device::syclContext()));
+#if defined(AMREX_USE_HIP)
+            auto ret = hipMalloc(&p, nbytes);
+            if (ret != hipSuccess) { p = nullptr; }
+#elif defined(AMREX_USE_CUDA)
+            auto ret = cudaMalloc(&p, nbytes);
+            if (ret != cudaSuccess) { p = nullptr; }
+#else
+            p = sycl::malloc_device(nbytes, Gpu::Device::syclDevice(), Gpu::Device::syclContext());
+#endif
+            if (!p) {
+                freeUnused_protected();
+                AMREX_HIP_OR_CUDA_OR_SYCL
+                    (AMREX_HIP_SAFE_CALL ( hipMalloc(&p, nbytes));,
+                     AMREX_CUDA_SAFE_CALL(cudaMalloc(&p, nbytes));,
+                     p = sycl::malloc_device(nbytes, Gpu::Device::syclDevice(), Gpu::Device::syclContext()));
+            }
         }
     }
 #else
     p = std::malloc(nbytes);
+    if (!p) {
+        freeUnused_protected();
+        p = std::malloc(nbytes);
+    }
 #ifndef _WIN32
 #if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic push
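Every branch above follows the same shape: the first attempt goes through an error-returning call rather than an aborting SAFE_CALL macro, and a null result triggers freeUnused_protected() followed by one retry through the SAFE_CALL path, so a second failure still aborts. A minimal stand-alone sketch of that shape, with hypothetical names (try_alloc and release_unused stand in for the real allocator hooks):

#include <cstdlib>

// Hypothetical stand-ins for the real hooks: try_alloc models an
// error-returning allocator (malloc, cudaMalloc, ...); release_unused
// models CArena::freeUnused_protected.
void* try_alloc (std::size_t nbytes) { return std::malloc(nbytes); }
std::size_t release_unused () { return 0; /* bytes returned to the system */ }

void* alloc_with_retry (std::size_t nbytes)
{
    void* p = try_alloc(nbytes);   // first attempt: null on failure, no abort
    if (!p) {
        release_unused();          // hand cached-but-unused hunks back
        p = try_alloc(nbytes);     // second attempt; in the commit this retry
                                   // goes through SAFE_CALL, so failure aborts
    }
    return p;
}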

Src/Base/AMReX_CArena.H

Lines changed: 3 additions & 0 deletions
@@ -65,6 +65,9 @@ public:
 
     std::size_t freeUnused () final;
 
+    //! Return the amount of memory that can be freed
+    [[nodiscard]] std::size_t freeableMemory () const;
+
     /**
      * \brief Does the device have enough free memory for allocating this
      * much memory? For CPU builds, this always returns true. This is not a
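The new freeableMemory is a const query: it reports how many bytes a later freeUnused() could hand back to the system, without releasing anything itself. The commit only calls it internally while holding the arena mutex; a caller-side use like the following is purely a hypothetical sketch and would need the same locking care:

#include <AMReX_CArena.H>
#include <cstddef>

// Hypothetical usage: trim the arena only when doing so would recover
// enough memory to be worth the system calls.
void maybe_trim (amrex::CArena& arena, std::size_t needed)
{
    if (arena.freeableMemory() >= needed) {
        arena.freeUnused();   // actually releases the wholly unused hunks
    }
}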

Src/Base/AMReX_CArena.cpp

Lines changed: 22 additions & 4 deletions
@@ -65,11 +65,16 @@ CArena::alloc_protected (std::size_t nbytes)
     if (free_it == m_freelist.end())
     {
         // Both freeUnused_protected and allocate_system may invalidate free_it.
-        // All unused memory allocations are combined with the new one to reduce fragmentation.
-        const auto freed_bytes = (freeunused_called || !arena_info.defragmentation)
-            ? std::size_t(0) : freeUnused_protected();
 
-        const std::size_t N = std::max(m_hunk, freed_bytes + nbytes);
+        std::size_t N = std::max(m_hunk, nbytes);
+
+        if ((!freeunused_called) && arena_info.defragmentation) {
+            auto freeable_nbytes = freeableMemory();
+            if (freeable_nbytes >= N) {
+                freeUnused_protected();
+                N = freeable_nbytes;
+            }
+        }
 
         vp = allocate_system(N);
 
@@ -355,6 +360,19 @@ CArena::freeUnused ()
     return freeUnused_protected();
 }
 
+std::size_t
+CArena::freeableMemory () const
+{
+    std::size_t nbytes = 0;
+    for (auto const& [p, sz] : m_alloc) {
+        auto it = m_freelist.find(Node(p,nullptr,0));
+        if (it != m_freelist.end() && it->owner() == p && it->size() == sz) {
+            nbytes += sz;
+        }
+    }
+    return nbytes;
+}
+
 std::size_t
 CArena::freeUnused_protected ()
 {
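freeableMemory counts a system hunk as freeable only when the free list still holds it as a single intact node: it->owner() == p ensures the node belongs to that hunk, and it->size() == sz ensures nothing in the hunk is live, since any live sub-allocation would have split or shrunk the node. A toy model of that bookkeeping, with simplified types standing in for the real Node and member containers:

#include <cstddef>
#include <map>
#include <set>

// Simplified stand-ins for CArena's bookkeeping: 'alloc' maps each system
// hunk's base pointer to its size; the free list holds free nodes ordered
// by address, each remembering which hunk (owner) it came from.
struct FreeNode {
    void* block; void* owner; std::size_t size;
    bool operator< (FreeNode const& rhs) const { return block < rhs.block; }
};

std::size_t freeable_memory (std::map<void*, std::size_t> const& alloc,
                             std::set<FreeNode> const& freelist)
{
    std::size_t nbytes = 0;
    for (auto const& [p, sz] : alloc) {
        // Look up the free node starting at the hunk's base address.
        auto it = freelist.find(FreeNode{p, nullptr, 0});
        // Count the hunk only if that single node covers it entirely.
        if (it != freelist.end() && it->owner == p && it->size == sz) {
            nbytes += sz;
        }
    }
    return nbytes;
}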
