Skip to content

Refactor: Enhance GPU Memory Leak Test for read_region #874

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: branch-25.06
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,81 @@ def get_used_gpu_memory_mib():

img = open_image_cucim(testimg_tiff_stripe_4096x4096_256_jpeg)

mem_usage_history = [get_used_gpu_memory_mib()]
mem_usage_history = [
get_used_gpu_memory_mib()
] # Memory before loop (image loaded)

for i in range(10):
_ = img.read_region(device="cuda")
num_iterations = 10
warmup_iterations = (
3 # Number of iterations to run before establishing a baseline
)

for i in range(num_iterations):
region_data = img.read_region(device="cuda")
# Explicitly delete the CuPy array
del region_data
# Force CuPy to free unused blocks from its memory pool
cp.get_default_memory_pool().free_all_blocks()
mem_usage_history.append(get_used_gpu_memory_mib())

print(mem_usage_history)
print(f"Full memory usage history (MiB): {mem_usage_history}")

# mem_usage_history[0] is before any read_region calls
# mem_usage_history[k] is after the k-th iteration (read_region, del,
# free_all_blocks)

# Baseline memory after warmup_iterations (e.g., after 3rd iteration)
# Ensure warmup_iterations is less than num_iterations
if warmup_iterations >= num_iterations:
pytest.fail(
"warmup_iterations must be less than num_iterations for this test "
"logic"
)

# Memory after the warmup period (e.g., after 3rd call, so index 3)
mem_after_warmup = mem_usage_history[warmup_iterations]
# Memory after all iterations (e.g., after 10th call, so index 10)
mem_at_end = mem_usage_history[num_iterations]

# The difference in memory usage should be less than 180MB.
# Note: Since we cannot measure GPU memory usage for a process,
# we use a rough number.
# (experimentally measured, assuming that each image load
# consumes around 50MB of GPU memory).
assert mem_usage_history[5] - mem_usage_history[9] < 180.0
# Calculate the increase in memory after the warmup period
memory_increase_after_warmup = mem_at_end - mem_after_warmup

print(
f"Memory after warmup ({warmup_iterations} iterations): "
f"{mem_after_warmup:.2f} MiB"
)
print(f"Memory at end ({num_iterations} iterations): {mem_at_end:.2f} MiB")
print(
f"Memory increase after warmup: {memory_increase_after_warmup:.2f} MiB"
)

# The increase in memory after the warm-up phase and explicit freeing
# should be minimal, ideally close to zero for a perfectly clean operation.
# This threshold (leak_threshold_mib, e.g., 30.0 MiB) defines an acceptable
# upper bound for the *cumulative* memory increase observed over the
# (num_iterations - warmup_iterations) test iterations.
# It accounts for potential minor non-reclaimable memory that might
# accumulate due to factors like fragmentation, persistent driver/runtime
# overheads, or small, consistent allocation patterns within the tested
# function, even with explicit attempts to free memory.
#
# For instance, a 30.0 MiB threshold over 7 active test iterations
# (10 total iterations - 3 warmup iterations) allows for an average of
# roughly 4.3 MiB of such net memory growth per iteration during the
# measurement phase.
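# This approach differs significantly from the previous version of this
# test, which asserted `mem_usage_history[5] - mem_usage_history[9] < 180.0`
# (a 180 MiB threshold on a non-cumulative, earlier-minus-later difference
# between two single samples). That comparison could be affected by
# transient spikes rather than sustained growth, and because sustained
# growth makes the difference negative, it could not flag a steady leak.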
# If the `read_region` operation has a consistent memory leak (i.e., memory
# that is allocated and not freed properly on an ongoing basis), the
# `memory_increase_after_warmup` is expected to exceed this threshold.
leak_threshold_mib = 30.0
assert memory_increase_after_warmup < leak_threshold_mib, (
f"Memory increase ({memory_increase_after_warmup:.2f} MiB) "
f"exceeded threshold ({leak_threshold_mib} MiB) "
f"over {num_iterations - warmup_iterations} iterations after warmup."
)


def test_read_region_cpu_memleak(testimg_tiff_stripe_4096x4096_256):
Expand Down