|
55 | 55 | app = modal.App("eval_from_generations_modal") |
56 | 56 | gpu_arch_mapping = {"L40S": ["Ada"], "H100": ["Hopper"], "A100": ["Ampere"], "L4": ["Ada"], "T4": ["Turing"], "A10G": ["Ampere"]} |
57 | 57 |
|
58 | | -cuda_version = "12.4.0" # should be no greater than host CUDA version |
| 58 | +cuda_version = "12.8.0" # should be no greater than host CUDA version |
59 | 59 | flavor = "devel" # includes full CUDA toolkit |
60 | 60 | operating_sys = "ubuntu22.04" |
61 | 61 | tag = f"{cuda_version}-{flavor}-{operating_sys}" |
|
67 | 67 | "g++-10", |
68 | 68 | "clang" |
69 | 69 | ) |
70 | | - .pip_install( |
71 | | - "numpy", |
72 | | - "packaging", |
73 | | - "pydra_config", |
74 | | - "torch==2.5.0", |
75 | | - "tqdm", |
76 | | - "datasets", |
77 | | - "transformers", |
78 | | - "pytest", |
79 | | - "ninja", |
80 | | - "utils", |
81 | | - "python-dotenv", |
82 | | - ) |
| 70 | + .pip_install_from_requirements(os.path.join(REPO_TOP_DIR, "requirements.txt")) |
83 | 71 | .add_local_dir( |
84 | 72 | KERNEL_BENCH_PATH, |
85 | 73 | remote_path="/root/KernelBench" |
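
The image definition now pulls Python dependencies from the repo's requirements.txt instead of a hand-maintained pip_install list, on top of the newer 12.8.0 CUDA devel base. Below is a minimal sketch of how these pieces fit together, assuming the base image comes from modal.Image.from_registry with add_python (the registry call and the Python version are outside this hunk, so they are assumptions), and that REPO_TOP_DIR and KERNEL_BENCH_PATH are defined earlier in the file:

import os
import modal

cuda_version = "12.8.0"        # should be no greater than the host driver's CUDA version
flavor = "devel"               # devel flavor ships nvcc and the full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"   # -> "12.8.0-devel-ubuntu22.04"

image = (
    modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10")  # Python version assumed
    .apt_install("g++-10", "clang")   # any additional apt packages from the full file are elided here
    # Single source of truth for Python deps; no more drift between this file and requirements.txt
    .pip_install_from_requirements(os.path.join(REPO_TOP_DIR, "requirements.txt"))
    .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench")
)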
@@ -165,17 +153,18 @@ class WorkArgs: |
165 | 153 | # Modal Evaluation Class |
166 | 154 | # GPU must be specified here for all instances |
167 | 155 | # Retries are configured at the class level to handle GPU attachment failures |
168 | | -# @modal.concurrent: Each container handles exactly ONE evaluation at a time - prevents memory leaks |
| 156 | +# scaledown_window=5 kills idle containers after 5 seconds |
| 157 | +# Combined with 10s sleep between batches, this prevents container reuse and GPU corruption spread |
169 | 158 | @app.cls( |
170 | | - image=image, |
| 159 | + image=image, |
171 | 160 | gpu="A10G", |
| 161 | + scaledown_window=5, # Kill idle containers after 5 seconds |
172 | 162 | retries=modal.Retries( |
173 | 163 | max_retries=3, |
174 | 164 | backoff_coefficient=2.0, |
175 | 165 | initial_delay=1.0, |
176 | 166 | ) |
177 | 167 | ) |
178 | | -@modal.concurrent(max_inputs=1) # One input per container - prevents GPU memory leaks |
179 | 168 | class ModalEvaluator: |
180 | 169 |
|
181 | 170 | @modal.method() |
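
Instead of forcing one input per container with @modal.concurrent(max_inputs=1), the class now lets containers die quickly between inputs: scaledown_window=5 reaps an idle container about five seconds after its last call, and the retry policy re-attempts GPU attachment failures with exponentially growing delays. A rough sketch of how a caller sees this, with the method name taken from the hunk below and the argument invented for illustration:

# Retry schedule implied by modal.Retries(max_retries=3, backoff_coefficient=2.0, initial_delay=1.0):
# fail -> wait 1s -> retry, fail -> wait 2s -> retry, fail -> wait 4s -> retry, then the input errors out.
evaluator = ModalEvaluator()
result = evaluator.evaluate_single_sample_modal.remote(work_args)   # work_args is a hypothetical bundle
# Once the container has been idle for ~5s (scaledown_window=5) it is torn down,
# so a later call starts in a fresh process with an uncorrupted GPU.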
@@ -230,11 +219,11 @@ def evaluate_single_sample_modal( |
230 | 219 | backend=backend, |
231 | 220 | precision=get_torch_dtype_from_string(precision), |
232 | 221 | ) |
233 | | - |
234 | | - # Force cleanup and exit to prevent container reuse and memory leaks |
| 222 | + |
| 223 | + # Clean up GPU cache before returning |
235 | 224 | torch.cuda.empty_cache() |
236 | | - |
237 | | - return result # Never reached, but needed for type checking |
| 225 | + |
| 226 | + return result |
238 | 227 |
|
239 | 228 |
|
240 | 229 | def fetch_ref_arch_from_problem_id( |
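
With the one-input-per-container constraint gone, the method no longer treats its return as unreachable; it frees the CUDA caching allocator and hands the result back like an ordinary call. A small sketch of that cleanup step as a standalone helper; the helper name and keyword-argument passing are illustrative only, and a try/finally is one way to make sure the cache is released even if the evaluation raises:

import torch

def run_eval_with_cleanup(eval_fn, **eval_kwargs):
    """Hypothetical helper: run one kernel evaluation, then release the CUDA cache."""
    try:
        return eval_fn(**eval_kwargs)
    finally:
        # Drop cached allocator blocks so a warm container does not start the next
        # evaluation with leftover GPU memory from this one.
        torch.cuda.empty_cache()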
@@ -482,7 +471,8 @@ def batch_eval_modal( |
482 | 471 | evaluator_cls = ModalEvaluator.with_options(gpu=config.gpu) if config.gpu != "A10G" else ModalEvaluator |
483 | 472 |
|
484 | 473 | # Spawn all tasks in parallel |
485 | | - # Each spawn creates a NEW container instance with a GPU |
| 474 | + # Modal assigns these to available containers (may reuse warm containers from previous batches) |
| 475 | + # To prevent GPU corruption spread, we sleep between batches to ensure containers scale down |
486 | 476 | futures = [] |
487 | 477 | for item in work_items: |
488 | 478 | if item is None: |
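
Because warm containers can now be reused within a batch, the fan-out is a plain Modal spawn/collect pattern rather than one container per input. A condensed sketch under the same assumptions as above (the method name and work-item shape are not confirmed by this hunk):

evaluator = evaluator_cls()          # ModalEvaluator, or the with_options(gpu=config.gpu) override
futures = []
for item in work_items:
    if item is None:
        continue
    # spawn() returns a handle immediately; the evaluation runs remotely on an available container
    futures.append(evaluator.evaluate_single_sample_modal.spawn(item))

# Block until every spawned evaluation either finishes or exhausts its 3 retries
results = [future.get() for future in futures]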
@@ -538,7 +528,14 @@ def batch_eval_modal( |
538 | 528 |
|
539 | 529 | print("-" * 128) |
540 | 530 | print(f"[Modal Batch] Evaluation took {end_time - start_time:.2f} seconds") |
541 | | - |
| 531 | + |
| 532 | + # Wait for containers to scale down before next batch |
| 533 | + # This prevents container reuse and GPU corruption from spreading between batches |
| 534 | + if len(total_work) > 0: # Only sleep if there are more batches |
| 535 | + scaledown_wait = 10 # Wait 10 seconds (2x the scaledown_window) to ensure containers are killed |
| 536 | + print(f"[Modal] Waiting {scaledown_wait}s for containers to scale down before next batch...") |
| 537 | + time.sleep(scaledown_wait) |
| 538 | + |
542 | 539 | pbar.update(len(curr_work_batch)) |
543 | 540 |
|
544 | 541 |
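
The 10-second pause is deliberately about twice the scaledown_window, so every container from the finished batch has gone idle and been reaped before the next batch is spawned; skipping the wait after the final batch keeps the tail of the run fast. A compressed sketch of how the batch loop, the wait, and the progress bar line up, with the batch slicing and the per-batch runner (run_one_batch) invented for illustration:

import time

SCALEDOWN_WAIT = 10        # ~2x scaledown_window=5, enough time for idle containers to be killed

while len(total_work) > 0:
    curr_work_batch, total_work = total_work[:batch_size], total_work[batch_size:]
    run_one_batch(curr_work_batch)      # hypothetical wrapper around the spawn/collect loop above

    if len(total_work) > 0:             # only sleep if more batches remain
        print(f"[Modal] Waiting {SCALEDOWN_WAIT}s for containers to scale down before next batch...")
        time.sleep(SCALEDOWN_WAIT)

    pbar.update(len(curr_work_batch))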
|
|