Skip to content

Commit 89544df

Browse files
simonguoziruipythonomar22nathanjpaek
authored
Dependency Upgrades (#84)
* upgrade pytorch version * pip install from reqs * remove tvm * update readme * updating modal image to 12.8 cuda --------- Co-authored-by: pythonomar22 <[email protected]> Co-authored-by: nathanjp <[email protected]>
1 parent 018c599 commit 89544df

File tree

7 files changed

+39
-92
lines changed

7 files changed

+39
-92
lines changed

requirements.txt

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# Frameworks
2-
torch==2.5.0
2+
# we use latest PyTorch stable release
3+
torch==2.9.0
4+
35
# we shall upgrade torch for blackwell when it is stable
46
transformers
57
datasets
@@ -8,7 +10,6 @@ modal
810
# DSLs
911
nvidia-cutlass-dsl
1012
tilelang
11-
apache-tvm
1213

1314
# helper
1415
tqdm
@@ -22,6 +23,7 @@ einops
2223
dotenv
2324
numpy
2425

25-
openai
26+
# use litellm for cloud providers and openai for local
27+
openai
2628
litellm[proxy]
2729

results/timing/README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ This folder contains a set of baseline timing results for the KernelBench proble
66
Since KernelBench measures the speedup between Runtime(reference architecture) and Runtime(LLM-generated architecture), it is important to measure the baseline reference module runtime.
77

88
We have provided a set of baseline results for the KernelBench problems on a variety of hardware as well as various PyTorch configurations.
9-
All baselines are run with PyTorch `2.5.0+cu124` and CUDA `12.4`.
9+
All (current) baselines are run with PyTorch `2.5.0+cu124` and CUDA `12.4`.
10+
11+
Note: we will update it soon with PyTorch `2.9.0` and CUDA `12.8`
1012

1113
For timing, we measure wall clock time. We warm up 3 times and collect runtime statistics for 100 trials.
1214

scripts/eval_from_generations.py

Lines changed: 20 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
app = modal.App("eval_from_generations_modal")
5656
gpu_arch_mapping = {"L40S": ["Ada"], "H100": ["Hopper"], "A100": ["Ampere"], "L4": ["Ada"], "T4": ["Turing"], "A10G": ["Ampere"]}
5757

58-
cuda_version = "12.4.0" # should be no greater than host CUDA version
58+
cuda_version = "12.8.0" # should be no greater than host CUDA version
5959
flavor = "devel" # includes full CUDA toolkit
6060
operating_sys = "ubuntu22.04"
6161
tag = f"{cuda_version}-{flavor}-{operating_sys}"
@@ -67,19 +67,7 @@
6767
"g++-10",
6868
"clang"
6969
)
70-
.pip_install(
71-
"numpy",
72-
"packaging",
73-
"pydra_config",
74-
"torch==2.5.0",
75-
"tqdm",
76-
"datasets",
77-
"transformers",
78-
"pytest",
79-
"ninja",
80-
"utils",
81-
"python-dotenv",
82-
)
70+
.pip_install_from_requirements(os.path.join(REPO_TOP_DIR, "requirements.txt"))
8371
.add_local_dir(
8472
KERNEL_BENCH_PATH,
8573
remote_path="/root/KernelBench"
@@ -165,17 +153,18 @@ class WorkArgs:
165153
# Modal Evaluation Class
166154
# GPU must be specified here for all instances
167155
# Retries are configured at the class level to handle GPU attachment failures
168-
# @modal.concurrent: Each container handles exactly ONE evaluation at a time - prevents memory leaks
156+
# scaledown_window=5 kills idle containers after 5 seconds
157+
# Combined with 10s sleep between batches, this prevents container reuse and GPU corruption spread
169158
@app.cls(
170-
image=image,
159+
image=image,
171160
gpu="A10G",
161+
scaledown_window=5, # Kill idle containers after 5 seconds
172162
retries=modal.Retries(
173163
max_retries=3,
174164
backoff_coefficient=2.0,
175165
initial_delay=1.0,
176166
)
177167
)
178-
@modal.concurrent(max_inputs=1) # One input per container - prevents GPU memory leaks
179168
class ModalEvaluator:
180169

181170
@modal.method()
@@ -230,11 +219,11 @@ def evaluate_single_sample_modal(
230219
backend=backend,
231220
precision=get_torch_dtype_from_string(precision),
232221
)
233-
234-
# Force cleanup and exit to prevent container reuse and memory leaks
222+
223+
# Cleanup GPU cache before returning
235224
torch.cuda.empty_cache()
236-
237-
return result # Never reached, but needed for type checking
225+
226+
return result
238227

239228

240229
def fetch_ref_arch_from_problem_id(
@@ -482,7 +471,8 @@ def batch_eval_modal(
482471
evaluator_cls = ModalEvaluator.with_options(gpu=config.gpu) if config.gpu != "A10G" else ModalEvaluator
483472

484473
# Spawn all tasks in parallel
485-
# Each spawn creates a NEW container instance with a GPU
474+
# Modal assigns these to available containers (may reuse warm containers from previous batches)
475+
# To prevent GPU corruption spread, we sleep between batches to ensure containers scale down
486476
futures = []
487477
for item in work_items:
488478
if item is None:
@@ -538,7 +528,14 @@ def batch_eval_modal(
538528

539529
print("-" * 128)
540530
print(f"[Modal Batch] Evaluation took {end_time - start_time:.2f} seconds")
541-
531+
532+
# Wait for containers to scale down before next batch
533+
# This prevents container reuse and GPU corruption from spreading between batches
534+
if len(total_work) > 0: # Only sleep if there are more batches
535+
scaledown_wait = 10 # Wait 10 seconds (2x the scaledown_window) to ensure containers are killed
536+
print(f"[Modal] Waiting {scaledown_wait}s for containers to scale down before next batch...")
537+
time.sleep(scaledown_wait)
538+
542539
pbar.update(len(curr_work_batch))
543540

544541

scripts/generate_and_eval_single_sample_modal.py

Lines changed: 4 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def verbose_logging(self):
8686
def __repr__(self):
8787
return f"EvalConfig({self.to_dict()})"
8888

89-
cuda_version = "12.4.0" # should be no greater than host CUDA version
89+
cuda_version = "12.8.0" # should be no greater than host CUDA version
9090
flavor = "devel" # includes full CUDA toolkit
9191
operating_sys = "ubuntu22.04"
9292
tag = f"{cuda_version}-{flavor}-{operating_sys}"
@@ -96,29 +96,10 @@ def __repr__(self):
9696
.apt_install("git",
9797
"gcc-10",
9898
"g++-10",
99-
"clang" # note i skip a step
99+
"clang" # note i skip a step
100100
)
101-
.pip_install( # required to build flash-attn
102-
"numpy",
103-
"openai",
104-
"packaging",
105-
"pydra_config",
106-
"torch==2.5.0",
107-
"tqdm",
108-
"datasets",
109-
"transformers",
110-
"pytest",
111-
"ninja",
112-
"utils",
113-
"tilelang",
114-
"apache-tvm",
115-
"python-dotenv",
116-
"nvidia-cutlass-dsl",
117-
"litellm[proxy]", # Unified LLM interface
118-
"einops", # for numerics
119-
120-
)
121-
.add_local_python_source("src")
101+
.pip_install_from_requirements(os.path.join(REPO_TOP_DIR, "requirements.txt"))
102+
.add_local_python_source("src")
122103
)
123104

124105
@app.cls(image=image)

scripts/generate_baseline_time_modal.py

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
batch_size = 10
5656
gpu = "L40S"
5757
timeout = 1800
58-
cuda_version = "12.4.0" # should be no greater than host CUDA version
58+
cuda_version = "12.8.0" # should be no greater than host CUDA version
5959
flavor = "devel" # includes full CUDA toolkit
6060
operating_sys = "ubuntu22.04"
6161
tag = f"{cuda_version}-{flavor}-{operating_sys}"
@@ -65,28 +65,14 @@
6565
.apt_install("git",
6666
"gcc-10",
6767
"g++-10",
68-
"clang" # note i skip a step
68+
"clang" # note i skip a step
6969
)
70-
.pip_install( # required to build flash-attn
71-
# Let's unify these dependencies somewhere
72-
"numpy",
73-
"packaging",
74-
"pydra_config",
75-
"torch==2.5.0",
76-
"tqdm",
77-
"datasets",
78-
"transformers",
79-
"pytest",
80-
"ninja",
81-
"utils",
82-
"einops",
83-
"python-dotenv",
84-
)
70+
.pip_install_from_requirements(os.path.join(REPO_TOP_PATH, "requirements.txt"))
8571
.add_local_dir(
8672
KERNEL_BENCH_PATH,
8773
remote_path="/root/KernelBench"
8874
)
89-
.add_local_python_source("src")
75+
.add_local_python_source("src")
9076
)
9177

9278
def write_batch_to_json(entries_to_write: list, f_path: str):

scripts/run_and_check.py

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -27,29 +27,15 @@
2727
REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
2828
KERNEL_BENCH_PATH = os.path.join(REPO_TOP_PATH, "KernelBench")
2929

30-
cuda_version = "12.4.0"
30+
cuda_version = "12.8.0"
3131
flavor = "devel"
3232
operating_sys = "ubuntu22.04"
3333
tag = f"{cuda_version}-{flavor}-{operating_sys}"
3434

3535
image = (
3636
modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10")
3737
.apt_install("git", "gcc-10", "g++-10", "clang")
38-
.pip_install(
39-
"numpy",
40-
"packaging",
41-
"pydra_config",
42-
"torch==2.5.0",
43-
"tqdm",
44-
"datasets",
45-
"transformers",
46-
"pytest",
47-
"ninja",
48-
"utils",
49-
"einops",
50-
"python-dotenv",
51-
"litellm[proxy]",
52-
)
38+
.pip_install_from_requirements(os.path.join(REPO_TOP_PATH, "requirements.txt"))
5339
.add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench")
5440
.add_local_python_source("src")
5541
.add_local_python_source("scripts")

src/utils.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,14 +71,7 @@ def query_server(
7171
):
7272
"""
7373
Query various sort of LLM inference API providers
74-
Supports:
75-
- OpenAI
76-
- Deepseek
77-
- Together
78-
- Sambanova
79-
- Anthropic
80-
- Gemini / Google AI Studio
81-
- Fireworks (OpenAI compatibility)
74+
Done through liteLLM:
8275
- Local Server (SGLang, vLLM, Tokasaurus)
8376
"""
8477
# Local Server (SGLang, vLLM, Tokasaurus) - special handling

0 commit comments

Comments
 (0)