feat(runs): auto-assign preset GPUs

xraymemory · xraymemory · commit d86a2997a709 · 2026-05-23T14:33:50.000-04:00
diff --git a/README.md b/README.md
@@ -195,7 +195,8 @@ run_experiments full_8gpu --jobs rf3,protenix
 Standalone presets are available for each model/model family: `boltz`,
 `boltz1`, `boltz2`, `boltz2_xrd`, `boltz2_md`, `rf3`, and `protenix`.
 Additional comparison presets include `protenix_dual`, `rf3_protenix`, and RF3
-variants.
+variants. Single-job presets default to `gpu_count = 8`, so on an 8-GPU pod
+they use the whole machine.
 
 Presets live in `experiments/*.toml` in your local checkout and on the pod at
 `/home/dev/workspace/experiments/*.toml`. To modify an experiment, edit or copy
@@ -210,10 +211,15 @@ run_experiments --preset my_rf3
 For one-off changes, use `--set` instead of editing TOML:
 
 ```bash
-run_experiments rf3 --set jobs.rf3.gpus=0,1
+run_experiments rf3 --set jobs.rf3.gpu_count=4
 run_experiments rf3 --set jobs.rf3.args.gradient-weights="0.0 0.01 0.02"
 ```
 
+Presets usually declare `gpu_count = N`, not fixed GPU IDs. The runner assigns
+visible GPUs in job order, so a preset is not tied to particular device IDs;
+if the pod has fewer GPUs than requested, lower `gpu_count` with `--set`. Use
+explicit `gpus = "0,1"` only when you need to pin a job to specific devices.
+
 Defaults: inputs come from `/mnt/diffuse-shared/raw/sampleworks/...`, checkpoints
 from `/mnt/diffuse-shared/raw/checkpoints`, results go to
 `/mnt/diffuse-shared/results/sampleworks/<pod>/<target>/`, and MSA caches go to
diff --git a/experiments/boltz.toml b/experiments/boltz.toml
@@ -18,13 +18,13 @@ align-to-input = true
 [[jobs]]
 name = "boltz2_xrd"
 env = "boltz"
-gpus = "0,1"
+gpu_count = 4
 output_subdir = "boltz2_xrd"
 args = { model = "boltz2", method = "X-RAY DIFFRACTION", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
 
 [[jobs]]
 name = "boltz2_md"
 env = "boltz"
-gpus = "2,3"
+gpu_count = 4
 output_subdir = "boltz2_md"
 args = { model = "boltz2", method = "MD", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
diff --git a/experiments/boltz1.toml b/experiments/boltz1.toml
@@ -22,6 +22,6 @@ align-to-input = true
 [[jobs]]
 name = "boltz1"
 env = "boltz"
-gpus = "0,1"
+gpu_count = 8
 output_subdir = "boltz1"
 args = {}
diff --git a/experiments/boltz2.toml b/experiments/boltz2.toml
@@ -18,13 +18,13 @@ align-to-input = true
 [[jobs]]
 name = "boltz2_xrd"
 env = "boltz"
-gpus = "0,1"
+gpu_count = 4
 output_subdir = "boltz2_xrd"
 args = { model = "boltz2", method = "X-RAY DIFFRACTION", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
 
 [[jobs]]
 name = "boltz2_md"
 env = "boltz"
-gpus = "2,3"
+gpu_count = 4
 output_subdir = "boltz2_md"
 args = { model = "boltz2", method = "MD", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
diff --git a/experiments/boltz2_md.toml b/experiments/boltz2_md.toml
@@ -21,6 +21,6 @@ align-to-input = true
 [[jobs]]
 name = "boltz2_md"
 env = "boltz"
-gpus = "0,1"
+gpu_count = 8
 output_subdir = "boltz2_md"
 args = {}
diff --git a/experiments/boltz2_xrd.toml b/experiments/boltz2_xrd.toml
@@ -21,6 +21,6 @@ align-to-input = true
 [[jobs]]
 name = "boltz2_xrd"
 env = "boltz"
-gpus = "0,1"
+gpu_count = 8
 output_subdir = "boltz2_xrd"
 args = {}
diff --git a/experiments/full_8gpu.toml b/experiments/full_8gpu.toml
@@ -18,27 +18,27 @@ align-to-input = true
 [[jobs]]
 name = "boltz2_xrd"
 env = "boltz"
-gpus = "0,1"
+gpu_count = 2
 output_subdir = "boltz2_xrd"
 args = { model = "boltz2", method = "X-RAY DIFFRACTION", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
 
 [[jobs]]
 name = "boltz2_md"
 env = "boltz"
-gpus = "2,3"
+gpu_count = 2
 output_subdir = "boltz2_md"
 args = { model = "boltz2", method = "MD", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
 
 [[jobs]]
 name = "rf3"
 env = "rf3"
-gpus = "4,5"
+gpu_count = 2
 output_subdir = "rf3"
 args = { model = "rf3", gradient-weights = "0.0 0.005 0.01 0.02 0.035 0.05 0.1" }
 
 [[jobs]]
 name = "protenix"
 env = "protenix"
-gpus = "6,7"
+gpu_count = 2
 output_subdir = "protenix"
 args = { model = "protenix", gradient-weights = "0.0 0.05 0.1 0.2 0.35 0.5" }
diff --git a/experiments/protenix.toml b/experiments/protenix.toml
@@ -20,6 +20,6 @@ align-to-input = true
 [[jobs]]
 name = "protenix"
 env = "protenix"
-gpus = "0,1"
+gpu_count = 8
 output_subdir = "protenix"
 args = {}
diff --git a/experiments/protenix_dual.toml b/experiments/protenix_dual.toml
@@ -22,13 +22,13 @@ align-to-input = true
 [[jobs]]
 name = "protenix_tiny"
 env = "protenix"
-gpus = "2,3"
+gpu_count = 4
 output_subdir = "protenix_tiny"
 args = { model-checkpoint = "${PROTENIX_TINY_CHECKPOINT}" }
 
 [[jobs]]
 name = "protenix_mini"
 env = "protenix"
-gpus = "6,7"
+gpu_count = 4
 output_subdir = "protenix_mini"
 args = { model-checkpoint = "${PROTENIX_MINI_CHECKPOINT}" }
diff --git a/experiments/rf3.toml b/experiments/rf3.toml
@@ -22,6 +22,6 @@ align-to-input = true
 [[jobs]]
 name = "rf3"
 env = "rf3"
-gpus = "0,1"
+gpu_count = 8
 output_subdir = "rf3"
 args = {}
diff --git a/experiments/rf3_partial.toml b/experiments/rf3_partial.toml
@@ -19,6 +19,6 @@ align-to-input = true
 [[jobs]]
 name = "rf3"
 env = "rf3"
-gpus = "4"
+gpu_count = 8
 output_subdir = "rf3"
 args = { model = "rf3", gradient-weights = "0.0 0.005 0.01 0.02 0.035 0.05 0.1", model-checkpoint = "${RF3_CHECKPOINT}" }
diff --git a/experiments/rf3_partial_chiral_off.toml b/experiments/rf3_partial_chiral_off.toml
@@ -21,6 +21,6 @@ disable-chiral-features = true
 [[jobs]]
 name = "rf3"
 env = "rf3"
-gpus = "5"
+gpu_count = 8
 output_subdir = "rf3"
 args = { model = "rf3", gradient-weights = "0.0 0.005 0.01 0.02 0.035 0.05 0.1 0.2 0.35 0.5", model-checkpoint = "${RF3_CHECKPOINT}" }
diff --git a/experiments/rf3_protenix.toml b/experiments/rf3_protenix.toml
@@ -17,13 +17,13 @@ align-to-input = true
 [[jobs]]
 name = "rf3"
 env = "rf3"
-gpus = "0,1,2,3"
+gpu_count = 4
 output_subdir = "rf3"
 args = { model = "rf3", gradient-weights = "0.0 0.01 0.02 0.05 0.1" }
 
 [[jobs]]
 name = "protenix"
 env = "protenix"
-gpus = "4,5,6,7"
+gpu_count = 4
 output_subdir = "protenix"
 args = { model = "protenix", partial-diffusion-step = 120, gradient-weights = "0.0 0.1 0.2 0.5" }
diff --git a/src/sampleworks/runs/cli.py b/src/sampleworks/runs/cli.py
@@ -110,7 +110,7 @@ def _build_parser() -> argparse.ArgumentParser:
             "Override a value in the loaded preset. Examples: "
             "--set defaults.DATA_DIR=/data/foo, "
             "--set jobs.rf3.args.gradient-weights='0.0 0.01', "
-            "--set jobs.0.gpus=5"
+            "--set jobs.0.gpu_count=4"
         ),
     )
     parser.add_argument(
@@ -224,7 +224,10 @@ def _print_show(preset: Preset) -> None:
     for j in preset.jobs:
         print(f"  - name: {j.name}")
         print(f"    env: {j.env}")
-        print(f"    gpus: {j.gpus}")
+        if j.gpus:
+            print(f"    gpus: {j.gpus}")
+        else:
+            print(f"    gpu_count: {j.gpu_count}")
         print(f"    output_subdir: {j.output_subdir}")
         print("    args:")
         for k, v in j.args.items():
diff --git a/src/sampleworks/runs/loader.py b/src/sampleworks/runs/loader.py
@@ -501,8 +501,9 @@ def _build_preset(*, name: str, raw: dict[str, Any]) -> Preset:
         Job(
             name=str(j["name"]),
             env=str(j["env"]),
-            gpus=str(j["gpus"]),
             output_subdir=str(j["output_subdir"]),
+            gpus=str(j.get("gpus", "")),
+            gpu_count=_optional_int(j.get("gpu_count")),
             args=dict(j.get("args", {})),
         )
         for j in raw_jobs
@@ -514,3 +515,8 @@ def _build_preset(*, name: str, raw: dict[str, Any]) -> Preset:
         shared_args=dict(raw.get("shared_args", {})),
         jobs=jobs,
     )
+
+
+def _optional_int(value: Any) -> int | None:
+    """Coerce a present ``value`` with ``int()``; pass ``None`` through untouched."""
+    return int(value) if value is not None else None
diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py
@@ -34,6 +34,9 @@ class JobInvocation:
         by ``run_grid_search.py``.
     env : dict of str to str
         Process environment, including ``CUDA_VISIBLE_DEVICES``.
+    gpus : str
+        Resolved CUDA-visible GPU assignment. For jobs that declare
+        ``gpu_count``, this is the concrete auto-assigned GPU list.
     log_path : Path
         File to tee stdout+stderr into.
     output_dir : Path
@@ -44,6 +47,7 @@ class JobInvocation:
     job: Job
     argv: list[str]
     env: dict[str, str]
+    gpus: str
     log_path: Path
     output_dir: Path
 
@@ -67,20 +71,82 @@ def build_invocations(preset: Preset, *, results_dir: Path) -> list[JobInvocatio
     list of JobInvocation
         One :class:`JobInvocation` per job, in declaration order.
     """
+    gpu_assignments = _resolve_gpu_assignments(preset.jobs)
     invocations: list[JobInvocation] = []
     for job in preset.jobs:
         args = preset.effective_args(job)
         args.setdefault("output-dir", str(results_dir / job.output_subdir))
         argv = _build_argv(job.env, args)
-        env = _job_env(job.env, {**os.environ, "CUDA_VISIBLE_DEVICES": job.gpus})
+        gpus = gpu_assignments[job.name]
+        env = _job_env(job.env, {**os.environ, "CUDA_VISIBLE_DEVICES": gpus})
         log_path = results_dir / f"{job.name}_run.log"
         output_dir = Path(args["output-dir"])
         invocations.append(
-            JobInvocation(job=job, argv=argv, env=env, log_path=log_path, output_dir=output_dir)
+            JobInvocation(
+                job=job,
+                argv=argv,
+                env=env,
+                gpus=gpus,
+                log_path=log_path,
+                output_dir=output_dir,
+            )
         )
     return invocations
 
 
+def _resolve_gpu_assignments(jobs: list[Job]) -> dict[str, str]:
+    """Resolve explicit ``gpus`` and automatic ``gpu_count`` declarations.
+
+    Returns a job-name -> comma-separated GPU string map. Explicit ``gpus`` are
+    kept verbatim and reserve their tokens; ``gpu_count`` is ignored on such
+    jobs. Other jobs take unreserved visible GPU IDs in declaration order, or
+    synthetic ordinals when discovery finds none. Raises RuntimeError if short.
+    """
+    reserved = {gpu for job in jobs if job.gpus for gpu in _split_gpu_list(job.gpus)}
+    # Pinned jobs never draw from the pool, so they must not count as demand.
+    total_auto = sum(job.gpu_count or 0 for job in jobs if not job.gpus)
+    available = _detect_available_gpus()
+    if available:
+        pool = [gpu for gpu in available if gpu not in reserved]
+        if len(pool) < total_auto:
+            raise RuntimeError(
+                "Not enough visible GPUs for preset auto-assignment. "
+                f"Visible GPUs: {available}. Reserved GPUs: {sorted(reserved)}. "
+                f"Auto-requested GPUs: {total_auto}."
+            )
+    elif _cuda_visible_devices_disables_gpus() and total_auto:
+        raise RuntimeError(
+            "CUDA_VISIBLE_DEVICES disables GPU access, so gpu_count auto-assignment "
+            "cannot allocate any GPUs."
+        )
+    else:
+        pool = _synthetic_gpu_pool(reserved, total_auto)
+
+    assignments: dict[str, str] = {}
+    cursor = 0
+    for job in jobs:
+        if job.gpus:
+            assignments[job.name] = job.gpus
+            continue
+        count = job.gpu_count or 0
+        # A job with neither field set resolves to an empty assignment string.
+        assignments[job.name] = ",".join(pool[cursor : cursor + count])
+        cursor += count
+    return assignments
+
+
+def _synthetic_gpu_pool(reserved: set[str], count: int) -> list[str]:
+    """Build ``count`` stand-in CUDA ordinals ("0", "1", ...) that skip ``reserved``."""
+    ordinals: list[str] = []
+    index = 0
+    while len(ordinals) < count:
+        # Walk upward from zero, keeping only ordinals no explicit job claimed.
+        if str(index) not in reserved:
+            ordinals.append(str(index))
+        index += 1
+    return ordinals
+
+
 def _split_gpu_list(value: str) -> list[str]:
     """Split a comma-separated GPU assignment into normalized tokens.
 
@@ -123,12 +189,10 @@ def _detect_available_gpus() -> list[str]:
         available. Empty means validation should be skipped.
     """
     cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
-    if cuda_visible and cuda_visible.lower() not in {
-        "all",
-        "none",
-        "void",
-        "nodevfiles",
-    }:
+    cuda_visible_key = cuda_visible.lower()
+    if _cuda_visible_devices_disables_gpus():
+        return []
+    if cuda_visible and cuda_visible_key != "all":
         return _split_gpu_list(cuda_visible)
 
     try:
@@ -145,6 +209,15 @@ def _detect_available_gpus() -> list[str]:
     return [line.strip() for line in result.stdout.splitlines() if line.strip()]
 
 
+def _cuda_visible_devices_disables_gpus() -> bool:
+    """Report whether ``CUDA_VISIBLE_DEVICES`` is a sentinel that hides every GPU."""
+    raw = os.environ.get("CUDA_VISIBLE_DEVICES", "")
+    normalized = raw.strip().lower()
+    # Matching ignores case and surrounding whitespace; an unset or empty
+    # variable is not treated as disabling GPUs.
+    return normalized in {"none", "void", "nodevfiles"}
+
+
 def _validate_gpu_assignments(invocations: list[JobInvocation]) -> None:
     """Fail fast when a preset asks for GPUs not present in this pod.
 
@@ -166,7 +239,7 @@ def _validate_gpu_assignments(invocations: list[JobInvocation]) -> None:
 
     requested: dict[str, list[str]] = {}
     for inv in invocations:
-        for gpu in _split_gpu_list(inv.job.gpus):
+        for gpu in _split_gpu_list(inv.gpus):
             requested.setdefault(gpu, []).append(inv.job.name)
 
     requested_tokens = list(requested)
@@ -183,7 +256,7 @@ def _validate_gpu_assignments(invocations: list[JobInvocation]) -> None:
         raise RuntimeError(
             "Preset requests GPUs that are not visible in this pod. "
             f"Visible GPUs: {', '.join(available)}. {details}. "
-            "Edit the preset's jobs.*.gpus values or run a smaller --jobs subset."
+            "Edit the preset's jobs.*.gpus/gpu_count values or run a smaller --jobs subset."
         )
 
     allow_oversubscription = os.environ.get(
@@ -517,9 +590,9 @@ def _print_dry_run(inv: JobInvocation) -> None:
     inv : JobInvocation
         Invocation to print.
     """
-    print(f"# job: {inv.job.name}  (env={inv.job.env}, gpus={inv.job.gpus})", file=sys.stderr)
+    print(f"# job: {inv.job.name}  (env={inv.job.env}, gpus={inv.gpus})", file=sys.stderr)
     print(f"# log: {inv.log_path}", file=sys.stderr)
-    print(f"CUDA_VISIBLE_DEVICES={inv.job.gpus} {_shell_join(inv.argv)}")
+    print(f"CUDA_VISIBLE_DEVICES={inv.gpus} {_shell_join(inv.argv)}")
     print(file=sys.stderr)
 
 
@@ -540,7 +613,7 @@ def _print_launch_summary(preset: Preset, invocations: list[JobInvocation]) -> N
         print(f"  {preset.description}", file=sys.stderr)
     for inv in invocations:
         print(
-            f"  - {inv.job.name}: env={inv.job.env}, gpus={inv.job.gpus}, log={inv.log_path}",
+            f"  - {inv.job.name}: env={inv.job.env}, gpus={inv.gpus}, log={inv.log_path}",
             file=sys.stderr,
         )
     print(bar, file=sys.stderr)
diff --git a/src/sampleworks/runs/schema.py b/src/sampleworks/runs/schema.py
diff --git a/tests/runs/test_cli.py b/tests/runs/test_cli.py
diff --git a/tests/runs/test_runner.py b/tests/runs/test_runner.py