fix(runs): address lint and runner review issues

xraymemory · xraymemory · commit bb65db4a753a · 2026-05-23T14:04:23.000-04:00
diff --git a/Dockerfile b/Dockerfile
@@ -105,7 +105,7 @@ COPY docker-entrypoint.sh /usr/local/bin/entrypoint.sh
 RUN chmod +x /usr/local/bin/entrypoint.sh
 
 # ============================================================================
-# Bake in model checkpoints from pre-built base image on Docker Hub
+# Bake in model checkpoints from pre-built Harbor image
 # ============================================================================
 # Checkpoints (~10 GB) rarely change, so this layer is placed before pixi
 # installs to stay cached even when dependencies update.
diff --git a/run_experiments b/run_experiments
@@ -39,6 +39,13 @@ find_sampleworks_root_upwards() {
 resolve_repo_root() {
     local source_override="${SAMPLEWORKS_SOURCE_DIR:-}"
     if [[ -n "$source_override" ]]; then
+        if ! is_sampleworks_root "$source_override"; then
+            cat >&2 <<EOF
+SAMPLEWORKS_SOURCE_DIR does not point to a Sampleworks checkout:
+  $source_override
+EOF
+            return 2
+        fi
         printf '%s\n' "$source_override"
         return 0
     fi
@@ -191,8 +198,17 @@ for checkpoint_var_and_file in \
     fi
 done
 
+needs_runtime_paths=1
+for arg in "$@"; do
+    case "$arg" in
+        --dry-run|--show|--list|-h|--help)
+            needs_runtime_paths=0
+            ;;
+    esac
+done
+
 source_proteins_csv="${PROTEINS_CSV:-$DATA_DIR/proteins.csv}"
-if [[ -f "$source_proteins_csv" ]]; then
+if [[ "$needs_runtime_paths" -eq 1 && -f "$source_proteins_csv" ]]; then
     # The shared proteins.csv currently contains absolute /data/inputs paths,
     # while ACTL mounts the dataset at /mnt/diffuse-shared. Rewrite a per-run
     # manifest instead of requiring non-root scientists to create /data symlinks.
@@ -227,15 +243,6 @@ if [[ -n "$explicit_jobs" ]]; then
     display_target="$display_target --jobs $explicit_jobs"
 fi
 
-needs_runtime_paths=1
-for arg in "$@"; do
-    case "$arg" in
-        --dry-run|--show|--list|-h|--help)
-            needs_runtime_paths=0
-            ;;
-    esac
-done
-
 if [[ "$needs_runtime_paths" -eq 1 ]]; then
     if [[ ! -f "${PROTEINS_CSV:-$source_proteins_csv}" ]]; then
         cat >&2 <<EOF
diff --git a/run_grid_search.py b/run_grid_search.py
@@ -72,11 +72,15 @@ def detect_gpus() -> list[str]:
     """Return CUDA GPU identifiers visible to this grid-search process.
 
     ``CUDA_VISIBLE_DEVICES`` wins when set because CUDA remaps those entries to
-    local process ordinals. Otherwise, ``nvidia-smi`` is used as a best-effort
-    discovery mechanism and ``["0"]`` is returned as a CPU/test fallback.
+    local process ordinals. The "no device" sentinels ``none``, ``void``, and
+    ``nodevfiles`` (case-insensitive) return an empty list. Otherwise, including
+    for ``all``, ``nvidia-smi`` discovery is best-effort and ``["0"]`` is the CPU/test fallback.
     """
-    cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
-    if cuda_visible:
+    cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
+    cuda_visible_key = cuda_visible.lower()
+    if cuda_visible_key in {"none", "void", "nodevfiles"}:
+        return []
+    if cuda_visible and cuda_visible_key != "all":
         gpus = [g.strip() for g in cuda_visible.split(",") if g.strip()]
         try:
             result = subprocess.run(
@@ -85,9 +89,7 @@ def detect_gpus() -> list[str]:
                 text=True,
             )
             if result.returncode == 0:
-                visible = [
-                    g.strip() for g in result.stdout.strip().split("\n") if g.strip()
-                ]
+                visible = [g.strip() for g in result.stdout.strip().split("\n") if g.strip()]
                 if all(g.isdigit() for g in gpus + visible):
                     missing = sorted(set(gpus).difference(visible), key=int)
                     if missing:
@@ -361,6 +363,10 @@ def main(args: argparse.Namespace):
     log.info(f"Detected {len(gpus)} GPUs: {gpus}")
     if args.max_parallel != "auto":
         gpus = gpus[: int(args.max_parallel)]
+    if not gpus:
+        raise ValueError(
+            "No CUDA GPUs are visible; unset CUDA_VISIBLE_DEVICES=none or use a GPU pod"
+        )
 
     log_args(args, gpus)
 
diff --git a/src/sampleworks/runs/cli.py b/src/sampleworks/runs/cli.py
@@ -84,7 +84,11 @@ def _build_parser() -> argparse.ArgumentParser:
         default="",
         help="Preset name from experiments/ or path to a .toml file. Default: full_8gpu.",
     )
-    parser.add_argument("--list", action="store_true", help="List experiments/*.toml presets and exit")
+    parser.add_argument(
+        "--list",
+        action="store_true",
+        help="List experiments/*.toml presets and exit",
+    )
     parser.add_argument("--show", action="store_true", help="Print the resolved preset and exit")
     parser.add_argument(
         "--dry-run",
diff --git a/src/sampleworks/runs/loader.py b/src/sampleworks/runs/loader.py
@@ -1,9 +1,11 @@
 """Load presets from TOML and apply runtime overrides.
 
 Resolution order for every string value (defaults block and ``args``):
-  1. ``${VAR}`` references are resolved against the process environment,
+  1. ``--set <dotted-path>=<value>`` CLI overrides are applied to the raw TOML
+     dict by :func:`load_preset`, so overridden values participate in
+     interpolation.
+  2. ``${VAR}`` references are resolved against the process environment,
      with the preset's ``[defaults]`` block filling in any unset keys.
-  2. ``--set <dotted-path>=<value>`` CLI overrides are applied last.
 """
 
 from __future__ import annotations
@@ -19,6 +21,7 @@
 
 
 _EXPERIMENTS_DIR_NAME = "experiments"
+_MAX_EXPAND_ITERATIONS = 32
 _VAR_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
 _TOP_LEVEL_KEYS = frozenset({"description", "defaults", "shared_args", "jobs"})
 
@@ -325,10 +328,15 @@ def _find_in_list(items: list[Any], key: str, *, where: str) -> int:
     Raises
     ------
     KeyError
-        If no element with the given name exists.
+        If no element with the given name or index exists.
     """
     if key.isdigit() or (key.startswith("-") and key[1:].isdigit()):
-        return int(key)
+        index = int(key)
+        try:
+            items[index]
+        except IndexError:
+            raise KeyError(f"No list element at index {index} at {where!r}") from None
+        return index
     for i, item in enumerate(items):
         if isinstance(item, dict) and item.get("name") == key:
             return i
@@ -442,6 +450,8 @@ def _expand(text: str, env: dict[str, str]) -> str:
     ------
     KeyError
         If a referenced variable is not in ``env``.
+    ValueError
+        If recursive variable interpolation does not converge.
     """
 
     def repl(match: re.Match[str]) -> str:
@@ -451,12 +461,16 @@ def repl(match: re.Match[str]) -> str:
             raise KeyError(f"Undefined variable ${{{var}}} in preset (no env var, no default)")
         return env[var]
 
-    prev = None
     current = text
-    while prev != current:
-        prev = current
-        current = _VAR_PATTERN.sub(repl, current)
-    return current
+    for _ in range(_MAX_EXPAND_ITERATIONS):
+        expanded = _VAR_PATTERN.sub(repl, current)
+        if expanded == current:
+            return expanded
+        current = expanded
+    raise ValueError(
+        f"Variable expansion did not converge for {text!r}; check for circular "
+        "${VAR} references in [defaults], environment variables, or --set overrides."
+    )
 
 
 def _build_preset(*, name: str, raw: dict[str, Any]) -> Preset:
diff --git a/src/sampleworks/runs/runner.py b/src/sampleworks/runs/runner.py
@@ -17,6 +17,8 @@
 
 DEFAULT_GRID_SEARCH_SCRIPT = "/app/run_grid_search.py"
 WORKSPACE_GRID_SEARCH_SCRIPT = "/home/dev/workspace/run_grid_search.py"
+PROCESS_SHUTDOWN_TIMEOUT_SECONDS = 10
+TEE_THREAD_JOIN_TIMEOUT_SECONDS = 5
 
 
 @dataclass(frozen=True)
@@ -172,9 +174,7 @@ def _validate_gpu_assignments(invocations: list[JobInvocation]) -> None:
         return
 
     available_set = set(available)
-    unavailable = {
-        gpu: names for gpu, names in requested.items() if gpu not in available_set
-    }
+    unavailable = {gpu: names for gpu, names in requested.items() if gpu not in available_set}
     if unavailable:
         details = ", ".join(
             f"GPU {gpu} requested by {', '.join(names)}"
@@ -413,6 +413,7 @@ def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int:
     int
         ``0`` if all jobs exited 0 (or ``dry_run`` was set), ``1`` otherwise.
     """
+    results_dir = results_dir.resolve()
     results_dir.mkdir(parents=True, exist_ok=True)
     invocations = build_invocations(preset, results_dir=results_dir)
     _validate_gpu_assignments(invocations)
@@ -444,23 +445,33 @@ def _terminate_all(jobs: list[_RunningJob]) -> None:
     Parameters
     ----------
     jobs : list of _RunningJob
-        Jobs whose subprocesses should be SIGTERM'd, waited on, and whose tee
-        threads should be joined.
+        Jobs whose subprocesses should be SIGTERM'd, escalated to SIGKILL if
+        needed, and whose tee threads should be joined with bounded waits.
     """
     for j in jobs:
         if j.proc.poll() is None:
             j.proc.terminate()
     for j in jobs:
-        j.proc.wait()
-        j.tee_thread.join()
+        try:
+            j.proc.wait(timeout=PROCESS_SHUTDOWN_TIMEOUT_SECONDS)
+        except subprocess.TimeoutExpired:
+            j.proc.kill()
+            try:
+                j.proc.wait(timeout=PROCESS_SHUTDOWN_TIMEOUT_SECONDS)
+            except subprocess.TimeoutExpired:
+                print(
+                    f"[{_ts()}] {j.inv.job.name} did not exit after SIGKILL",
+                    file=sys.stderr,
+                )
+        j.tee_thread.join(timeout=TEE_THREAD_JOIN_TIMEOUT_SECONDS)
 
 
 def _prepare_pixi_env(pixi_env: str) -> None:
     """Prepare a pixi environment before parallel job launch.
 
-    ``pixi run`` is deliberately called once per env even when the interpreter
-    directory already exists, because pixi may still need to materialize PyPI
-    packages into that environment after image startup.
+    Preparation is skipped when a baked interpreter is already available, when
+    prebuilt environments are required, or when ``SAMPLEWORKS_SKIP_ENV_PREPARE``
+    is truthy. Otherwise, ``pixi run`` is called once for the environment.
 
     Parameters
     ----------
@@ -576,24 +587,39 @@ def _spawn(inv: JobInvocation) -> _RunningJob:
     inv.log_path.parent.mkdir(parents=True, exist_ok=True)
     inv.output_dir.mkdir(parents=True, exist_ok=True)
     log_file = open(inv.log_path, "wb")
+    proc: subprocess.Popen[bytes] | None = None
+    thread: threading.Thread | None = None
     try:
         proc = subprocess.Popen(
             inv.argv,
             env=inv.env,
+            cwd=str(_pixi_project_dir()),
             stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT,
             bufsize=0,
         )
+        if proc.stdout is None:
+            raise RuntimeError(f"Job {inv.job.name!r} started without a stdout pipe")
+        thread = threading.Thread(
+            target=_tee,
+            args=(inv.job.name, proc.stdout, log_file),
+            daemon=True,
+        )
+        thread.start()
     except BaseException:
         log_file.close()
+        if proc is not None and proc.poll() is None:
+            proc.kill()
+            try:
+                proc.wait(timeout=PROCESS_SHUTDOWN_TIMEOUT_SECONDS)
+            except subprocess.TimeoutExpired:
+                print(
+                    f"[{_ts()}] {inv.job.name} did not exit after failed spawn cleanup",
+                    file=sys.stderr,
+                )
         raise
-    assert proc.stdout is not None
-    thread = threading.Thread(
-        target=_tee,
-        args=(inv.job.name, proc.stdout, log_file),
-        daemon=True,
-    )
-    thread.start()
+    if proc is None or thread is None:
+        raise RuntimeError(f"Job {inv.job.name!r} failed to initialize")
     print(f"[{_ts()}] launched {inv.job.name} (pid {proc.pid})", file=sys.stderr)
     return _RunningJob(inv=inv, proc=proc, tee_thread=thread)
 
diff --git a/src/sampleworks/runs/schema.py b/src/sampleworks/runs/schema.py
@@ -1,8 +1,9 @@
 """Dataclasses for the preset schema.
 
 A preset describes one or more parallel ``run_grid_search.py`` jobs. Each job
-is launched in its configured model environment with ``CUDA_VISIBLE_DEVICES``
-set to the job's GPU assignment.
+runs in its configured model environment, either through ``pixi run`` or a
+baked environment Python, with ``CUDA_VISIBLE_DEVICES`` set to the job's GPU
+assignment.
 """
 
 from __future__ import annotations
diff --git a/tests/runs/conftest.py b/tests/runs/conftest.py
@@ -2,10 +2,17 @@
 
 from __future__ import annotations
 
+import os
+
 import pytest
 
 
 @pytest.fixture(autouse=True)
 def force_pixi_argv(monkeypatch: pytest.MonkeyPatch) -> None:
     """Keep argv assertions deterministic on machines with /app/.pixi present."""
+    monkeypatch.delenv("SAMPLEWORKS_GRID_SEARCH_SCRIPT", raising=False)
+    monkeypatch.delenv("SAMPLEWORKS_PIXI_PROJECT_DIR", raising=False)
+    for var in list(os.environ):
+        if var.startswith("SAMPLEWORKS_") and var.endswith("_PYTHON"):
+            monkeypatch.delenv(var, raising=False)
     monkeypatch.setenv("SAMPLEWORKS_FORCE_PIXI", "1")
diff --git a/tests/runs/test_cli.py b/tests/runs/test_cli.py
@@ -39,13 +39,15 @@ def test_dry_run_does_not_invoke_subprocess(
 ) -> None:
     """``--dry-run`` prints commands and CUDA assignment instead of executing."""
     monkeypatch.setenv("HOME", str(tmp_path))
-    exit_code = cli.main([
-        "--preset",
-        "rf3_partial",
-        "--dry-run",
-        "--results-dir",
-        str(tmp_path),
-    ])
+    exit_code = cli.main(
+        [
+            "--preset",
+            "rf3_partial",
+            "--dry-run",
+            "--results-dir",
+            str(tmp_path),
+        ]
+    )
     assert exit_code == 0
     out = capsys.readouterr().out
     assert "pixi run -e rf3 python /app/run_grid_search.py" in out
diff --git a/tests/runs/test_loader.py b/tests/runs/test_loader.py
@@ -183,6 +183,27 @@ def test_set_with_unknown_top_level_key_raises(monkeypatch: pytest.MonkeyPatch)
         loader.load_preset("rf3_partial", overrides=["job.rf3.gpus=0"])
 
 
+def test_set_with_out_of_range_job_index_raises(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Out-of-range list indices in overrides fail with a clear ``KeyError``."""
+    monkeypatch.setenv("HOME", "/home/test")
+    with pytest.raises(KeyError, match="index 99"):
+        loader.load_preset("rf3_partial", overrides=["jobs.99.gpus=0"])
+
+
+def test_cyclic_variable_expansion_raises(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    """Cyclic ``${VAR}`` references fail fast instead of looping forever."""
+    bad = tmp_path / "cycle.toml"
+    bad.write_text(
+        "[shared_args]\n"
+        'proteins = "${A}"\n'
+        '[[jobs]]\nname = "j"\nenv = "rf3"\ngpus = "0"\noutput_subdir = "j"\nargs = {}\n'
+    )
+    monkeypatch.setenv("A", "${B}")
+    monkeypatch.setenv("B", "${A}")
+    with pytest.raises(ValueError, match="did not converge"):
+        loader.load_preset(str(bad))
+
+
 def test_bad_env_rejected(tmp_path: Path) -> None:
     """Preset jobs reject unsupported pixi environment names."""
     bad = tmp_path / "bad.toml"