Skip to content

Commit d5d065a

Browse files
mag-asteraclaude
andcommitted
fix(runs): address CodeRabbit review findings
- cli.py: `--only` filter now preserves `shared_args`; previously the filtered Preset dropped them, silently losing every job's shared flags. - loader.py: `--set` now rejects unknown top-level keys. A typo like `--set job.rf3.gpus=0` (note missing 's') used to auto-create an unused `job` dict and silently no-op; now it raises KeyError with the valid keys listed. - runner.py: handle partial spawn failures. If one job fails to spawn midway, already-launched jobs are terminated and joined instead of orphaned. Also wraps Popen in try/close to avoid log-file handle leak if the subprocess fails to start. - presets/rf3_partial_chiral_off.toml: change `output_subdir = "."` (write to RESULTS_DIR root) to `"rf3"` (subdir under RESULTS_DIR) for consistency with the other RF3 presets and collision safety when RESULTS_DIR is overridden to a shared location. Skipped (faithful to bash original): rf3_protenix.toml asymmetric partial-diffusion-step — the source `run_rf3_protenix_mdc_actl.sh` deliberately set it only for the Protenix job, not RF3. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent c3f0d12 commit d5d065a

5 files changed

Lines changed: 45 additions & 9 deletions

File tree

src/sampleworks/runs/cli.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ def _filter_only(preset: Preset, only: str) -> Preset:
8787
name=preset.name,
8888
description=preset.description,
8989
defaults=preset.defaults,
90+
shared_args=preset.shared_args,
9091
jobs=keep,
9192
)
9293

src/sampleworks/runs/loader.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,17 @@ def _apply_overrides(raw: dict[str, Any], overrides: list[str]) -> dict[str, Any
6464
return raw
6565

6666

67+
_TOP_LEVEL_KEYS = frozenset({"description", "defaults", "shared_args", "jobs"})
68+
69+
6770
def _set_dotted(obj: dict[str, Any], dotted: str, value: Any) -> None:
6871
"""Set ``obj`` at ``a.b.c`` to ``value``. Job name lookup is allowed under ``jobs``."""
6972
parts = dotted.split(".")
73+
if parts[0] not in _TOP_LEVEL_KEYS:
74+
raise KeyError(
75+
f"--set: unknown top-level key {parts[0]!r} in {dotted!r}. "
76+
f"Valid top-level keys: {sorted(_TOP_LEVEL_KEYS)}"
77+
)
7078
cursor: Any = obj
7179
for i, part in enumerate(parts[:-1]):
7280
cursor = _index(cursor, part, where=".".join(parts[: i + 1]))

src/sampleworks/runs/presets/rf3_partial_chiral_off.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,5 @@ disable-chiral-features = true
2222
name = "rf3"
2323
env = "rf3"
2424
gpus = "5"
25-
output_subdir = "."
25+
output_subdir = "rf3"
2626
args = { model = "rf3", gradient-weights = "0.0 0.005 0.01 0.02 0.035 0.05 0.1 0.2 0.35 0.5", model-checkpoint = "${RF3_CHECKPOINT}" }

src/sampleworks/runs/runner.py

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,26 @@ def run(preset: Preset, *, results_dir: Path, dry_run: bool = False) -> int:
6464
return 0
6565

6666
_print_launch_summary(preset, invocations)
67-
processes = [_spawn(inv) for inv in invocations]
67+
processes: list[_RunningJob] = []
68+
try:
69+
for inv in invocations:
70+
processes.append(_spawn(inv))
71+
except BaseException:
72+
_terminate_all(processes)
73+
raise
6874
return _wait_all(processes)
6975

7076

77+
def _terminate_all(jobs: list[_RunningJob]) -> None:
78+
"""Terminate any already-launched jobs (used when a later spawn fails)."""
79+
for j in jobs:
80+
if j.proc.poll() is None:
81+
j.proc.terminate()
82+
for j in jobs:
83+
j.proc.wait()
84+
j.tee_thread.join()
85+
86+
7187
def _print_dry_run(inv: JobInvocation) -> None:
7288
print(f"# job: {inv.job.name} (env={inv.job.env}, gpus={inv.job.gpus})", file=sys.stderr)
7389
print(f"# log: {inv.log_path}", file=sys.stderr)
@@ -99,13 +115,17 @@ class _RunningJob:
99115
def _spawn(inv: JobInvocation) -> _RunningJob:
100116
inv.log_path.parent.mkdir(parents=True, exist_ok=True)
101117
log_file = open(inv.log_path, "wb")
102-
proc = subprocess.Popen(
103-
inv.argv,
104-
env=inv.env,
105-
stdout=subprocess.PIPE,
106-
stderr=subprocess.STDOUT,
107-
bufsize=0,
108-
)
118+
try:
119+
proc = subprocess.Popen(
120+
inv.argv,
121+
env=inv.env,
122+
stdout=subprocess.PIPE,
123+
stderr=subprocess.STDOUT,
124+
bufsize=0,
125+
)
126+
except BaseException:
127+
log_file.close()
128+
raise
109129
assert proc.stdout is not None
110130
thread = threading.Thread(
111131
target=_tee,

tests/runs/test_loader.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,13 @@ def test_set_without_equals_raises(monkeypatch: pytest.MonkeyPatch) -> None:
130130
loader.load_preset("rf3_partial", overrides=["bogus_no_equals"])
131131

132132

133+
def test_set_with_unknown_top_level_key_raises(monkeypatch: pytest.MonkeyPatch) -> None:
134+
"""Typos like ``--set job.rf3.gpus=0`` (missing 's' in jobs) must not silently no-op."""
135+
monkeypatch.setenv("HOME", "/home/test")
136+
with pytest.raises(KeyError, match="unknown top-level key"):
137+
loader.load_preset("rf3_partial", overrides=["job.rf3.gpus=0"])
138+
139+
133140
def test_bad_env_rejected(tmp_path: Path) -> None:
134141
bad = tmp_path / "bad.toml"
135142
bad.write_text(

0 commit comments

Comments
 (0)