From ae34dd99b41ebcbfb5a5f5eb19c5ea1f04f836ab Mon Sep 17 00:00:00 2001 From: Stephen Chu Date: Wed, 10 Jun 2026 16:03:10 -0700 Subject: [PATCH] fix(cleanup): preserve baseline seed experiments on standard cleanup delete_ci_experiments() deleted every experiment linked to the dataset, including the baseline-haiku / baseline-sonnet seeds that setup.py creates as the demo's Haiku-vs-Sonnet "before" reference. So a standard cleanup emptied the experiment list even though the README says setup does not need to be re-run afterward. Skip experiments whose name starts with "baseline-" so the seeds survive; CI/Engine experiments are still swept. Updates the docstring, step message, and README Cleanup section to match. Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 2 +- scripts/cleanup.py | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 2cc221c..a04da2a 100644 --- a/README.md +++ b/README.md @@ -212,7 +212,7 @@ python -m scripts.cleanup This does four things: 1. **Resets dataset to original 3 examples** — deletes all examples and re-uploads the canonical 3, removing anything Engine added -2. **Deletes all experiments** — CI/CD generates fresh before/after experiments on every PR, so nothing needs to be preserved between demos +2. **Deletes CI/Engine experiments** — keeps the `baseline-*` seed experiments from `setup.py` (the Haiku-vs-Sonnet "before" reference); CI/CD regenerates before/after experiments on every PR 3. **Removes Engine-added online evaluators** — uses saved run rule IDs from `.demo_state.json` to delete only evaluators Engine added, leaving the 5 from `setup.py` in place 4. **Resets main to the `baseline` tag** — force-resets to remove Engine's merged PR, restoring the buggy agent state diff --git a/scripts/cleanup.py b/scripts/cleanup.py index 22adab3..9967846 100644 --- a/scripts/cleanup.py +++ b/scripts/cleanup.py @@ -2,7 +2,7 @@ Resets the demo to a clean state so it can be run again without re-running setup: 1. Resets dataset to the original 3 examples (deletes Engine-added examples) - 2. Deletes all experiments — CI/CD generates fresh before/after on every PR + 2. Deletes CI/Engine experiments (keeps the baseline-* seeds from setup.py) 3. Removes Engine-added online evaluators (keeps the 5 registered by setup.py) 4. Force-resets main back to the 'baseline' tag (removes Engine's merged PR) @@ -69,22 +69,30 @@ def reset_dataset() -> None: # ── 2. Delete 'after' experiments ───────────────────────────────────────────── def delete_ci_experiments() -> None: - """Delete all experiments linked to the dataset. + """Delete CI/Engine experiments linked to the dataset. - CI/CD generates fresh before/after experiments on every PR, so there is - no experiment worth keeping between demos. + CI/CD generates fresh before/after experiments on every PR, so those are + not worth keeping between demos. The `baseline-*` experiments seeded by + setup.py ARE preserved: they're the demo's Haiku-vs-Sonnet "before" + reference and would otherwise have to be reseeded after every cleanup. """ from langsmith import Client - print(f"\n[2/3] Removing all experiments from demo datasets...") + print(f"\n[2/3] Removing CI/Engine experiments (keeping baseline seeds)...") ls_client = Client() total_deleted = 0 + total_kept = 0 for name in (DATASET_NAME, TOOL_ADHERENCE_DATASET_NAME): datasets = list(ls_client.list_datasets(dataset_name=name)) if not datasets: continue experiments = list(ls_client.list_projects(reference_dataset_id=datasets[0].id)) for exp in experiments: + # Preserve the baseline-* seeds setup.py creates (the Haiku/Sonnet + # "before" reference); only sweep CI/Engine experiments. + if exp.name.startswith("baseline-"): + total_kept += 1 + continue for attempt in range(3): try: ls_client.delete_project(project_name=exp.name) @@ -97,7 +105,7 @@ def delete_ci_experiments() -> None: else: print(f" Failed to delete '{exp.name}': {e}") break - print(f" Deleted {total_deleted} experiment(s) across both datasets.") + print(f" Deleted {total_deleted} experiment(s), kept {total_kept} baseline seed(s).") # ── 3. Delete Engine-added online evaluators ───────────────────────────────────