diff --git a/.gitignore b/.gitignore
index 82761a4..d21422e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,6 +66,10 @@ RELEASE_NOTES.md
 jobs/
 trajectories/
 run-bench-test.sh
+benchmark/terminalbench/.local-tarballs/
+open-agent-sdk*.tgz
+packages/cli/open-agent-sdk-cli*.tgz
+packages/core/open-agent-sdk*.tgz
 
 # Next.js build output
 .next/
diff --git a/benchmark/autoresearch/README.md b/benchmark/autoresearch/README.md
new file mode 100644
index 0000000..da101c7
--- /dev/null
+++ b/benchmark/autoresearch/README.md
@@ -0,0 +1,71 @@
+# Autoresearch for Terminal-Bench
+
+This directory adapts the `program.md`-driven workflow popularized by
+Karpathy's `autoresearch` project to Open Agent SDK and Terminal-Bench.
+
+## Design
+
+- The optimizing agent edits a narrow search surface:
+  - `packages/cli/src/index.ts`
+  - selected tool descriptions/formatters in `packages/core/src/tools/`
+  - `packages/core/src/agent/react-loop.ts`
+- The evaluator is fixed:
+  - `benchmark/autoresearch/evaluate.sh`
+  - Harbor + `terminal-bench@2.0`
+  - task list defaults to `benchmark/terminalbench/task-lists/smoke-5.txt`
+- Results are append-only in `benchmark/autoresearch/results.tsv`
+
+This is similar in spirit to
+https://github.com/karpathy/autoresearch:
+- immutable benchmark
+- narrow editable surface
+- repeated keep/revert loop
+- a Markdown `program.md` that acts like lightweight org code
+
+## Recommended Loop
+
+1. Read `program.md` and `scope.md`
+2. Make one small hypothesis-driven change
+3. Commit it
+4. Run:
+
+```bash
+bash ./benchmark/autoresearch/run-experiment.sh --tag "<tag>"
+```
+
+5. If you want automatic rollback on regressions:
+
+```bash
+bash ./benchmark/autoresearch/run-experiment.sh \
+  --tag "<tag>" \
+  --revert-on-regress
+```
+
+The script will:
+- run `bun test`
+- run `evaluate.sh`
+- append a row to `results.tsv`
+- compare the latest row to the previous row
+- emit `KEEP` or `REVERT`
+- optionally `git reset --hard HEAD~1` while preserving the results log
+
+By default it also:
+- packs the current local SDK/CLI into tarballs
+- serves them over a temporary local HTTP server
+- exports `OAS_LOCAL_TARBALL_URL`
+
+This makes Harbor evaluate the latest local code even when task images are already
+pre-warmed. Disable this only if you intentionally want to benchmark the code
+already baked into the images:
+
+```bash
+bash ./benchmark/autoresearch/run-experiment.sh \
+  --tag "<tag>" \
+  --no-local-tarballs
+```
+
+## Cost Control
+
+For Terminal-Bench, benchmark cost matters more than in single-metric toy setups.
+Use the existing pre-warmed image path under `benchmark/terminalbench/` so
+experiments do not repeatedly reinstall Bun and the OAS CLI during agent setup.
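+## How the Local Tarballs Work
+
+`run-experiment.sh` performs the packing and serving automatically; the sketch
+below only illustrates the mechanism, using the script's default directory,
+port, and hostname. The exact packing commands and the URL shape consumed by
+the task images may differ.
+
+```bash
+# Pack the local core and CLI packages. npm writes each .tgz into the
+# package directory (the paths .gitignore now excludes).
+(cd packages/core && npm pack)
+(cd packages/cli && npm pack)
+
+mkdir -p benchmark/terminalbench/.local-tarballs
+mv packages/core/*.tgz packages/cli/*.tgz benchmark/terminalbench/.local-tarballs/
+
+# Serve the tarballs over a throwaway local HTTP server.
+python3 -m http.server 8765 --directory benchmark/terminalbench/.local-tarballs &
+
+# Task containers reach the host's server via host.docker.internal.
+export OAS_LOCAL_TARBALL_URL="http://host.docker.internal:8765"
+```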
-f "$result_file" ]; then + echo "-1" + return + fi + + python3 - "$result_file" <<'PY' 2>/dev/null +import json +import sys + +path = sys.argv[1] + +try: + with open(path) as f: + d = json.load(f) + + vr = d.get("verifier_result") or {} + rewards = vr.get("rewards") or {} + reward = vr.get("reward", rewards.get("reward")) + + if reward is not None: + print(int(float(reward) >= 0.5)) + elif d.get("exception_info"): + print(-1) + else: + print(0) +except Exception: + print(-1) +PY +} + +# ── Helper: find the newest task-level result.json produced after a marker ── +find_latest_task_result() { + local task_name="$1" + local marker_file="$2" + + python3 - "$REPO_ROOT" "$task_name" "$marker_file" <<'PY' 2>/dev/null +import glob +import os +import sys + +repo_root, task_name, marker_file = sys.argv[1:] +marker_mtime = os.path.getmtime(marker_file) + +pattern = os.path.join(repo_root, "jobs", "*", f"{task_name}__*", "result.json") +candidates = [] + +for path in glob.glob(pattern): + try: + mtime = os.path.getmtime(path) + except OSError: + continue + if mtime >= marker_mtime: + candidates.append((mtime, path)) + +if candidates: + candidates.sort() + print(candidates[-1][1]) +PY +} + +# ── Helper: fallback to parsing Harbor stdout when artifacts are unavailable ── extract_reward_from_output() { local run_output="$1" @@ -117,32 +176,7 @@ extract_reward_from_output() { trial_result=$(find "$result_dir" -mindepth 2 -name "result.json" 2>/dev/null | head -1) if [ -n "$trial_result" ] && [ -f "$trial_result" ]; then - # Check verifier_result.reward in trial result - local reward - reward=$(python3 -c " -import json, sys -try: - d = json.load(open('$trial_result')) - vr = d.get('verifier_result') or {} - # Harbor stores reward in different formats: - # verifier_result.reward (flat) - # verifier_result.rewards.reward (nested) - r = vr.get('reward') - if r is None: - rewards = vr.get('rewards') or {} - r = rewards.get('reward') - if r is not None: - print(int(float(r) >= 0.5)) - sys.exit(0) - # No verifier result — check if there was an exception - if d.get('exception_info'): - print(-1) - else: - print(0) -except Exception: - print(-1) -" 2>/dev/null) - echo "${reward:--1}" + extract_reward_from_result_file "$trial_result" return fi @@ -179,6 +213,8 @@ except Exception: # ── Helper: run one trial, return 1=pass 0=fail -1=error ── run_single_trial() { local task_name="$1" + local marker_file + marker_file="$(mktemp)" # Build harbor command as array local -a cmd=( @@ -236,9 +272,18 @@ run_single_trial() { rc=$? set -e + local latest_result reward + latest_result="$(find_latest_task_result "$task_name" "$marker_file")" + rm -f "$marker_file" + + if [ -n "$latest_result" ]; then + reward="$(extract_reward_from_result_file "$latest_result")" + echo "${reward:--1}" + return + fi + if [ "$rc" -ne 0 ]; then # Harbor exited non-zero, but still may have written result.json - local reward reward=$(extract_reward_from_output "$run_output") if [ "$reward" != "-1" ]; then echo "$reward" diff --git a/benchmark/autoresearch/program.md b/benchmark/autoresearch/program.md index 5107a60..c2a9869 100644 --- a/benchmark/autoresearch/program.md +++ b/benchmark/autoresearch/program.md @@ -25,7 +25,7 @@ When k=1, all metrics collapse to the same number (simple pass rate). 4. Create a git branch: `git checkout -b autoresearch/` (use a short, descriptive tag). 5. 
+find_latest_task_result() {
+  local task_name="$1"
+  local marker_file="$2"
+
+  python3 - "$REPO_ROOT" "$task_name" "$marker_file" <<'PY' 2>/dev/null
+import glob
+import os
+import sys
+
+repo_root, task_name, marker_file = sys.argv[1:]
+marker_mtime = os.path.getmtime(marker_file)
+
+pattern = os.path.join(repo_root, "jobs", "*", f"{task_name}__*", "result.json")
+candidates = []
+
+for path in glob.glob(pattern):
+    try:
+        mtime = os.path.getmtime(path)
+    except OSError:
+        continue
+    if mtime >= marker_mtime:
+        candidates.append((mtime, path))
+
+if candidates:
+    candidates.sort()
+    print(candidates[-1][1])
+PY
+}
+
+# ── Helper: fallback to parsing Harbor stdout when artifacts are unavailable ──
 extract_reward_from_output() {
   local run_output="$1"
@@ -117,32 +176,7 @@
   trial_result=$(find "$result_dir" -mindepth 2 -name "result.json" 2>/dev/null | head -1)
 
   if [ -n "$trial_result" ] && [ -f "$trial_result" ]; then
-    # Check verifier_result.reward in trial result
-    local reward
-    reward=$(python3 -c "
-import json, sys
-try:
-    d = json.load(open('$trial_result'))
-    vr = d.get('verifier_result') or {}
-    # Harbor stores reward in different formats:
-    #   verifier_result.reward (flat)
-    #   verifier_result.rewards.reward (nested)
-    r = vr.get('reward')
-    if r is None:
-        rewards = vr.get('rewards') or {}
-        r = rewards.get('reward')
-    if r is not None:
-        print(int(float(r) >= 0.5))
-        sys.exit(0)
-    # No verifier result — check if there was an exception
-    if d.get('exception_info'):
-        print(-1)
-    else:
-        print(0)
-except Exception:
-    print(-1)
-" 2>/dev/null)
-    echo "${reward:--1}"
+    extract_reward_from_result_file "$trial_result"
     return
   fi
 
@@ -179,6 +213,8 @@
 # ── Helper: run one trial, return 1=pass 0=fail -1=error ──
 run_single_trial() {
   local task_name="$1"
+  local marker_file
+  marker_file="$(mktemp)"
 
   # Build harbor command as array
   local -a cmd=(
@@ -236,9 +272,18 @@
   rc=$?
   set -e
 
+  local latest_result reward
+  latest_result="$(find_latest_task_result "$task_name" "$marker_file")"
+  rm -f "$marker_file"
+
+  if [ -n "$latest_result" ]; then
+    reward="$(extract_reward_from_result_file "$latest_result")"
+    echo "${reward:--1}"
+    return
+  fi
+
   if [ "$rc" -ne 0 ]; then
     # Harbor exited non-zero, but still may have written result.json
-    local reward
     reward=$(extract_reward_from_output "$run_output")
     if [ "$reward" != "-1" ]; then
       echo "$reward"
diff --git a/benchmark/autoresearch/program.md b/benchmark/autoresearch/program.md
index 5107a60..c2a9869 100644
--- a/benchmark/autoresearch/program.md
+++ b/benchmark/autoresearch/program.md
@@ -25,7 +25,7 @@ When k=1, all metrics collapse to the same number (simple pass rate).
 4. Create a git branch: `git checkout -b autoresearch/<tag>` (use a short, descriptive tag).
 5. Run a **baseline evaluation**:
    ```bash
-   ./benchmark/autoresearch/evaluate.sh -k 3 --tag baseline --output benchmark/autoresearch/results.tsv
+   bash ./benchmark/autoresearch/run-experiment.sh --tag baseline
    ```
 6. Review the baseline numbers before proceeding.
@@ -73,11 +73,17 @@ git commit -m "experiment: <description>"
 ### Step 5: Evaluate
 
+Preferred:
+```bash
+bash ./benchmark/autoresearch/run-experiment.sh --tag "<tag>"
+```
+
+Manual fallback:
 ```bash
 ./benchmark/autoresearch/evaluate.sh -k 3 --tag "<tag>" --output benchmark/autoresearch/results.tsv
 ```
 
-Wait for it to complete. The script outputs pass@k, pass^k, and avg_trial_rate.
+Wait for it to complete. The scripts output pass@k, pass^k, and avg_trial_rate.
 
 ### Step 6: Analyze & Decide
@@ -92,11 +98,16 @@ Read the results and compare against the previous baseline in `results.tsv`.
 - pass@k decreased (lost capability)
 - pass^k decreased AND pass@k didn't improve (net negative)
 
-To revert:
+To revert manually:
 ```bash
 git reset --hard HEAD~1
 ```
 
+If you use the helper script below, it can decide and revert automatically:
+```bash
+bash ./benchmark/autoresearch/run-experiment.sh --tag "<tag>" --revert-on-regress
+```
+
 Record failed experiments in `results.tsv` anyway — append `[REVERTED]` to the description.
 
 **Prioritization:**
diff --git a/benchmark/autoresearch/run-experiment.sh b/benchmark/autoresearch/run-experiment.sh
new file mode 100755
index 0000000..03f32aa
--- /dev/null
+++ b/benchmark/autoresearch/run-experiment.sh
@@ -0,0 +1,329 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
+OUTPUT="${REPO_ROOT}/benchmark/autoresearch/results.tsv"
+TASKS_FILE="${REPO_ROOT}/benchmark/terminalbench/task-lists/smoke-5.txt"
+MODEL="MiniMax-M2.5"
+K=3
+TAG=""
+SLEEP_BETWEEN=3
+SKIP_TESTS=false
+REVERT_ON_REGRESS=false
+FULL_TESTS=false
+TEST_CMD=(
+  "bun" "test"
+  "packages/core/tests/agent/react-loop.test.ts"
+  "packages/core/tests/agent/react-loop-system-prompt.test.ts"
+  "packages/core/tests/agent/compact.test.ts"
+  "packages/core/tests/agent/compact-auto-trigger.test.ts"
+  "packages/core/tests/tools/bash.test.ts"
+  "packages/core/tests/tools/read.test.ts"
+  "packages/core/tests/tools/write.test.ts"
+  "packages/core/tests/tools/edit.test.ts"
+  "packages/core/tests/tools/glob.test.ts"
+  "packages/core/tests/tools/grep.test.ts"
+  "packages/core/tests/integration.test.ts"
+)
+OUTPUT_REL=""
+USE_LOCAL_TARBALLS=true
+TARBALL_DIR="${REPO_ROOT}/benchmark/terminalbench/.local-tarballs"
+TARBALL_PORT="8765"
+TARBALL_HOST="host.docker.internal"
+TARBALL_SERVER_PID=""
+TARBALL_SERVER_LOG=""
+
+usage() {
+  cat <<'EOF'
+Usage: run-experiment.sh --tag