From 45728c5bbab6eae61a08ecbc0ed385e10a631d28 Mon Sep 17 00:00:00 2001 From: octane0411 Date: Tue, 24 Mar 2026 00:23:27 +0800 Subject: [PATCH 1/4] feat(benchmark): add terminalbench prewarm workflow --- .gitignore | 4 + benchmark/terminalbench/README.md | 19 ++ .../open_agent_sdk_harbor/agent.py | 3 + .../install-open-agent-sdk.sh.j2 | 41 ++- benchmark/terminalbench/prewarm-images.sh | 287 ++++++++++++++++++ .../scripts/pack-local-tarballs.sh | 70 +++++ 6 files changed, 421 insertions(+), 3 deletions(-) create mode 100755 benchmark/terminalbench/prewarm-images.sh create mode 100755 benchmark/terminalbench/scripts/pack-local-tarballs.sh diff --git a/.gitignore b/.gitignore index 82761a4..d21422e 100644 --- a/.gitignore +++ b/.gitignore @@ -66,6 +66,10 @@ RELEASE_NOTES.md jobs/ trajectories/ run-bench-test.sh +benchmark/terminalbench/.local-tarballs/ +open-agent-sdk*.tgz +packages/cli/open-agent-sdk-cli*.tgz +packages/core/open-agent-sdk*.tgz # Next.js build output .next/ diff --git a/benchmark/terminalbench/README.md b/benchmark/terminalbench/README.md index 4a6f1b0..1c8c29a 100644 --- a/benchmark/terminalbench/README.md +++ b/benchmark/terminalbench/README.md @@ -78,6 +78,25 @@ Scripts: - `benchmark/terminalbench/scripts/run-terminalbench-overnight.sh` - `benchmark/terminalbench/scripts/cleanup-terminalbench-images.sh` +- `benchmark/terminalbench/scripts/pack-local-tarballs.sh` + +To pre-install the local SDK/CLI build into cached task images and avoid repeated +registry installs on every trial: + +```bash +./benchmark/terminalbench/prewarm-images.sh \ + --tasks-file benchmark/terminalbench/task-lists/smoke-5.txt \ + --pack-local-tarballs +``` + +Notes: + +- This builds fresh local tarballs, serves them temporarily on `host.docker.internal`, + and bakes `bun`, `oas`, `uv`, and `pytest` into each task image. +- Override `--tarball-host` or `--tarball-port` if your Docker runtime cannot reach + `host.docker.internal:8765`. +- After pre-warm, normal Harbor runs reuse the pre-installed `oas` fast path and do + not need `OAS_LOCAL_TARBALL_URL`. The overnight runner always sources **main workspace** `.env` (via git common dir), so it works correctly even when executed from a git worktree. diff --git a/benchmark/terminalbench/open_agent_sdk_harbor/agent.py b/benchmark/terminalbench/open_agent_sdk_harbor/agent.py index 7ac2d49..5e33bc4 100644 --- a/benchmark/terminalbench/open_agent_sdk_harbor/agent.py +++ b/benchmark/terminalbench/open_agent_sdk_harbor/agent.py @@ -120,6 +120,9 @@ def _install_agent_template_path(self) -> Path: def _setup_env(self) -> dict[str, str]: """Pass mirror/local-install env vars to install script.""" env = super()._setup_env() + version = self.version() + if version: + env["OAS_PACKAGE_VERSION"] = version for key in ("OAS_GITHUB_MIRROR", "OAS_NPM_REGISTRIES", "OAS_LOCAL_TARBALL_URL"): val = os.environ.get(key) if val: diff --git a/benchmark/terminalbench/open_agent_sdk_harbor/install-open-agent-sdk.sh.j2 b/benchmark/terminalbench/open_agent_sdk_harbor/install-open-agent-sdk.sh.j2 index f9d3fe0..1950f4f 100644 --- a/benchmark/terminalbench/open_agent_sdk_harbor/install-open-agent-sdk.sh.j2 +++ b/benchmark/terminalbench/open_agent_sdk_harbor/install-open-agent-sdk.sh.j2 @@ -1,6 +1,15 @@ #!/bin/bash set -euo pipefail +# Fast path: skip install if oas CLI is already available (pre-warmed image) +# unless this run explicitly requests a fresh local tarball install. +if [ -z "${OAS_LOCAL_TARBALL_URL:-}" ] && command -v oas &>/dev/null && command -v bun &>/dev/null; then + echo "oas CLI already installed (pre-warmed image), skipping setup" + echo "Bun ready: $(bun --version)" + echo "CLI ready: $(which oas)" + exit 0 +fi + retry() { local attempts="$1" local sleep_sec="$2" @@ -18,6 +27,26 @@ retry() { done } +download_tarball_with_fallback() { + local dest="$1" + shift + local tarball_url_base="${OAS_LOCAL_TARBALL_URL%/}" + + for name in "$@"; do + [ -z "$name" ] && continue + local url="${tarball_url_base}/${name}" + echo "Fetching local tarball: ${url}" + if curl -fsSL "$url" -o "$dest"; then + echo "Downloaded ${name}" + return 0 + fi + echo "Missing ${name}, trying next candidate..." + done + + echo "ERROR: failed to download tarball from ${tarball_url_base}" >&2 + return 1 +} + install_cli_with_registry_fallback() { # Comma-separated registries, can be overridden by environment. # Example: @@ -65,8 +94,13 @@ export PATH="$HOME/.bun/bin:$PATH" if [ -n "${OAS_LOCAL_TARBALL_URL:-}" ]; then echo "Installing from local tarballs at ${OAS_LOCAL_TARBALL_URL}" mkdir -p /tmp/oas-local && cd /tmp/oas-local - curl -fsSL "${OAS_LOCAL_TARBALL_URL}/open-agent-sdk-0.1.0-alpha.1.tgz" -o sdk.tgz - curl -fsSL "${OAS_LOCAL_TARBALL_URL}/open-agent-sdk-cli-0.1.0-alpha.1.tgz" -o cli.tgz + package_version="${OAS_PACKAGE_VERSION:-0.1.0-alpha.1}" + download_tarball_with_fallback sdk.tgz \ + "open-agent-sdk.tgz" \ + "open-agent-sdk-${package_version}.tgz" + download_tarball_with_fallback cli.tgz \ + "open-agent-sdk-cli.tgz" \ + "open-agent-sdk-cli-${package_version}.tgz" mkdir -p sdk cli tar xzf sdk.tgz -C sdk --strip-components=1 tar xzf cli.tgz -C cli --strip-components=1 @@ -94,4 +128,5 @@ fi echo "Bun ready: $(bun --version)" echo "CLI ready: $(which oas)" -echo "CLI version: $(oas --version 2>&1 || echo 'version check failed')" +help_output="$(oas --help 2>&1 || true)" +echo "CLI help: $(printf '%s\n' "$help_output" | sed -n '1p')" diff --git a/benchmark/terminalbench/prewarm-images.sh b/benchmark/terminalbench/prewarm-images.sh new file mode 100755 index 0000000..d5cdecb --- /dev/null +++ b/benchmark/terminalbench/prewarm-images.sh @@ -0,0 +1,287 @@ +#!/usr/bin/env bash +set -euo pipefail + +# +# prewarm-images.sh — Pre-install bun + oas CLI + pytest into task Docker images +# +# Eliminates repeated ~100s setup overhead per task by baking dependencies +# into the task Docker images. The original images are backed up as +# oas-original/: and the pre-warmed images replace the originals, +# so Harbor uses them transparently without any config changes. +# +# Usage: +# ./benchmark/terminalbench/prewarm-images.sh +# ./benchmark/terminalbench/prewarm-images.sh --tasks-file benchmark/terminalbench/task-lists/smoke-5.txt +# ./benchmark/terminalbench/prewarm-images.sh --all +# ./benchmark/terminalbench/prewarm-images.sh --force +# ./benchmark/terminalbench/prewarm-images.sh --restore # restore original images +# + +REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)" +TASKS_FILE="${REPO_ROOT}/benchmark/terminalbench/task-lists/smoke-5.txt" +PREWARM_ALL=false +FORCE=false +RESTORE=false +BACKUP_PREFIX="oas-original" +PYPI_MIRROR="https://pypi.tuna.tsinghua.edu.cn/simple" +PACK_LOCAL_TARBALLS=false +TARBALL_DIR="${REPO_ROOT}/benchmark/terminalbench/.local-tarballs" +TARBALL_PORT="8765" +TARBALL_HOST="host.docker.internal" +PACKAGE_VERSION="" +SERVER_LOG="" +TARBALL_SERVER_PID="" +DOCKER_ADD_HOST_ARGS=() + +usage() { + cat <<'EOF' +Usage: prewarm-images.sh [options] + +Options: + --tasks-file FILE Task list file (default: smoke-5.txt) + --all Pre-warm ALL cached task images + --force Force rebuild even if already pre-warmed + --restore Restore original images from backup + --pypi-mirror URL PyPI mirror for pytest install (default: tsinghua) + --pack-local-tarballs + Build repo-local SDK/CLI tarballs and serve them temporarily + --tarball-dir DIR Directory used for generated local tarballs + --tarball-port N HTTP port for temporary tarball server (default: 8765) + --tarball-host HOST Hostname containers should use for tarball server + (default: host.docker.internal) + -h, --help Show help +EOF +} + +while (($#)); do + case "$1" in + --tasks-file) TASKS_FILE="${2:-}"; shift 2 ;; + --all) PREWARM_ALL=true; shift ;; + --force) FORCE=true; shift ;; + --restore) RESTORE=true; shift ;; + --pypi-mirror) PYPI_MIRROR="${2:-}"; shift 2 ;; + --pack-local-tarballs) PACK_LOCAL_TARBALLS=true; shift ;; + --tarball-dir) TARBALL_DIR="${2:-}"; shift 2 ;; + --tarball-port) TARBALL_PORT="${2:-}"; shift 2 ;; + --tarball-host) TARBALL_HOST="${2:-}"; shift 2 ;; + -h|--help) usage; exit 0 ;; + *) echo "Unknown argument: $1" >&2; usage; exit 1 ;; + esac +done + +cleanup() { + rm -f "$TASK_MAP" + if [ -n "${TARBALL_SERVER_PID:-}" ]; then + kill "$TARBALL_SERVER_PID" >/dev/null 2>&1 || true + wait "$TARBALL_SERVER_PID" 2>/dev/null || true + fi + if [ -n "${SERVER_LOG:-}" ]; then + rm -f "$SERVER_LOG" + fi +} + +# Load env +MAIN_GIT_DIR="$(git -C "$REPO_ROOT" rev-parse --git-common-dir)" +MAIN_ENV_FILE="$(cd "$MAIN_GIT_DIR/.." && pwd)/.env" +if [ -f "$MAIN_ENV_FILE" ]; then + set -a; source "$MAIN_ENV_FILE"; set +a +fi + +# Collect unique docker images from tasks +TASK_MAP=$(mktemp) +trap cleanup EXIT + +if [ "$PREWARM_ALL" = true ]; then + while IFS= read -r toml; do + image=$(grep 'docker_image' "$toml" 2>/dev/null | head -1 | sed 's/.*= *"//;s/"//') + [ -z "$image" ] && continue + task_name=$(basename "$(dirname "$toml")") + printf '%s\t%s\n' "$task_name" "$image" >> "$TASK_MAP" + done < <(find ~/.cache/harbor/tasks/ -name "task.toml" 2>/dev/null) +else + if [ ! -f "$TASKS_FILE" ]; then + echo "Tasks file not found: $TASKS_FILE" >&2 + exit 1 + fi + while IFS= read -r task_name; do + [ -z "$task_name" ] && continue + [[ "$task_name" == \#* ]] && continue + toml=$(find ~/.cache/harbor/tasks/ -path "*/$task_name/task.toml" 2>/dev/null | head -1) + if [ -z "$toml" ]; then + echo "WARN: task '$task_name' not found in cache, skipping" + continue + fi + image=$(grep -E '^docker_image|^# original_docker_image' "$toml" | head -1 | sed 's/.*= *"//;s/"//') + printf '%s\t%s\n' "$task_name" "$image" >> "$TASK_MAP" + done < "$TASKS_FILE" +fi + +UNIQUE_IMAGES=$(cut -f2 "$TASK_MAP" | sort -u) +UNIQUE_COUNT=$(echo "$UNIQUE_IMAGES" | grep -c . || true) +TASK_COUNT=$(wc -l < "$TASK_MAP" | tr -d ' ') + +# --restore: swap back original images +if [ "$RESTORE" = true ]; then + echo "=== Restoring original images ===" + RESTORED=0 + while IFS= read -r image; do + [ -z "$image" ] && continue + image_base=$(echo "$image" | sed 's|.*/||') + backup="${BACKUP_PREFIX}/${image_base}" + if docker image inspect "$backup" &>/dev/null; then + docker tag "$backup" "$image" + echo " [OK] $backup -> $image" + RESTORED=$((RESTORED + 1)) + fi + done <<< "$UNIQUE_IMAGES" + echo "Restored: $RESTORED" + exit 0 +fi + +echo "=== Pre-warm Docker images ===" +echo "Tasks: $TASK_COUNT Unique images: $UNIQUE_COUNT" +echo "" + +if [ "$PACK_LOCAL_TARBALLS" = true ]; then + PACK_SCRIPT="${REPO_ROOT}/benchmark/terminalbench/scripts/pack-local-tarballs.sh" + if [ ! -f "$PACK_SCRIPT" ]; then + echo "ERROR: pack script not found: $PACK_SCRIPT" >&2 + exit 1 + fi + if ! command -v python3 >/dev/null 2>&1; then + echo "ERROR: python3 is required for --pack-local-tarballs" >&2 + exit 1 + fi + + echo "=== Preparing local tarballs ===" + bash "$PACK_SCRIPT" --output-dir "$TARBALL_DIR" + + SERVER_LOG=$(mktemp) + python3 -m http.server "$TARBALL_PORT" --bind 0.0.0.0 --directory "$TARBALL_DIR" \ + >"$SERVER_LOG" 2>&1 & + TARBALL_SERVER_PID=$! + sleep 1 + if ! kill -0 "$TARBALL_SERVER_PID" >/dev/null 2>&1; then + echo "ERROR: failed to start local tarball server on port $TARBALL_PORT" >&2 + cat "$SERVER_LOG" >&2 || true + exit 1 + fi + + OAS_LOCAL_TARBALL_URL="http://${TARBALL_HOST}:${TARBALL_PORT}" + export OAS_LOCAL_TARBALL_URL + DOCKER_ADD_HOST_ARGS=(--add-host "${TARBALL_HOST}:host-gateway") + echo "Serving local tarballs from: $OAS_LOCAL_TARBALL_URL" + echo "" +fi + +if command -v python3 >/dev/null 2>&1; then + PACKAGE_VERSION="$(python3 -c 'import json, sys; print(json.load(open(sys.argv[1]))["version"])' "${REPO_ROOT}/package.json")" +fi + +INSTALL_SCRIPT="${REPO_ROOT}/benchmark/terminalbench/open_agent_sdk_harbor/install-open-agent-sdk.sh.j2" +if [ ! -f "$INSTALL_SCRIPT" ]; then + echo "ERROR: install script not found: $INSTALL_SCRIPT" >&2 + exit 1 +fi + +WARMED=0 +SKIPPED=0 +FAILED=0 + +while IFS= read -r image; do + [ -z "$image" ] && continue + + image_base=$(echo "$image" | sed 's|.*/||') + backup="${BACKUP_PREFIX}/${image_base}" + tasks=$(awk -F'\t' -v img="$image" '$2==img {printf "%s ", $1}' "$TASK_MAP") + + # Check if already pre-warmed (backup exists = already done) + if [ "$FORCE" != true ] && docker image inspect "$backup" &>/dev/null; then + echo "[SKIP] $image (already pre-warmed, use --force to rebuild)" + SKIPPED=$((SKIPPED + 1)) + continue + fi + + echo "[BUILD] $image" + echo " tasks: $tasks" + + # Pull original if needed + if ! docker image inspect "$image" &>/dev/null; then + echo " pulling $image ..." + if ! docker pull "$image"; then + echo " FAIL: pull failed" + FAILED=$((FAILED + 1)) + continue + fi + fi + + if docker image inspect "$backup" &>/dev/null; then + echo " restoring original image from backup before rebuild" + docker tag "$backup" "$image" + else + # Backup original image on first pre-warm + docker tag "$image" "$backup" + fi + + container_name="oas-prewarm-$$" + + # Build combined setup script + SETUP_SCRIPT=$(mktemp) + cat > "$SETUP_SCRIPT" << 'SETUP_HEADER' +#!/bin/bash +set -euo pipefail +SETUP_HEADER + + # Append agent install script (strip jinja) + sed 's/{%.*%}//g; s/{{.*}}//g' "$INSTALL_SCRIPT" >> "$SETUP_SCRIPT" + + # Append pytest pre-install + cat >> "$SETUP_SCRIPT" << SETUP_FOOTER + +# Pre-install uv + pytest for verifier +echo "=== Pre-installing uv + pytest ===" +if ! command -v curl &>/dev/null; then + apt-get update -qq && apt-get install -y -qq curl >/dev/null 2>&1 || true +fi +curl -LsSf https://astral.sh/uv/0.9.5/install.sh | sh +export PATH="\$HOME/.local/bin:\$PATH" +UV_INDEX_URL="${PYPI_MIRROR}" UV_HTTP_TIMEOUT=300 uvx \\ + -p 3.13 \\ + -w pytest==8.4.1 \\ + -w pytest-json-ctrf==0.3.5 \\ + pytest --version +echo "=== Pre-warm complete ===" +SETUP_FOOTER + + # Run in container + if docker run --name "$container_name" \ + "${DOCKER_ADD_HOST_ARGS[@]}" \ + -e "OAS_GITHUB_MIRROR=${OAS_GITHUB_MIRROR:-}" \ + -e "OAS_NPM_REGISTRIES=${OAS_NPM_REGISTRIES:-}" \ + -e "OAS_LOCAL_TARBALL_URL=${OAS_LOCAL_TARBALL_URL:-}" \ + -e "OAS_PACKAGE_VERSION=${PACKAGE_VERSION}" \ + "$image" \ + bash -c "$(cat "$SETUP_SCRIPT")" 2>&1 | tail -5; then + + # Replace original image with pre-warmed version + docker commit "$container_name" "$image" > /dev/null + docker rm -f "$container_name" > /dev/null + echo " OK: pre-warmed (backup: $backup)" + WARMED=$((WARMED + 1)) + else + echo " FAIL: setup exited with error" + # Restore original from backup + docker tag "$backup" "$image" + docker rm -f "$container_name" &>/dev/null || true + FAILED=$((FAILED + 1)) + fi + rm -f "$SETUP_SCRIPT" + +done <<< "$UNIQUE_IMAGES" + +echo "" +echo "=== Summary ===" +echo " Warmed: $WARMED Skipped: $SKIPPED Failed: $FAILED" +echo "" +echo "Done. Pre-warmed images replace originals — no config changes needed." +echo "Use --restore to revert to original images." diff --git a/benchmark/terminalbench/scripts/pack-local-tarballs.sh b/benchmark/terminalbench/scripts/pack-local-tarballs.sh new file mode 100755 index 0000000..eb1dfeb --- /dev/null +++ b/benchmark/terminalbench/scripts/pack-local-tarballs.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)" +OUTPUT_DIR="${REPO_ROOT}/benchmark/terminalbench/.local-tarballs" +SKIP_BUILD=false +NPM_CACHE_DIR="" + +usage() { + cat <<'EOF' +Usage: pack-local-tarballs.sh [options] + +Options: + --output-dir DIR Directory to write tarballs into + --skip-build Reuse existing packages/core/dist without rebuilding + -h, --help Show help +EOF +} + +while (($#)); do + case "$1" in + --output-dir) OUTPUT_DIR="${2:-}"; shift 2 ;; + --skip-build) SKIP_BUILD=true; shift ;; + -h|--help) usage; exit 0 ;; + *) echo "Unknown argument: $1" >&2; usage; exit 1 ;; + esac +done + +if ! command -v bun >/dev/null 2>&1; then + echo "bun not found in PATH" >&2 + exit 1 +fi + +if ! command -v npm >/dev/null 2>&1; then + echo "npm not found in PATH" >&2 + exit 1 +fi + +mkdir -p "$OUTPUT_DIR" +rm -f "$OUTPUT_DIR"/open-agent-sdk*.tgz "$OUTPUT_DIR"/open-agent-sdk-cli*.tgz +NPM_CACHE_DIR="${OUTPUT_DIR}/.npm-cache" +mkdir -p "$NPM_CACHE_DIR" + +if [ "$SKIP_BUILD" != true ]; then + echo "=== Building core package ===" + ( + cd "$REPO_ROOT/packages/core" + bun run build + ) +fi + +echo "=== Packing local tarballs ===" +CORE_TARBALL="$( + cd "$REPO_ROOT/packages/core" + NPM_CONFIG_CACHE="$NPM_CACHE_DIR" npm pack --silent --pack-destination "$OUTPUT_DIR" +)" +CLI_TARBALL="$( + cd "$REPO_ROOT/packages/cli" + NPM_CONFIG_CACHE="$NPM_CACHE_DIR" npm pack --silent --pack-destination "$OUTPUT_DIR" +)" + +cp "$OUTPUT_DIR/$CORE_TARBALL" "$OUTPUT_DIR/open-agent-sdk.tgz" +cp "$OUTPUT_DIR/$CLI_TARBALL" "$OUTPUT_DIR/open-agent-sdk-cli.tgz" + +echo "Output dir: $OUTPUT_DIR" +echo "Core tarball: $CORE_TARBALL" +echo "CLI tarball: $CLI_TARBALL" +echo "Stable aliases:" +echo " - open-agent-sdk.tgz" +echo " - open-agent-sdk-cli.tgz" From b923d8e3cdbbeba4cdbd06ca4a587162921673f5 Mon Sep 17 00:00:00 2001 From: octane0411 Date: Tue, 24 Mar 2026 00:23:46 +0800 Subject: [PATCH 2/4] feat(autoresearch): add smoke-5 experiment runner --- benchmark/autoresearch/README.md | 71 +++++ benchmark/autoresearch/evaluate.sh | 111 +++++--- benchmark/autoresearch/program.md | 17 +- benchmark/autoresearch/run-experiment.sh | 316 +++++++++++++++++++++++ 4 files changed, 479 insertions(+), 36 deletions(-) create mode 100644 benchmark/autoresearch/README.md create mode 100755 benchmark/autoresearch/run-experiment.sh diff --git a/benchmark/autoresearch/README.md b/benchmark/autoresearch/README.md new file mode 100644 index 0000000..da101c7 --- /dev/null +++ b/benchmark/autoresearch/README.md @@ -0,0 +1,71 @@ +# Autoresearch for Terminal-Bench + +This directory adapts the `program.md`-driven workflow popularized by +Karpathy's `autoresearch` project to Open Agent SDK and Terminal-bench. + +## Design + +- The optimizing agent edits a narrow search surface: + - `packages/cli/src/index.ts` + - selected tool descriptions/formatters in `packages/core/src/tools/` + - `packages/core/src/agent/react-loop.ts` +- The evaluator is fixed: + - `benchmark/autoresearch/evaluate.sh` + - Harbor + `terminal-bench@2.0` + - task list defaults to `benchmark/terminalbench/task-lists/smoke-5.txt` +- Results are append-only in `benchmark/autoresearch/results.tsv` + +This is similar in spirit to +https://github.com/karpathy/autoresearch: +- immutable benchmark +- narrow editable surface +- repeated keep/revert loop +- a Markdown `program.md` that acts like lightweight org code + +## Recommended Loop + +1. Read `program.md` and `scope.md` +2. Make one small hypothesis-driven change +3. Commit it +4. Run: + +```bash +bash ./benchmark/autoresearch/run-experiment.sh --tag "" +``` + +5. If you want automatic rollback on regressions: + +```bash +bash ./benchmark/autoresearch/run-experiment.sh \ + --tag "" \ + --revert-on-regress +``` + +The script will: +- run `bun test` +- run `evaluate.sh` +- append a row to `results.tsv` +- compare the latest row to the previous row +- emit `KEEP` or `REVERT` +- optionally `git reset --hard HEAD~1` while preserving the results log + +By default it also: +- packs the current local SDK/CLI into tarballs +- serves them over a temporary local HTTP server +- exports `OAS_LOCAL_TARBALL_URL` + +This makes Harbor evaluate the latest local code even when task images are already +pre-warmed. Disable this only if you intentionally want to benchmark the code +already baked into the images: + +```bash +bash ./benchmark/autoresearch/run-experiment.sh \ + --tag "" \ + --no-local-tarballs +``` + +## Cost Control + +For Terminal-bench, benchmark cost matters more than in single-metric toy setups. +Use the existing pre-warmed image path under `benchmark/terminalbench/` so +experiments do not repeatedly reinstall Bun and the OAS CLI during agent setup. diff --git a/benchmark/autoresearch/evaluate.sh b/benchmark/autoresearch/evaluate.sh index 4bb30ea..acb26b2 100755 --- a/benchmark/autoresearch/evaluate.sh +++ b/benchmark/autoresearch/evaluate.sh @@ -94,12 +94,71 @@ echo "=== autoresearch evaluate ===" echo "tasks=$TASK_COUNT k=$K model=$MODEL tag=$TAG" echo "" -# ── Helper: extract reward from harbor result.json ── -# Harbor prints "Results written to /result.json" in stdout. -# The run-level result.json contains stats.evals.*.metrics[0].mean -# The trial-level result.json contains verifier_result.reward -# -# We parse the run-level result.json for the mean reward. +# ── Helper: extract reward from a task-level Harbor result.json ── +extract_reward_from_result_file() { + local result_file="$1" + + if [ ! -f "$result_file" ]; then + echo "-1" + return + fi + + python3 - "$result_file" <<'PY' 2>/dev/null +import json +import sys + +path = sys.argv[1] + +try: + with open(path) as f: + d = json.load(f) + + vr = d.get("verifier_result") or {} + rewards = vr.get("rewards") or {} + reward = vr.get("reward", rewards.get("reward")) + + if reward is not None: + print(int(float(reward) >= 0.5)) + elif d.get("exception_info"): + print(-1) + else: + print(0) +except Exception: + print(-1) +PY +} + +# ── Helper: find the newest task-level result.json produced after a marker ── +find_latest_task_result() { + local task_name="$1" + local marker_file="$2" + + python3 - "$REPO_ROOT" "$task_name" "$marker_file" <<'PY' 2>/dev/null +import glob +import os +import sys + +repo_root, task_name, marker_file = sys.argv[1:] +marker_mtime = os.path.getmtime(marker_file) + +pattern = os.path.join(repo_root, "jobs", "*", f"{task_name}__*", "result.json") +candidates = [] + +for path in glob.glob(pattern): + try: + mtime = os.path.getmtime(path) + except OSError: + continue + if mtime >= marker_mtime: + candidates.append((mtime, path)) + +if candidates: + candidates.sort() + print(candidates[-1][1]) +PY +} + +# ── Helper: fallback to parsing Harbor stdout when artifacts are unavailable ── extract_reward_from_output() { local run_output="$1" @@ -117,32 +176,7 @@ extract_reward_from_output() { trial_result=$(find "$result_dir" -mindepth 2 -name "result.json" 2>/dev/null | head -1) if [ -n "$trial_result" ] && [ -f "$trial_result" ]; then - # Check verifier_result.reward in trial result - local reward - reward=$(python3 -c " -import json, sys -try: - d = json.load(open('$trial_result')) - vr = d.get('verifier_result') or {} - # Harbor stores reward in different formats: - # verifier_result.reward (flat) - # verifier_result.rewards.reward (nested) - r = vr.get('reward') - if r is None: - rewards = vr.get('rewards') or {} - r = rewards.get('reward') - if r is not None: - print(int(float(r) >= 0.5)) - sys.exit(0) - # No verifier result — check if there was an exception - if d.get('exception_info'): - print(-1) - else: - print(0) -except Exception: - print(-1) -" 2>/dev/null) - echo "${reward:--1}" + extract_reward_from_result_file "$trial_result" return fi @@ -179,6 +213,8 @@ except Exception: # ── Helper: run one trial, return 1=pass 0=fail -1=error ── run_single_trial() { local task_name="$1" + local marker_file + marker_file="$(mktemp)" # Build harbor command as array local -a cmd=( @@ -236,9 +272,18 @@ run_single_trial() { rc=$? set -e + local latest_result reward + latest_result="$(find_latest_task_result "$task_name" "$marker_file")" + rm -f "$marker_file" + + if [ -n "$latest_result" ]; then + reward="$(extract_reward_from_result_file "$latest_result")" + echo "${reward:--1}" + return + fi + if [ "$rc" -ne 0 ]; then # Harbor exited non-zero, but still may have written result.json - local reward reward=$(extract_reward_from_output "$run_output") if [ "$reward" != "-1" ]; then echo "$reward" diff --git a/benchmark/autoresearch/program.md b/benchmark/autoresearch/program.md index 5107a60..c2a9869 100644 --- a/benchmark/autoresearch/program.md +++ b/benchmark/autoresearch/program.md @@ -25,7 +25,7 @@ When k=1, all metrics collapse to the same number (simple pass rate). 4. Create a git branch: `git checkout -b autoresearch/` (use a short, descriptive tag). 5. Run a **baseline evaluation**: ```bash - ./benchmark/autoresearch/evaluate.sh -k 3 --tag baseline --output benchmark/autoresearch/results.tsv + bash ./benchmark/autoresearch/run-experiment.sh --tag baseline ``` 6. Review the baseline numbers before proceeding. @@ -73,11 +73,17 @@ git commit -m "experiment: " ### Step 5: Evaluate +Preferred: +```bash +bash ./benchmark/autoresearch/run-experiment.sh --tag "" +``` + +Manual fallback: ```bash ./benchmark/autoresearch/evaluate.sh -k 3 --tag "" --output benchmark/autoresearch/results.tsv ``` -Wait for it to complete. The script outputs pass@k, pass^k, and avg_trial_rate. +Wait for it to complete. The scripts output pass@k, pass^k, and avg_trial_rate. ### Step 6: Analyze & Decide @@ -92,11 +98,16 @@ Read the results and compare against the previous baseline in `results.tsv`. - pass@k decreased (lost capability) - pass^k decreased AND pass@k didn't improve (net negative) -To revert: +To revert manually: ```bash git reset --hard HEAD~1 ``` +If you use the helper script below, it can decide and revert automatically: +```bash +bash ./benchmark/autoresearch/run-experiment.sh --tag "" --revert-on-regress +``` + Record failed experiments in `results.tsv` anyway — append `[REVERTED]` to the description. **Prioritization:** diff --git a/benchmark/autoresearch/run-experiment.sh b/benchmark/autoresearch/run-experiment.sh new file mode 100755 index 0000000..bd6f2a8 --- /dev/null +++ b/benchmark/autoresearch/run-experiment.sh @@ -0,0 +1,316 @@ +#!/usr/bin/env bash +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)" +OUTPUT="${REPO_ROOT}/benchmark/autoresearch/results.tsv" +TASKS_FILE="${REPO_ROOT}/benchmark/terminalbench/task-lists/smoke-5.txt" +MODEL="MiniMax-M2.5" +K=3 +TAG="" +SLEEP_BETWEEN=3 +SKIP_TESTS=false +REVERT_ON_REGRESS=false +FULL_TESTS=false +TEST_CMD=( + "bun" "test" + "packages/core/tests/agent/react-loop.test.ts" + "packages/core/tests/agent/react-loop-system-prompt.test.ts" + "packages/core/tests/agent/compact.test.ts" + "packages/core/tests/agent/compact-auto-trigger.test.ts" + "packages/core/tests/tools/bash.test.ts" + "packages/core/tests/tools/read.test.ts" + "packages/core/tests/tools/write.test.ts" + "packages/core/tests/tools/edit.test.ts" + "packages/core/tests/tools/glob.test.ts" + "packages/core/tests/tools/grep.test.ts" + "packages/core/tests/integration.test.ts" +) +OUTPUT_REL="" +USE_LOCAL_TARBALLS=true +TARBALL_DIR="${REPO_ROOT}/benchmark/terminalbench/.local-tarballs" +TARBALL_PORT="8765" +TARBALL_HOST="host.docker.internal" +TARBALL_SERVER_PID="" +TARBALL_SERVER_LOG="" + +usage() { + cat <<'EOF' +Usage: run-experiment.sh --tag