From e0e0b33d7ac5210f4298f2567926dfc4842d5da1 Mon Sep 17 00:00:00 2001 From: benediktstroebl <50178209+benediktstroebl@users.noreply.github.com> Date: Sun, 19 Apr 2026 08:53:28 -0700 Subject: [PATCH 1/2] Update harbor-adapter-creator: src layout, task.toml [task] section, parity runs schema, new reference adapters --- skills/harbor-adapter-creator/SKILL.md | 174 +++++++++++++++++-------- 1 file changed, 118 insertions(+), 56 deletions(-) diff --git a/skills/harbor-adapter-creator/SKILL.md b/skills/harbor-adapter-creator/SKILL.md index 6f7379f..e9d8a96 100644 --- a/skills/harbor-adapter-creator/SKILL.md +++ b/skills/harbor-adapter-creator/SKILL.md @@ -21,23 +21,30 @@ Adapters convert external benchmarks (SimpleQA, GAIA, AiderPolyglot, CodePDE, sp ## Adapter Directory Structure +Adapters now use a `src` layout (Python package under `src//`): + ``` -adapters// -├── adapter.py # Core conversion logic (adapter class) -├── run_adapter.py # CLI entry point -├── run_.yaml # Job config for oracle and parity experiment runs -├── README.md # Benchmark docs, license, parity, citation -├── parity_experiment.json # Parity tracking results (JSON array) -├── adapter_metadata.json # Adapter metadata (JSON array) -└── template/ # Task template files - ├── task.toml - ├── instruction.md - ├── environment/ - │ └── Dockerfile - ├── tests/ - │ └── test.sh - └── solution/ - └── solve.sh +adapters// +├── .python-version # Python version (created by uv init) +├── pyproject.toml # Python package config +├── README.md # Benchmark docs, license, parity, citation +├── adapter_metadata.json # Adapter metadata (JSON array) +├── parity_experiment.json # Parity tracking results (JSON array) +├── run_.yaml # Job config for oracle and parity experiment runs +└── src/ + └── / # adapter-name with dashes → underscores + ├── __init__.py + ├── adapter.py # Core conversion logic (adapter class) + ├── main.py # CLI entry point + └── task-template/ # Template files copied into each task + ├── task.toml + ├── instruction.md + ├── environment/ + │ └── Dockerfile + ├── tests/ + │ └── test.sh + └── solution/ + └── solve.sh ``` All template files, parity_experiment.json, adapter_metadata.json, and README.md are required. The validator (`harbor adapters validate`) checks for all of them. The YAML job config is not validated but is expected for parity experiments. @@ -56,7 +63,7 @@ The interactive `AdapterWizard` prompts for: - **Source URL** -- Link to original benchmark paper or repo - **License** -- Dataset license for the README -If all fields are provided as CLI arguments, the wizard runs non-interactively. The wizard renders Jinja2 templates from `src/harbor/cli/template-adapter/` and places the result under `adapters//`. +If all fields are provided as CLI arguments, the wizard runs non-interactively. The wizard renders Jinja2 templates from `src/harbor/cli/template-adapter/` and places the result under `adapters//` using the src layout above. After scaffolding, validate: @@ -102,8 +109,8 @@ class MyBenchmarkAdapter: self.task_dir = Path(task_dir) self._config = kwargs - # Locate template files relative to this file - self.template_dir = Path(__file__).parent / "template" + # Locate template files relative to this file (inside src//) + self.template_dir = Path(__file__).parent / "task-template" if not self.template_dir.exists(): raise FileNotFoundError(f"Template directory not found: {self.template_dir}") @@ -191,9 +198,9 @@ When source benchmarks include per-instance files (images, CSVs, databases): 3. 
Reference files from `instruction.md` using the container path (e.g., `/app/files/data.csv`) 4. Mention the attachment in the instruction so the agent knows about it -## Writing run_adapter.py +## Writing main.py -The CLI entry point follows a standard pattern with helper functions for argument parsing, ID collection, repo cloning, and benchmark processing. The default output directory is `datasets/`. +The CLI entry point lives at `src//main.py` and follows a standard pattern. Run it via `uv run python -m .main`. The default output directory is `datasets/`. ```python #!/usr/bin/env python3 @@ -202,7 +209,6 @@ import sys from pathlib import Path from loguru import logger -sys.path.insert(0, str(Path(__file__).parent.parent)) from mybenchmark.adapter import MyBenchmarkAdapter @@ -228,6 +234,10 @@ def _parse_args() -> argparse.Namespace: "--limit", type=int, help="Limit the number of tasks to generate", ) + parser.add_argument( + "--overwrite", action="store_true", + help="Overwrite existing task directories", + ) return parser.parse_args() @@ -329,9 +339,13 @@ python3 llm_judge.py # reads ground_truth.json, writes reward.txt ## task.toml Configuration +Every generated `task.toml` **must** contain a `name` field under the `[task]` section. Harbor uses this to identify the task when it's added to a dataset; tasks without a `name` cannot be registered. + ```toml version = "1.0" -source = "mybenchmark/instance-42" + +[task] +name = "mybenchmark/task-001" # required: /, must be stable across runs [metadata] author_name = "Your Name" @@ -358,18 +372,46 @@ allow_internet = true # network access (default: true) # env = { SECRET = "..." } # env vars for solve.sh (OracleAgent) ``` +**Task naming requirements:** +- `name` must be unique within the dataset and **stable across adapter runs** (unstable names churn registry digests on republish) +- Format: `/`, e.g., `mybenchmark/task-001` +- Sanitize upstream identifiers: lowercase, replace spaces/slashes/special characters with hyphens +- If the upstream lacks stable identifiers, mint a deterministic scheme (e.g., `{dataset}-1`, `{dataset}-2`) from a reproducible sort +- `version = "1.0"` is the schema version — leave it alone + ## parity_experiment.json -A JSON **array** of experiment objects tracking how Harbor results compare to the original benchmark. Required fields per entry: `adapter_name`, `agent`, `model`, `date`, `metrics`. +A JSON **array** of experiment objects tracking how Harbor results compare to the original benchmark. 
Required fields per entry: + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `adapter_name` | string | Yes | Adapter name (e.g., `"mybenchmark"`) | +| `agent` | string | Yes | Agent with version (e.g., `"codex@1.0"`) | +| `model` | string | Yes | Full model identifier | +| `date` | string | Yes | Experiment date | +| `adapted_benchmark_size` | integer | Yes | Total tasks converted by adapter | +| `parity_benchmark_size` | integer | Yes | Tasks used for parity | +| `number_of_runs` | integer | Yes | Runs per side (must match on both sides) | +| `notes` | string | No | Additional context | +| `original_parity_repo` | string | Yes | Fork URL for reproducing parity on original | +| `adapter_pr` | string[] | Yes | All adapter PR links in harbor repo | +| `dataset_pr` | string[] | Yes | All PR links in harbor-datasets repo | +| `parity_pr` | string[] | Yes | All PR links to HuggingFace parity dataset | +| `metrics` | object[] | Yes | Metric comparison objects | + +Each `metrics` entry needs `benchmark_name`, `metric`, `original` (mean ± stderr string), `harbor` (mean ± stderr string), `original_runs` (array of per-run scores), and `harbor_runs` (array of per-run scores). ```json [ { "adapter_name": "mybenchmark", - "agent": "codex@0.77.0", - "model": "openai/gpt-4o-2025-03-01", + "agent": "codex@1.0", + "model": "openai/gpt-5-2025-06-01", "date": "2026-01-15", - "notes": "50 tasks; averaged over 3 trials", + "adapted_benchmark_size": 500, + "parity_benchmark_size": 50, + "number_of_runs": 3, + "notes": "50-task sample; averaged over 3 runs", "original_parity_repo": "https://github.com/example/mybenchmark/tree/harbor-parity", "adapter_pr": [ "https://github.com/harbor-framework/harbor/pull/123" @@ -384,23 +426,21 @@ A JSON **array** of experiment objects tracking how Harbor results compare to th { "benchmark_name": "MyBenchmark", "metric": "Accuracy", - "original": "0.72 +/- 0.03", - "harbor": "0.71 +/- 0.02", - "original_trials": [0.73, 0.69, 0.74], - "harbor_trials": [0.72, 0.70, 0.71] + "original": "72.0 +/- 1.5", + "harbor": "71.3 +/- 1.2", + "original_runs": [73.0, 69.0, 74.0], + "harbor_runs": [72.0, 70.0, 71.9] } ] } ] ``` -The `adapter_pr`, `dataset_pr`, and `parity_pr` fields in `parity_experiment.json` must be arrays of URLs when present. Each metric object needs `benchmark_name`, `metric`, and at least one of `original`, `tb_adapter`, or `harbor`. - -The `harbor_adapter` entry must include `parity_matching_agents` (array of `"agent@version+model"` strings). In the harbor-datasets `registry.json`, use `"version": "parity"` to enable `harbor jobs start -d mybenchmark@parity`. +**Important:** the per-run arrays are named `original_runs` and `harbor_runs` (not `_trials`). The validator enforces `*_runs` naming. ## adapter_metadata.json -A JSON **array** describing the adapter and its relationship to the original benchmark. Required fields per entry: `adapter_name`, `adapter_builders`, `original_benchmark`, `harbor_adapter`. +A JSON **array** describing the adapter and its relationship to the original benchmark. 
```json [ @@ -426,8 +466,10 @@ A JSON **array** describing the adapter and its relationship to the original ben "parity_benchmark_size": 50, "parity_sampling_rate": 0.1, "registry_benchmark_size": 500, - "parity_costs": "150", - "parity_matching_agents": ["codex@0.77.0+openai/gpt-4o-2025-03-01"], + "added_agents": ["None"], + "parity_matching_agents": ["codex@1.0+openai/gpt-5-2025-06-01"], + "parity_unmatching_agents": ["None"], + "parity_costs": "$150", "notes": "Full benchmark adapted. Parity on 10% sample." } ] @@ -435,24 +477,28 @@ A JSON **array** describing the adapter and its relationship to the original ben ] ``` +**New required fields in `harbor_adapter`:** +- `added_agents`: custom agents added for this adapter (`["None"]` if none) +- `parity_unmatching_agents`: agents that were tested but did not achieve parity (`["None"]` if all matched) + ## Validation The `harbor adapters validate` command (backed by `scripts/validate_adapter.py`) checks: -1. **Required files exist**: `adapter.py`, `run_adapter.py`, `README.md`, `parity_experiment.json`, `adapter_metadata.json`, `template/` directory -2. **Template structure**: `template/task.toml`, `template/instruction.md`, `template/environment/Dockerfile`, `template/tests/test.sh`, `template/solution/solve.sh` -3. **test.sh writes reward**: Template `tests/test.sh` must contain a write to `/logs/verifier/reward.txt` -4. **parity_experiment.json schema**: Valid JSON array with required fields (`adapter_name`, `agent`, `model`, `date`, `metrics`) -5. **adapter_metadata.json schema**: Valid JSON array with required fields (`adapter_name`, `adapter_builders`, `original_benchmark`, `harbor_adapter`) -6. **README.md compliance**: Checks for "Overview", "Parity", "Citation" sections; warns about unreplaced template placeholders like `{{ADAPTER_ID}}` -7. **Cross-validation**: Compares `adapter_name` between metadata and parity files +1. **Required files exist**: `adapter_metadata.json`, `parity_experiment.json`, `README.md`, `run_.yaml` +2. **src layout**: `src//adapter.py`, `src//main.py`, `src//task-template/` +3. **Template structure**: `task-template/task.toml`, `task-template/instruction.md`, `task-template/environment/Dockerfile`, `task-template/tests/test.sh`, `task-template/solution/solve.sh` +4. **test.sh writes reward**: Template `tests/test.sh` must contain a write to `/logs/verifier/reward.txt` +5. **parity_experiment.json schema**: Valid JSON array with required fields +6. **adapter_metadata.json schema**: Valid JSON array with required fields +7. **README.md compliance**: Checks for "Overview", "Parity", "Citation" sections ```bash # Validate adapter structure harbor adapters validate adapters/mybenchmark # Generate a single task to test -python3 adapters/mybenchmark/run_adapter.py \ +uv run python -m mybenchmark.main \ --output-dir datasets/mybenchmark \ --task-ids instance-001 @@ -466,29 +512,45 @@ harbor trials start -p datasets/mybenchmark/mybenchmark-instance-001 --agent ora harbor jobs start -p datasets/mybenchmark --agent oracle ``` +## GPU Tasks + +For adapters with GPU tasks, add a `docker-compose.yaml` in the task's `environment/` directory with nvidia device reservations. For cloud/Modal runs, also set `gpus` in `task.toml`. See the [featurebench adapter](https://github.com/harbor-framework/harbor/tree/main/adapters/featurebench) for a comprehensive example — it handles 44 GPU tasks across multiple repos with separate `_docker_cpu.yaml`, `_docker_gpu.yaml`, and `_modal.yaml` config files. 
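As a rough sketch of what the adapter side can look like, the converter might emit the compose file and the GPU hint programmatically for each GPU task it materializes. The service name, image build, GPU count, and the `[environment]` append below are illustrative assumptions (PyYAML assumed available), not Harbor requirements:

```python
from pathlib import Path

import yaml  # PyYAML


def write_gpu_environment(task_dir: Path, gpu_count: int = 1) -> None:
    """Sketch: emit environment/docker-compose.yaml with an nvidia reservation
    and append a GPU hint to task.toml for cloud/Modal runs."""
    env_dir = task_dir / "environment"
    env_dir.mkdir(parents=True, exist_ok=True)

    compose = {
        "services": {
            "main": {  # service name is an assumption; match your Dockerfile setup
                "build": {"context": ".", "dockerfile": "Dockerfile"},
                "deploy": {
                    "resources": {
                        "reservations": {
                            "devices": [{
                                "driver": "nvidia",
                                "count": gpu_count,
                                "capabilities": ["gpu"],
                            }]
                        }
                    }
                },
            }
        }
    }
    (env_dir / "docker-compose.yaml").write_text(
        yaml.safe_dump(compose, sort_keys=False)
    )

    # Assumes the template task.toml does not already contain an [environment] section.
    with open(task_dir / "task.toml", "a") as f:
        f.write(f"\n[environment]\ngpus = {gpu_count}\n")
```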
+ +## Reference Adapters by Scenario + +| Scenario | Example adapter | When to use | +|----------|----------------|-------------| +| Compatible agent already exists | `adapters/adebench/` | Upstream already supports Claude-Code / Codex / OpenHands / Gemini-CLI | +| Fork upstream + add LLM agent | `adapters/evoeval/` | LLM-based benchmark with no Harbor-compatible agent | +| Custom agent, separate dataset | `adapters/bixbench/`, `adapters/financeagent/` | Custom interaction semantics; financeagent also demos LLM-as-a-Judge | +| Custom agent in-place | `adapters/medagentbench/` | Custom HTTPAgent, no separate dataset | +| Multi-agent workflow | `adapters/cooperbench/` | Multiple agents coordinate via messaging / sidecars | +| GPU tasks | `adapters/featurebench/` | Comprehensive Docker + Modal GPU example | + ## Post-Implementation Workflow After generating valid tasks, complete these steps before submitting: -1. **Oracle Verification** — Run oracle agent across all tasks; must reach 100% pass rate. -2. **Parity Planning** — Select a representative sample (typically 10%) for parity runs. -3. **Parity Experiments** — Run target agent+model against both original harness and Harbor, using `run_.yaml`. Record all trial scores. -4. **Results Documentation** — Fill `parity_experiment.json`; upload results to the HuggingFace parity dataset. -5. **Dataset Registration** — Fork `harbor-datasets`, place tasks under `datasets//`, open a PR with `"version": "parity"` in `registry.json`. -6. **Adapter PR** — Open a PR to the Harbor repo with adapter code. -7. **README Thoroughness** — Ensure README covers Overview, Parity results, License, and Citation. +1. **Oracle Verification** — Run oracle agent across all tasks; must reach 100% pass rate. Create a WIP PR with a screenshot of results. +2. **Parity Planning** — Contact the team before running parity. They determine agents, models, number of runs, and API key provisioning. +3. **Parity Experiments** — Run target agent+model against both original harness and Harbor, using `run_.yaml`. Record all run scores. +4. **Results Documentation** — Fill `parity_experiment.json`; upload results to the HuggingFace parity dataset via the `upload-parity-experiments` skill. +5. **Dataset Registration** — Fork `harbor-datasets`, place tasks under `datasets//`, run `harbor init` to create `dataset.toml`, open a PR. +6. **Adapter PR** — Change title from `[WIP]` to `[Ready for Review] Adapter: {adapter_name}`, request review from `@Slimshilin`. +7. **README Thoroughness** — Ensure README covers Overview, Parity results, License, and Citation. The README is parsed by automation — do not add, rename, reorder, or remove template sections. See [references/adapter-anatomy.md](references/adapter-anatomy.md#6-post-implementation-workflow) for commands and YAML job config format. ## Gotchas - **Missing reward.txt write**: The most common failure. test.sh must always write to `/logs/verifier/reward.txt`, even on error paths. -- **Wrong parity_experiment.json format**: It is a JSON array `[{...}]`, not a plain object `{...}`. Using the singular filename `parity_experiment.json`, not plural. -- **Forgetting adapter_metadata.json**: This file is required and checked by the validator. Include builder contact info (email). +- **Wrong parity_experiment.json format**: It is a JSON array `[{...}]`, not a plain object `{...}`. Using the singular filename `parity_experiment.json`, not plural. Per-run score arrays are named `original_runs` / `harbor_runs` (not `_trials`). 
+- **Forgetting adapter_metadata.json**: This file is required and checked by the validator. Include `added_agents` and `parity_unmatching_agents` fields. - **Baking test dependencies into Dockerfile**: Test-only dependencies (pytest, evaluation scripts) should be installed inside `test.sh`, not in the Docker image. The `tests/` directory is uploaded separately at verification time. - **Including tests/ or solution/ in Docker image**: These directories should not be COPYed in the Dockerfile. They are injected by Harbor at runtime. -- **Unreplaced template placeholders**: After scaffolding with `harbor adapters init`, replace all `{{ADAPTER_ID}}`, `{{BENCHMARK_NAME}}`, etc. in README.md and other files. -- **Using /app vs /workspace inconsistently**: Pick one working directory and be consistent between instruction.md, test.sh, and Dockerfile WORKDIR. Some adapters use `/app`, others use `/workspace`. +- **Missing `[task]` section in task.toml**: Every generated task.toml must include a `[task]` block with a `name` field. Without it, tasks cannot be registered. The `name` field does not belong at the top level — it must be under `[task]`. +- **Unstable task names**: Task names must be deterministic across adapter runs. If upstream IDs change or aren't stable, mint a reproducible scheme from a consistent sort. +- **Using /app vs /workspace inconsistently**: Pick one working directory and be consistent between instruction.md, test.sh, and Dockerfile WORKDIR. - **Not escaping shell-unsafe characters**: Answers containing single quotes, backticks, or dollar signs can break test.sh or solve.sh. Escape them when embedding in shell scripts. - **Missing `mkdir -p /logs/verifier`**: Some base images may not have this directory. Create it in test.sh before writing reward.txt. From 3da678a4ca46bb0e86c0711f3a48e7adfcf36303 Mon Sep 17 00:00:00 2001 From: benediktstroebl <50178209+benediktstroebl@users.noreply.github.com> Date: Sun, 19 Apr 2026 08:57:19 -0700 Subject: [PATCH 2/2] Update harbor-task-creator: add [task] section to task.toml, keywords guidance, Step 9 README --- skills/harbor-task-creator/SKILL.md | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/skills/harbor-task-creator/SKILL.md b/skills/harbor-task-creator/SKILL.md index 26d1a69..bcdaf6b 100644 --- a/skills/harbor-task-creator/SKILL.md +++ b/skills/harbor-task-creator/SKILL.md @@ -38,7 +38,7 @@ my-task/ │ └── test_*.py # Pytest files (optional) ├── solution/ │ └── solve.sh # Reference solution (optional) -└── README.md # Human documentation (optional) +└── README.md # Human documentation (fill in before publishing) ``` **Required files:** `task.toml`, `instruction.md`, `environment/` (with Dockerfile or docker-compose.yaml), `tests/test.sh`. @@ -106,6 +106,11 @@ The task.toml file controls timeouts, resources, metadata, and MCP server config ```toml version = "1.0" +[task] +name = "/" # required for registry +description = "One-line description" +keywords = ["python", "debugging", "pytest"] # always populate — 3–8 tokens + [metadata] difficulty = "easy" category = "programming" @@ -117,9 +122,12 @@ timeout_sec = 300.0 timeout_sec = 120.0 ``` +**Always populate `keywords`.** Pick 3–8 lowercase tokens covering the domain (language/framework/benchmark family), the verifier style (`rewardkit`, `judge-grading`, `pytest`), and any notable hardware (`gpu`). Keywords are surfaced in `harbor datasets list` and registry search — leaving them empty makes the task invisible to search. 
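If you want a quick pre-publish sanity check, a few lines of Python are enough to catch a missing `[task]` block or empty `keywords`. This is an illustrative local script, not a harbor command, and the name pattern is an assumption about the `dataset/task-id` shape — adjust it to your registry's rules:

```python
import re
import sys
import tomllib  # Python 3.11+
from pathlib import Path


def check_task_section(task_dir: str = ".") -> None:
    cfg = tomllib.loads(Path(task_dir, "task.toml").read_text())
    task = cfg.get("task", {})

    name = task.get("name", "")
    # Assumed shape: lowercase dataset/task-id.
    if not re.fullmatch(r"[a-z0-9_-]+/[a-z0-9._-]+", name):
        sys.exit(f"[task].name missing or malformed: {name!r}")

    keywords = task.get("keywords", [])
    if not 3 <= len(keywords) <= 8 or any(k != k.lower() for k in keywords):
        sys.exit(f"keywords should be 3-8 lowercase tokens, got: {keywords}")

    print(f"ok: {name} ({len(keywords)} keywords)")


if __name__ == "__main__":
    check_task_section(sys.argv[1] if len(sys.argv) > 1 else ".")
```

Run it from the task directory before `harbor tasks check` so the cheap structural problems are gone before the LLM rubric pass.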
+ **Full configuration — see [references/task-toml-reference.md](references/task-toml-reference.md) for every field, type, and default.** Key sections: +- `[task]` — `name` (string, required for registry), `description` (string), `keywords` (string array, always populate) - `[metadata]` — free-form dict; conventional keys: difficulty, category, tags, author info, time estimates - `[agent]` — `timeout_sec` (float, default 600.0): wall-clock seconds for the agent - `[verifier]` — `timeout_sec` (float, default 600.0) and `env` (dict): timeout and environment variables for test.sh @@ -284,7 +292,7 @@ TOTAL=3 echo "scale=2; $SCORE / $TOTAL" | bc > /logs/verifier/reward.txt # Or use reward.json for named sub-scores (pick one approach, not both) -# echo "{\"reward\": $(echo "scale=2; $SCORE/$TOTAL" | bc), \"completeness\": $SCORE}" \ +# echo "{\"reward\": $(echo \"scale=2; $SCORE/$TOTAL\" | bc), \"completeness\": $SCORE}" \ # > /logs/verifier/reward.json ``` @@ -346,6 +354,18 @@ harbor tasks debug -m sonnet --tasks-dir ./tasks After running a job with real agents, use this to diagnose whether repeated failures are caused by an unclear or insufficient instruction. +## Step 9: Update README.md + +`harbor tasks init` leaves `README.md` as a stub. Before publishing or sharing the task, populate it so future humans (and agents) can understand the task without reading every file. Include: + +- **What the agent does** — one paragraph, link to `instruction.md`. +- **Environment** — base image, key installed packages, cached data, hardware (GPU/CPU/RAM), agent timeout. +- **Verifier** — for Reward Kit tasks, a table of reward dimensions with type (programmatic / LLM judge / agent judge) and what each measures; how they're aggregated. +- **Layout** — a tree of the task directory with one-line annotations. +- **Running** — the concrete `harbor run` commands (Oracle + real agent), with the right provider flag if the task needs a GPU. + +Treat this as docs, not marketing — the reader wants to know *what they'd need to change* to modify the task. + ## Quality Checklist Harbor's `QualityChecker` uses an 11-point rubric (defined in `default_rubric.toml`). Each criterion is evaluated by an LLM judge as pass/fail/not_applicable. Verify these before publishing: @@ -376,7 +396,7 @@ Run the checker: `harbor tasks check ./my-task` **GPU tasks:** For ML/CUDA workloads. Set `gpus` and `gpu_types` in `[environment]`. Only Modal and GKE backends support GPUs — `DockerEnvironment` does not. Use `--environment-type modal` when running. -**LLM-as-judge tasks:** Open-ended outputs evaluated by another LLM. Custom logic in test.sh calls an LLM API and writes a normalized score to reward.txt. +**LLM-as-judge tasks:** Open-ended outputs evaluated by another LLM. Custom logic in test.sh calls an LLM API and writes a normalized score to reward.txt. Add the judge API key to `[verifier.env]` in task.toml. 
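A minimal judge sketch that test.sh could invoke (e.g., `python3 llm_judge.py`). The file paths, environment variable names, and model are placeholders, and an OpenAI-compatible judge is assumed — the parts that matter are that the key arrives via `[verifier.env]` and that reward.txt gets written on every path, including judge failures:

```python
#!/usr/bin/env python3
"""Illustrative LLM-judge verifier; assumes an OpenAI-compatible judge API."""
import os
from pathlib import Path

from openai import OpenAI  # reads OPENAI_API_KEY, e.g. supplied via [verifier.env]

REWARD = Path("/logs/verifier/reward.txt")
REWARD.parent.mkdir(parents=True, exist_ok=True)

try:
    answer = Path("/app/answer.md").read_text()  # output path stated in instruction.md
    rubric = Path(__file__).with_name("rubric.md").read_text()  # shipped in tests/
    prompt = (
        "Score the answer against the rubric from 0 to 10. "
        "Reply with the number only.\n\n"
        f"Rubric:\n{rubric}\n\nAnswer:\n{answer}"
    )
    resp = OpenAI().chat.completions.create(
        model=os.environ.get("JUDGE_MODEL", "gpt-4o-mini"),  # illustrative default
        messages=[{"role": "user", "content": prompt}],
    )
    score = float(resp.choices[0].message.content.strip())
    REWARD.write_text(f"{max(0.0, min(score, 10.0)) / 10.0:.2f}\n")  # normalize to [0, 1]
except Exception:
    REWARD.write_text("0\n")  # never skip the reward write, even when the judge fails
```

Wrapping the entire judge call in try/except is what guarantees a reward file exists even when the judge API is unreachable or returns something unparsable.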
## Common Pitfalls @@ -393,6 +413,9 @@ These are the mistakes that trip up task authors most often: | Using `rewards.json` | Harbor looks for `reward.json` (singular) | Use `reward.txt` or `reward.json` | | Missing output file path in instruction | Agent doesn't know where to write results | Explicitly state every file path the tests will check | | Requesting GPUs on Docker | Runtime error: environment doesn't support GPUs | Use `--environment-type modal` or `gke` for GPU tasks | +| Leaving `keywords = []` in task.toml | Task is invisible to registry search | Always populate with 3–8 lowercase tokens | +| Missing `[task]` section in task.toml | `name` field is silently ignored (not top-level) | Put `name`, `description`, `keywords` under `[task]`, not at TOML top level | +| Leaving README.md as a stub | Teammates can't understand the task at a glance | Fill in before publishing (see Step 9 above) | ## Complete Example Walkthroughs