diff --git a/environments/README.md b/environments/README.md index 503d65b37..f652c5aa0 100644 --- a/environments/README.md +++ b/environments/README.md @@ -60,6 +60,7 @@ This folder contains installable example environments that showcase common usage - **bfcl_v3**: BFCL v3 function-calling eval using task-local dynamic tool schemas and v1 rewards. - **dspy_flights**: Sandboxed DSPy flight-support `program.fn` entrypoint installed from its package `pyproject.toml` and configured against the v1 interception endpoint. - **hello_group_reward_v1**: Deterministic v1 reference for group updates, metrics, rewards, advantages, and cleanup. + - **mle_bench**: MLE-Bench competition-submission taskset using the benchmark CSV submission contract and validator. - **tau2_bench_v1**: `tau2-bench-v1` τ²-bench taskset/user/tool pattern on the v1 harness runtime. ### Composition @@ -94,7 +95,7 @@ This folder contains installable example environments that showcase common usage - **CLI agent sandboxes**: `opencode_harbor`, `terminus_harbor`, `hello_mcp_harbor` - **MCP integration**: `mcp_search_env`, `hello_mcp_harbor` - **RLM (recursive LLM)**: `rlm_secrets` -- **Taskset/Harness v1**: use this pattern for new environments that need reusable tasksets, reusable harnesses, framework programs, endpoint interception, or sandboxed Python/command programs. Examples include `dspy_rlm`, `openai_agents_env`, `langchain_deep_agents_wikispeedia`, `reverse_text`, `alphabet_sort`, `wiki_search`, `math_python`, `mcp_search_env`, `opencode_harbor`, `bfcl_v3`, `hello_subagent_v1`, `nested_harness_v1`, `hello_self_judge_v1`, `hello_parallel_sandbox_v1`, `hello_group_reward_v1`, `hello_rlm_v1`, `rlm_swe_v1`, `dspy_flights`, and `tau2-bench-v1`. +- **Taskset/Harness v1**: use this pattern for new environments that need reusable tasksets, reusable harnesses, framework programs, endpoint interception, or sandboxed Python/command programs. Examples include `dspy_rlm`, `openai_agents_env`, `langchain_deep_agents_wikispeedia`, `reverse_text`, `alphabet_sort`, `wiki_search`, `math_python`, `mcp_search_env`, `opencode_harbor`, `bfcl_v3`, `hello_subagent_v1`, `nested_harness_v1`, `hello_self_judge_v1`, `hello_parallel_sandbox_v1`, `hello_group_reward_v1`, `hello_rlm_v1`, `rlm_swe_v1`, `mle_bench`, `dspy_flights`, and `tau2-bench-v1`. - `opencode_harbor` uses the packaged `vf.HarborTaskset` + `vf.OpenCode` boundary. These reusable implementations live under `verifiers.v1.packages` and are re-exported from `verifiers.v1`. - **Environment and rubric composition**: `math_group`, `math_python`, `wiki_search` - **Procedural datasets**: `reasoning_gym_env` diff --git a/environments/mle_bench/README.md b/environments/mle_bench/README.md new file mode 100644 index 000000000..90f4821c6 --- /dev/null +++ b/environments/mle_bench/README.md @@ -0,0 +1,41 @@ +# mle-bench + +V1 taskset/harness environment for MLE-Bench competition submissions. + +```python +from mle_bench import load_environment + +env = load_environment() +``` + +The taskset represents each MLE-Bench competition as a sandboxed machine-learning +engineering task. The model receives the benchmark-level instructions plus the +competition description and must create: + +```text +/home/submission/submission.csv +``` + +When run in an image that has MLE-Bench data and the validation server/script +available, the reward calls: + +```bash +/home/validate_submission.sh /home/submission/submission.csv +``` + +and gives reward `1.0` only when the benchmark validator accepts the submission. 
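+Acceptance requires the script to exit `0` and to print the success line that
+`validator_accepts` looks for in the script's output:
+
+```text
+Submission is valid.
+```
+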
+This keeps the environment aligned with the official MLE-Bench submission +contract without downloading Kaggle data during import or local unit tests. + +For handoff to the benchmark grader, `grading_submission_row(task)` returns the +JSONL row expected by `mlebench grade`: + +```json +{"competition_id": "spaceship-titanic", "submission_path": "/home/submission/submission.csv"} +``` + +By default, the environment uses the low-complexity/lite split. If the +`mlebench` Python package is installed, descriptions are loaded from its +registry. Otherwise the built-in competition IDs are still exposed so the +environment can be imported and tested without the upstream repo or Kaggle +credentials. diff --git a/environments/mle_bench/mle_bench.py b/environments/mle_bench/mle_bench.py new file mode 100644 index 000000000..ccc8fa95b --- /dev/null +++ b/environments/mle_bench/mle_bench.py @@ -0,0 +1,364 @@ +from __future__ import annotations + +import shlex +from collections.abc import Mapping, Sequence +from typing import Any + +import verifiers.v1 as vf + +TASKSET_ID = "mle-bench" +DEFAULT_SPLIT = "low" +DEFAULT_IMAGE = "mlebench-env" +DEFAULT_WORKDIR = "/home" +DEFAULT_SUBMISSION_PATH = "/home/submission/submission.csv" +DEFAULT_VALIDATE_SCRIPT = "/home/validate_submission.sh" + +LOW_COMPETITIONS = [ + "aerial-cactus-identification", + "aptos2019-blindness-detection", + "denoising-dirty-documents", + "detecting-insults-in-social-commentary", + "dog-breed-identification", + "dogs-vs-cats-redux-kernels-edition", + "histopathologic-cancer-detection", + "jigsaw-toxic-comment-classification-challenge", + "leaf-classification", + "mlsp-2013-birds", + "new-york-city-taxi-fare-prediction", + "nomad2018-predict-transparent-conductors", + "plant-pathology-2020-fgvc7", + "random-acts-of-pizza", + "ranzcr-clip-catheter-line-classification", + "siim-isic-melanoma-classification", + "spooky-author-identification", + "tabular-playground-series-dec-2021", + "tabular-playground-series-may-2022", + "text-normalization-challenge-english-language", + "text-normalization-challenge-russian-language", + "the-icml-2013-whale-challenge-right-whale-redux", +] + +DEV_COMPETITIONS = [ + "invasive-species-monitoring", + "ml2021spring-hw2", + "movie-review-sentiment-analysis-kernels-only", + "paddy-disease-classification", + "plant-seedlings-classification", + "playground-series-s3e18", + "spaceship-titanic", +] + +BENCHMARK_INSTRUCTIONS = """\ +You are participating in MLE-Bench, an offline version of Kaggle competitions +adapted to test machine-learning engineering agents. + +Read the competition-specific instructions, inspect the dataset files, train or +construct a model, and write your final submission as a CSV in the +benchmark-required format. You may use the provided validator to check that the +submission file is structurally valid, but the validator does not reveal the +score. + +Do not hand-label the test set, copy external solutions, or use information that +would violate the benchmark rules. 
The final score is determined by the +competition metric from the benchmark grader.""" + + +def split_ids(split: str, competition_ids: Sequence[str] | None = None) -> list[str]: + if competition_ids is not None: + return list(competition_ids) + if split == "dev": + return list(DEV_COMPETITIONS) + if split in {"low", "lite"}: + return list(LOW_COMPETITIONS) + if split == "all": + return load_registry_ids() or list(LOW_COMPETITIONS) + raise ValueError(f"Unknown MLE-Bench split: {split}") + + +def load_registry_ids() -> list[str] | None: + try: + from mlebench.registry import registry + + return list(registry.list_competition_ids()) + except Exception: + return None + + +def load_registry_competition(competition_id: str) -> Mapping[str, Any] | None: + try: + from mlebench.registry import registry + + competition = registry.get_competition(competition_id) + return { + "id": competition.id, + "name": competition.name, + "description": competition.description, + "competition_type": competition.competition_type, + "sample_submission": str(competition.sample_submission), + "answers": str(competition.answers), + } + except Exception: + return None + + +def build_prompt( + competition_id: str, + description: str, + workdir: str, + submission_path: str, + validate_script: str, +) -> str: + data_dir = f"{workdir.rstrip('/')}/data" + description_path = f"{data_dir}/description.md" + return ( + f"Competition ID: {competition_id}\n\n" + f"{BENCHMARK_INSTRUCTIONS}\n\n" + f"Competition instructions path: {description_path}\n" + f"Dataset directory: {data_dir}\n" + f"Required submission path: {submission_path}\n" + f"Validation command: {validate_script} {submission_path}\n\n" + "Competition description:\n" + f"{description.strip() or '(description unavailable in this runtime)'}" + ) + + +def make_record( + competition_id: str, + split: str, + image: str, + workdir: str, + submission_path: str, + validate_script: str, +) -> dict[str, Any]: + registry_data = load_registry_competition(competition_id) or {} + description = str(registry_data.get("description") or "") + info = { + "competition_id": competition_id, + "split": split, + "competition_type": registry_data.get("competition_type"), + "sample_submission": registry_data.get("sample_submission"), + "answers": registry_data.get("answers"), + "workdir": workdir, + "submission_path": submission_path, + "validate_script": validate_script, + } + return { + "task_id": competition_id, + "prompt": [ + { + "role": "user", + "content": build_prompt( + competition_id, + description, + workdir, + submission_path, + validate_script, + ), + } + ], + "question": build_prompt( + competition_id, + description, + workdir, + submission_path, + validate_script, + ), + "answer": submission_path, + "info": info, + "sandbox": { + "image": image, + "cpu_cores": 36, + "memory_gb": 440, + "disk_size_gb": 256, + "gpu_count": 1, + "workdir": workdir, + "scope": "rollout", + "timeout_minutes": 1440, + }, + "program": {"env": {"AGENT_WORKDIR": workdir}}, + } + + +def grading_submission_row(task: Mapping[str, Any]) -> dict[str, str]: + info = task["info"] + return { + "competition_id": str(info["competition_id"]), + "submission_path": str(info["submission_path"]), + } + + +def grading_submission_jsonl(task: Mapping[str, Any]) -> str: + import json + + return json.dumps(grading_submission_row(task), sort_keys=True) + "\n" + + +class MLEBenchTaskset(vf.Taskset): + def __init__( + self, + split: str = DEFAULT_SPLIT, + competition_ids: Sequence[str] | None = None, + image: str = 
DEFAULT_IMAGE, + workdir: str = DEFAULT_WORKDIR, + submission_path: str = DEFAULT_SUBMISSION_PATH, + validate_script: str = DEFAULT_VALIDATE_SCRIPT, + limit: int | None = None, + config: vf.TasksetConfig | None = None, + ): + self.split = split + self.competition_ids = split_ids(split, competition_ids) + if limit is not None and limit >= 0: + self.competition_ids = self.competition_ids[:limit] + self.image = image + self.workdir = workdir + self.submission_path = submission_path + self.validate_script = validate_script + super().__init__( + source=self.load_rows, + taskset_id=TASKSET_ID, + system_prompt=BENCHMARK_INSTRUCTIONS, + metrics=[submission_exists, submission_nonempty, validator_available], + rewards=[valid_submission], + config=config, + ) + + def load_rows(self) -> list[dict[str, Any]]: + return [ + make_record( + competition_id, + self.split, + self.image, + self.workdir, + self.submission_path, + self.validate_script, + ) + for competition_id in self.competition_ids + ] + + @vf.setup(priority=250) + async def capture_sandbox(self, task, state, sandbox=None) -> None: + if sandbox is not None: + state["_mle_bench_sandbox"] = sandbox + + @vf.cleanup(priority=100) + async def cleanup_sandbox(self, task, state) -> None: + state.pop("_mle_bench_sandbox", None) + + +async def submission_exists(task: vf.Task, state: vf.State) -> float: + sandbox = state.get("_mle_bench_sandbox") + if sandbox is None: + return 0.0 + submission_path = str(task["info"]["submission_path"]) + result = await sandbox.execute( + f"test -f {shlex.quote(submission_path)}", + timeout=30, + ) + return 1.0 if result.exit_code == 0 else 0.0 + + +async def submission_nonempty(task: vf.Task, state: vf.State) -> float: + sandbox = state.get("_mle_bench_sandbox") + if sandbox is None: + return 0.0 + submission_path = str(task["info"]["submission_path"]) + result = await sandbox.execute( + f"test -s {shlex.quote(submission_path)}", + timeout=30, + ) + return 1.0 if result.exit_code == 0 else 0.0 + + +async def validator_available(task: vf.Task, state: vf.State) -> float: + sandbox = state.get("_mle_bench_sandbox") + if sandbox is None: + return 0.0 + validate_script = str(task["info"]["validate_script"]) + result = await sandbox.execute( + f"test -x {shlex.quote(validate_script)}", + timeout=30, + ) + return 1.0 if result.exit_code == 0 else 0.0 + + +async def valid_submission(task: vf.Task, state: vf.State) -> float: + sandbox = state.get("_mle_bench_sandbox") + if sandbox is None: + return 0.0 + info = task["info"] + workdir = str(info["workdir"]) + submission_path = str(info["submission_path"]) + validate_script = str(info["validate_script"]) + command = f"{shlex.quote(validate_script)} {shlex.quote(submission_path)}" + result = await sandbox.execute(command, timeout=300, working_dir=workdir) + state["validation_stdout"] = result.stdout or "" + state["validation_stderr"] = result.stderr or "" + state["validation_exit_code"] = result.exit_code + return 1.0 if result.exit_code == 0 and validator_accepts(result.stdout) else 0.0 + + +def validator_accepts(stdout: str | None) -> bool: + for line in (stdout or "").splitlines(): + if line.strip().lower() == "submission is valid.": + return True + return False + + +def load_taskset( + split: str = DEFAULT_SPLIT, + competition_ids: Sequence[str] | None = None, + image: str = DEFAULT_IMAGE, + workdir: str = DEFAULT_WORKDIR, + submission_path: str = DEFAULT_SUBMISSION_PATH, + validate_script: str = DEFAULT_VALIDATE_SCRIPT, + limit: int | None = None, + config: vf.TasksetConfig | 
None = None, +) -> MLEBenchTaskset: + return MLEBenchTaskset( + split=split, + competition_ids=competition_ids, + image=image, + workdir=workdir, + submission_path=submission_path, + validate_script=validate_script, + limit=limit, + config=config, + ) + + +def load_harness( + max_turns: int | None = None, + config: vf.HarnessConfig | None = None, +) -> vf.OpenCode: + return vf.OpenCode( + sandbox=True, + max_turns=max_turns, + config=config, + ) + + +def load_environment( + split: str = DEFAULT_SPLIT, + competition_ids: Sequence[str] | None = None, + image: str = DEFAULT_IMAGE, + workdir: str = DEFAULT_WORKDIR, + submission_path: str = DEFAULT_SUBMISSION_PATH, + validate_script: str = DEFAULT_VALIDATE_SCRIPT, + limit: int | None = None, + max_turns: int | None = None, + config: vf.EnvConfig | None = None, +) -> vf.Env: + config = config or vf.EnvConfig() + return vf.Env( + taskset=load_taskset( + split=split, + competition_ids=competition_ids, + image=image, + workdir=workdir, + submission_path=submission_path, + validate_script=validate_script, + limit=limit, + config=config.taskset, + ), + harness=load_harness(max_turns=max_turns, config=config.harness), + ) diff --git a/environments/mle_bench/pyproject.toml b/environments/mle_bench/pyproject.toml new file mode 100644 index 000000000..853b12bfe --- /dev/null +++ b/environments/mle_bench/pyproject.toml @@ -0,0 +1,27 @@ +[project] +name = "mle-bench" +version = "0.1.0" +description = "MLE-Bench competition-submission environment" +license = "Apache-2.0" +tags = ["mle", "kaggle", "coding", "benchmark"] +requires-python = ">=3.11" +dependencies = [ + "verifiers", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build] +include = ["mle_bench.py", "README.md", "pyproject.toml"] + +[project.entry-points."verifiers.environments"] +mle-bench = "mle_bench:load_environment" + +[tool.verifiers.eval] +num_examples = 7 +rollouts_per_example = 1 + +[tool.uv.sources] +verifiers = { path = "../..", editable = true } diff --git a/tests/test_mle_bench_environment.py b/tests/test_mle_bench_environment.py new file mode 100644 index 000000000..f424c1717 --- /dev/null +++ b/tests/test_mle_bench_environment.py @@ -0,0 +1,227 @@ +from __future__ import annotations + +import importlib +import sys +from pathlib import Path + +import verifiers.v1 as vf + + +def load_module(monkeypatch): + env_dir = Path(__file__).parents[1] / "environments" / "mle_bench" + monkeypatch.syspath_prepend(str(env_dir)) + sys.modules.pop("mle_bench", None) + return importlib.import_module("mle_bench") + + +class Result: + def __init__(self, exit_code: int, stdout: str = "", stderr: str = ""): + self.exit_code = exit_code + self.stdout = stdout + self.stderr = stderr + + +class RecordingSandbox: + def __init__(self, results: list[Result]): + self.results = list(results) + self.calls = [] + + async def execute(self, command: str, **kwargs): + self.calls.append({"command": command, **kwargs}) + return self.results.pop(0) + + +def test_mle_bench_loads_low_split_without_registry(monkeypatch): + module = load_module(monkeypatch) + monkeypatch.setattr(module, "load_registry_competition", lambda _id: None) + + env = module.load_environment(limit=2, image="local-mlebench") + rows = list(env.taskset.source()) + + assert isinstance(env, vf.Env) + assert isinstance(env.harness, vf.OpenCode) + assert env.taskset.taskset_id == "mle-bench" + assert [row["task_id"] for row in rows] == module.LOW_COMPETITIONS[:2] + assert rows[0]["sandbox"]["image"] == 
"local-mlebench" + assert rows[0]["sandbox"]["workdir"] == "/home" + assert rows[0]["program"]["env"]["AGENT_WORKDIR"] == "/home" + assert "/home/submission/submission.csv" in rows[0]["prompt"][0]["content"] + + +def test_mle_bench_prompt_uses_configured_paths(monkeypatch): + module = load_module(monkeypatch) + monkeypatch.setattr(module, "load_registry_competition", lambda _id: None) + + row = module.make_record( + "custom-competition", + "dev", + "mlebench-env", + "/workspace", + "/workspace/out/final.csv", + "/workspace/tools/check.sh", + ) + prompt = row["prompt"][0]["content"] + + assert "/workspace/data/description.md" in prompt + assert "/workspace/data" in prompt + assert "/workspace/out/final.csv" in prompt + assert "/workspace/tools/check.sh /workspace/out/final.csv" in prompt + assert "/home/submission/submission.csv" not in prompt + assert "/home/validate_submission.sh" not in prompt + + +def test_mle_bench_uses_registry_metadata_when_available(monkeypatch): + module = load_module(monkeypatch) + + def fake_registry(competition_id): + return { + "id": competition_id, + "name": "Example Competition", + "description": "Predict the label.", + "competition_type": "simple", + "sample_submission": "/data/sample_submission.csv", + "answers": "/data/test.csv", + } + + monkeypatch.setattr(module, "load_registry_competition", fake_registry) + row = module.make_record( + "example-competition", + "dev", + "mlebench-env", + "/home", + "/home/submission/submission.csv", + "/home/validate_submission.sh", + ) + + assert row["info"]["competition_type"] == "simple" + assert row["info"]["sample_submission"] == "/data/sample_submission.csv" + assert "Predict the label." in row["prompt"][0]["content"] + + +def test_mle_bench_grading_submission_row(monkeypatch): + module = load_module(monkeypatch) + monkeypatch.setattr(module, "load_registry_competition", lambda _id: None) + row = module.make_record( + "spaceship-titanic", + "dev", + "mlebench-env", + "/home", + "/home/submission/submission.csv", + "/home/validate_submission.sh", + ) + + assert module.grading_submission_row(row) == { + "competition_id": "spaceship-titanic", + "submission_path": "/home/submission/submission.csv", + } + assert module.grading_submission_jsonl(row) == ( + '{"competition_id": "spaceship-titanic", ' + '"submission_path": "/home/submission/submission.csv"}\n' + ) + + +def test_mle_bench_split_ids_reject_unknown_split(monkeypatch): + module = load_module(monkeypatch) + + try: + module.split_ids("missing") + except ValueError as exc: + assert "Unknown MLE-Bench split" in str(exc) + else: + raise AssertionError("expected ValueError") + + +async def test_mle_bench_valid_submission_records_validator_output(monkeypatch): + module = load_module(monkeypatch) + taskset = module.load_taskset(competition_ids=["spaceship-titanic"]) + task = vf.Task(list(taskset.source())[0]) + state = vf.State.for_task(task) + sandbox = RecordingSandbox([Result(0, "Submission is valid.")]) + state["_mle_bench_sandbox"] = sandbox + + reward = await module.valid_submission(task, state) + + assert reward == 1.0 + assert state["validation_exit_code"] == 0 + assert state["validation_stdout"] == "Submission is valid." 
+ assert sandbox.calls[0]["command"] == ( + "/home/validate_submission.sh /home/submission/submission.csv" + ) + assert sandbox.calls[0]["working_dir"] == "/home" + + +async def test_mle_bench_valid_submission_uses_configured_workdir(monkeypatch): + module = load_module(monkeypatch) + taskset = module.load_taskset( + competition_ids=["spaceship-titanic"], + workdir="/workspace", + submission_path="/workspace/submission/submission.csv", + validate_script="/workspace/validate_submission.sh", + ) + task = vf.Task(list(taskset.source())[0]) + state = vf.State.for_task(task) + sandbox = RecordingSandbox([Result(0, "Submission is valid.")]) + state["_mle_bench_sandbox"] = sandbox + + assert await module.valid_submission(task, state) == 1.0 + assert sandbox.calls[0]["command"] == ( + "/workspace/validate_submission.sh /workspace/submission/submission.csv" + ) + assert sandbox.calls[0]["working_dir"] == "/workspace" + + +async def test_mle_bench_invalid_submission_gets_zero_reward(monkeypatch): + module = load_module(monkeypatch) + taskset = module.load_taskset(competition_ids=["spaceship-titanic"]) + task = vf.Task(list(taskset.source())[0]) + state = vf.State.for_task(task) + state["_mle_bench_sandbox"] = RecordingSandbox([Result(1, "bad csv")]) + + assert await module.valid_submission(task, state) == 0.0 + assert state["validation_exit_code"] == 1 + + +async def test_mle_bench_invalid_stdout_does_not_match_valid(monkeypatch): + module = load_module(monkeypatch) + taskset = module.load_taskset(competition_ids=["spaceship-titanic"]) + task = vf.Task(list(taskset.source())[0]) + state = vf.State.for_task(task) + state["_mle_bench_sandbox"] = RecordingSandbox( + [Result(0, "Submission invalid! bad csv")] + ) + + assert await module.valid_submission(task, state) == 0.0 + + +def test_mle_bench_validator_accepts_exact_success_line(monkeypatch): + module = load_module(monkeypatch) + + assert module.validator_accepts("Submission is valid.\n") + assert not module.validator_accepts("Submission invalid! bad csv") + assert not module.validator_accepts("not valid") + + +async def test_mle_bench_submission_exists_metric(monkeypatch): + module = load_module(monkeypatch) + taskset = module.load_taskset(competition_ids=["spaceship-titanic"]) + task = vf.Task(list(taskset.source())[0]) + state = vf.State.for_task(task) + sandbox = RecordingSandbox([Result(0)]) + state["_mle_bench_sandbox"] = sandbox + + assert await module.submission_exists(task, state) == 1.0 + assert sandbox.calls[0]["command"] == "test -f /home/submission/submission.csv" + + +async def test_mle_bench_sandbox_metrics(monkeypatch): + module = load_module(monkeypatch) + taskset = module.load_taskset(competition_ids=["spaceship-titanic"]) + task = vf.Task(list(taskset.source())[0]) + state = vf.State.for_task(task) + sandbox = RecordingSandbox([Result(0), Result(1)]) + state["_mle_bench_sandbox"] = sandbox + + assert await module.submission_nonempty(task, state) == 1.0 + assert await module.validator_available(task, state) == 0.0 + assert sandbox.calls[0]["command"] == "test -s /home/submission/submission.csv" + assert sandbox.calls[1]["command"] == "test -x /home/validate_submission.sh"
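+
+
+async def test_mle_bench_metrics_default_to_zero_without_sandbox(monkeypatch):
+    # Illustrative sketch of the no-sandbox fallback: if setup never captured a
+    # sandbox handle, every metric and the reward return 0.0 instead of raising.
+    module = load_module(monkeypatch)
+    taskset = module.load_taskset(competition_ids=["spaceship-titanic"])
+    task = vf.Task(list(taskset.source())[0])
+    state = vf.State.for_task(task)
+
+    assert await module.submission_exists(task, state) == 0.0
+    assert await module.submission_nonempty(task, state) == 0.0
+    assert await module.validator_available(task, state) == 0.0
+    assert await module.valid_submission(task, state) == 0.0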