diff --git a/environments/README.md b/environments/README.md index 503d65b37..f652c5aa0 100644 --- a/environments/README.md +++ b/environments/README.md @@ -60,6 +60,7 @@ This folder contains installable example environments that showcase common usage - **bfcl_v3**: BFCL v3 function-calling eval using task-local dynamic tool schemas and v1 rewards. - **dspy_flights**: Sandboxed DSPy flight-support `program.fn` entrypoint installed from its package `pyproject.toml` and configured against the v1 interception endpoint. - **hello_group_reward_v1**: Deterministic v1 reference for group updates, metrics, rewards, advantages, and cleanup. + - **mle_bench**: MLE-Bench competition-submission taskset using the benchmark CSV submission contract and validator. - **tau2_bench_v1**: `tau2-bench-v1` τ²-bench taskset/user/tool pattern on the v1 harness runtime. ### Composition @@ -94,7 +95,7 @@ This folder contains installable example environments that showcase common usage - **CLI agent sandboxes**: `opencode_harbor`, `terminus_harbor`, `hello_mcp_harbor` - **MCP integration**: `mcp_search_env`, `hello_mcp_harbor` - **RLM (recursive LLM)**: `rlm_secrets` -- **Taskset/Harness v1**: use this pattern for new environments that need reusable tasksets, reusable harnesses, framework programs, endpoint interception, or sandboxed Python/command programs. Examples include `dspy_rlm`, `openai_agents_env`, `langchain_deep_agents_wikispeedia`, `reverse_text`, `alphabet_sort`, `wiki_search`, `math_python`, `mcp_search_env`, `opencode_harbor`, `bfcl_v3`, `hello_subagent_v1`, `nested_harness_v1`, `hello_self_judge_v1`, `hello_parallel_sandbox_v1`, `hello_group_reward_v1`, `hello_rlm_v1`, `rlm_swe_v1`, `dspy_flights`, and `tau2-bench-v1`. +- **Taskset/Harness v1**: use this pattern for new environments that need reusable tasksets, reusable harnesses, framework programs, endpoint interception, or sandboxed Python/command programs. Examples include `dspy_rlm`, `openai_agents_env`, `langchain_deep_agents_wikispeedia`, `reverse_text`, `alphabet_sort`, `wiki_search`, `math_python`, `mcp_search_env`, `opencode_harbor`, `bfcl_v3`, `hello_subagent_v1`, `nested_harness_v1`, `hello_self_judge_v1`, `hello_parallel_sandbox_v1`, `hello_group_reward_v1`, `hello_rlm_v1`, `rlm_swe_v1`, `mle_bench`, `dspy_flights`, and `tau2-bench-v1`. - `opencode_harbor` uses the packaged `vf.HarborTaskset` + `vf.OpenCode` boundary. These reusable implementations live under `verifiers.v1.packages` and are re-exported from `verifiers.v1`. - **Environment and rubric composition**: `math_group`, `math_python`, `wiki_search` - **Procedural datasets**: `reasoning_gym_env` diff --git a/environments/mle_bench/README.md b/environments/mle_bench/README.md new file mode 100644 index 000000000..90f4821c6 --- /dev/null +++ b/environments/mle_bench/README.md @@ -0,0 +1,41 @@ +# mle-bench + +V1 taskset/harness environment for MLE-Bench competition submissions. + +```python +from mle_bench import load_environment + +env = load_environment() +``` + +The taskset represents each MLE-Bench competition as a sandboxed machine-learning +engineering task. The model receives the benchmark-level instructions plus the +competition description and must create: + +```text +/home/submission/submission.csv +``` + +When run in an image that has MLE-Bench data and the validation server/script +available, the reward calls: + +```bash +/home/validate_submission.sh /home/submission/submission.csv +``` + +and gives reward `1.0` only when the benchmark validator accepts the submission. 
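+Acceptance requires the script to exit `0` and to print the success line that
+`validator_accepts` looks for in the script's output:
+
+```text
+Submission is valid.
+```
+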
+This keeps the environment aligned with the official MLE-Bench submission +contract without downloading Kaggle data during import or local unit tests. + +For handoff to the benchmark grader, `grading_submission_row(task)` returns the +JSONL row expected by `mlebench grade`: + +```json +{"competition_id": "spaceship-titanic", "submission_path": "/home/submission/submission.csv"} +``` + +By default, the environment uses the low-complexity/lite split. If the +`mlebench` Python package is installed, descriptions are loaded from its +registry. Otherwise the built-in competition IDs are still exposed so the +environment can be imported and tested without the upstream repo or Kaggle +credentials. diff --git a/environments/mle_bench/mle_bench.py b/environments/mle_bench/mle_bench.py new file mode 100644 index 000000000..ccc8fa95b --- /dev/null +++ b/environments/mle_bench/mle_bench.py @@ -0,0 +1,364 @@ +from __future__ import annotations + +import shlex +from collections.abc import Mapping, Sequence +from typing import Any + +import verifiers.v1 as vf + +TASKSET_ID = "mle-bench" +DEFAULT_SPLIT = "low" +DEFAULT_IMAGE = "mlebench-env" +DEFAULT_WORKDIR = "/home" +DEFAULT_SUBMISSION_PATH = "/home/submission/submission.csv" +DEFAULT_VALIDATE_SCRIPT = "/home/validate_submission.sh" + +LOW_COMPETITIONS = [ + "aerial-cactus-identification", + "aptos2019-blindness-detection", + "denoising-dirty-documents", + "detecting-insults-in-social-commentary", + "dog-breed-identification", + "dogs-vs-cats-redux-kernels-edition", + "histopathologic-cancer-detection", + "jigsaw-toxic-comment-classification-challenge", + "leaf-classification", + "mlsp-2013-birds", + "new-york-city-taxi-fare-prediction", + "nomad2018-predict-transparent-conductors", + "plant-pathology-2020-fgvc7", + "random-acts-of-pizza", + "ranzcr-clip-catheter-line-classification", + "siim-isic-melanoma-classification", + "spooky-author-identification", + "tabular-playground-series-dec-2021", + "tabular-playground-series-may-2022", + "text-normalization-challenge-english-language", + "text-normalization-challenge-russian-language", + "the-icml-2013-whale-challenge-right-whale-redux", +] + +DEV_COMPETITIONS = [ + "invasive-species-monitoring", + "ml2021spring-hw2", + "movie-review-sentiment-analysis-kernels-only", + "paddy-disease-classification", + "plant-seedlings-classification", + "playground-series-s3e18", + "spaceship-titanic", +] + +BENCHMARK_INSTRUCTIONS = """\ +You are participating in MLE-Bench, an offline version of Kaggle competitions +adapted to test machine-learning engineering agents. + +Read the competition-specific instructions, inspect the dataset files, train or +construct a model, and write your final submission as a CSV in the +benchmark-required format. You may use the provided validator to check that the +submission file is structurally valid, but the validator does not reveal the +score. + +Do not hand-label the test set, copy external solutions, or use information that +would violate the benchmark rules. 
The final score is determined by the +competition metric from the benchmark grader.""" + + +def split_ids(split: str, competition_ids: Sequence[str] | None = None) -> list[str]: + if competition_ids is not None: + return list(competition_ids) + if split == "dev": + return list(DEV_COMPETITIONS) + if split in {"low", "lite"}: + return list(LOW_COMPETITIONS) + if split == "all": + return load_registry_ids() or list(LOW_COMPETITIONS) + raise ValueError(f"Unknown MLE-Bench split: {split}") + + +def load_registry_ids() -> list[str] | None: + try: + from mlebench.registry import registry + + return list(registry.list_competition_ids()) + except Exception: + return None + + +def load_registry_competition(competition_id: str) -> Mapping[str, Any] | None: + try: + from mlebench.registry import registry + + competition = registry.get_competition(competition_id) + return { + "id": competition.id, + "name": competition.name, + "description": competition.description, + "competition_type": competition.competition_type, + "sample_submission": str(competition.sample_submission), + "answers": str(competition.answers), + } + except Exception: + return None + + +def build_prompt( + competition_id: str, + description: str, + workdir: str, + submission_path: str, + validate_script: str, +) -> str: + data_dir = f"{workdir.rstrip('/')}/data" + description_path = f"{data_dir}/description.md" + return ( + f"Competition ID: {competition_id}\n\n" + f"{BENCHMARK_INSTRUCTIONS}\n\n" + f"Competition instructions path: {description_path}\n" + f"Dataset directory: {data_dir}\n" + f"Required submission path: {submission_path}\n" + f"Validation command: {validate_script} {submission_path}\n\n" + "Competition description:\n" + f"{description.strip() or '(description unavailable in this runtime)'}" + ) + + +def make_record( + competition_id: str, + split: str, + image: str, + workdir: str, + submission_path: str, + validate_script: str, +) -> dict[str, Any]: + registry_data = load_registry_competition(competition_id) or {} + description = str(registry_data.get("description") or "") + info = { + "competition_id": competition_id, + "split": split, + "competition_type": registry_data.get("competition_type"), + "sample_submission": registry_data.get("sample_submission"), + "answers": registry_data.get("answers"), + "workdir": workdir, + "submission_path": submission_path, + "validate_script": validate_script, + } + return { + "task_id": competition_id, + "prompt": [ + { + "role": "user", + "content": build_prompt( + competition_id, + description, + workdir, + submission_path, + validate_script, + ), + } + ], + "question": build_prompt( + competition_id, + description, + workdir, + submission_path, + validate_script, + ), + "answer": submission_path, + "info": info, + "sandbox": { + "image": image, + "cpu_cores": 36, + "memory_gb": 440, + "disk_size_gb": 256, + "gpu_count": 1, + "workdir": workdir, + "scope": "rollout", + "timeout_minutes": 1440, + }, + "program": {"env": {"AGENT_WORKDIR": workdir}}, + } + + +def grading_submission_row(task: Mapping[str, Any]) -> dict[str, str]: + info = task["info"] + return { + "competition_id": str(info["competition_id"]), + "submission_path": str(info["submission_path"]), + } + + +def grading_submission_jsonl(task: Mapping[str, Any]) -> str: + import json + + return json.dumps(grading_submission_row(task), sort_keys=True) + "\n" + + +class MLEBenchTaskset(vf.Taskset): + def __init__( + self, + split: str = DEFAULT_SPLIT, + competition_ids: Sequence[str] | None = None, + image: str = 
DEFAULT_IMAGE, + workdir: str = DEFAULT_WORKDIR, + submission_path: str = DEFAULT_SUBMISSION_PATH, + validate_script: str = DEFAULT_VALIDATE_SCRIPT, + limit: int | None = None, + config: vf.TasksetConfig | None = None, + ): + self.split = split + self.competition_ids = split_ids(split, competition_ids) + if limit is not None and limit >= 0: + self.competition_ids = self.competition_ids[:limit] + self.image = image + self.workdir = workdir + self.submission_path = submission_path + self.validate_script = validate_script + super().__init__( + source=self.load_rows, + taskset_id=TASKSET_ID, + system_prompt=BENCHMARK_INSTRUCTIONS, + metrics=[submission_exists, submission_nonempty, validator_available], + rewards=[valid_submission], + config=config, + ) + + def load_rows(self) -> list[dict[str, Any]]: + return [ + make_record( + competition_id, + self.split, + self.image, + self.workdir, + self.submission_path, + self.validate_script, + ) + for competition_id in self.competition_ids + ] + + @vf.setup(priority=250) + async def capture_sandbox(self, task, state, sandbox=None) -> None: + if sandbox is not None: + state["_mle_bench_sandbox"] = sandbox + + @vf.cleanup(priority=100) + async def cleanup_sandbox(self, task, state) -> None: + state.pop("_mle_bench_sandbox", None) + + +async def submission_exists(task: vf.Task, state: vf.State) -> float: + sandbox = state.get("_mle_bench_sandbox") + if sandbox is None: + return 0.0 + submission_path = str(task["info"]["submission_path"]) + result = await sandbox.execute( + f"test -f {shlex.quote(submission_path)}", + timeout=30, + ) + return 1.0 if result.exit_code == 0 else 0.0 + + +async def submission_nonempty(task: vf.Task, state: vf.State) -> float: + sandbox = state.get("_mle_bench_sandbox") + if sandbox is None: + return 0.0 + submission_path = str(task["info"]["submission_path"]) + result = await sandbox.execute( + f"test -s {shlex.quote(submission_path)}", + timeout=30, + ) + return 1.0 if result.exit_code == 0 else 0.0 + + +async def validator_available(task: vf.Task, state: vf.State) -> float: + sandbox = state.get("_mle_bench_sandbox") + if sandbox is None: + return 0.0 + validate_script = str(task["info"]["validate_script"]) + result = await sandbox.execute( + f"test -x {shlex.quote(validate_script)}", + timeout=30, + ) + return 1.0 if result.exit_code == 0 else 0.0 + + +async def valid_submission(task: vf.Task, state: vf.State) -> float: + sandbox = state.get("_mle_bench_sandbox") + if sandbox is None: + return 0.0 + info = task["info"] + workdir = str(info["workdir"]) + submission_path = str(info["submission_path"]) + validate_script = str(info["validate_script"]) + command = f"{shlex.quote(validate_script)} {shlex.quote(submission_path)}" + result = await sandbox.execute(command, timeout=300, working_dir=workdir) + state["validation_stdout"] = result.stdout or "" + state["validation_stderr"] = result.stderr or "" + state["validation_exit_code"] = result.exit_code + return 1.0 if result.exit_code == 0 and validator_accepts(result.stdout) else 0.0 + + +def validator_accepts(stdout: str | None) -> bool: + for line in (stdout or "").splitlines(): + if line.strip().lower() == "submission is valid.": + return True + return False + + +def load_taskset( + split: str = DEFAULT_SPLIT, + competition_ids: Sequence[str] | None = None, + image: str = DEFAULT_IMAGE, + workdir: str = DEFAULT_WORKDIR, + submission_path: str = DEFAULT_SUBMISSION_PATH, + validate_script: str = DEFAULT_VALIDATE_SCRIPT, + limit: int | None = None, + config: vf.TasksetConfig | 
None = None, +) -> MLEBenchTaskset: + return MLEBenchTaskset( + split=split, + competition_ids=competition_ids, + image=image, + workdir=workdir, + submission_path=submission_path, + validate_script=validate_script, + limit=limit, + config=config, + ) + + +def load_harness( + max_turns: int | None = None, + config: vf.HarnessConfig | None = None, +) -> vf.OpenCode: + return vf.OpenCode( + sandbox=True, + max_turns=max_turns, + config=config, + ) + + +def load_environment( + split: str = DEFAULT_SPLIT, + competition_ids: Sequence[str] | None = None, + image: str = DEFAULT_IMAGE, + workdir: str = DEFAULT_WORKDIR, + submission_path: str = DEFAULT_SUBMISSION_PATH, + validate_script: str = DEFAULT_VALIDATE_SCRIPT, + limit: int | None = None, + max_turns: int | None = None, + config: vf.EnvConfig | None = None, +) -> vf.Env: + config = config or vf.EnvConfig() + return vf.Env( + taskset=load_taskset( + split=split, + competition_ids=competition_ids, + image=image, + workdir=workdir, + submission_path=submission_path, + validate_script=validate_script, + limit=limit, + config=config.taskset, + ), + harness=load_harness(max_turns=max_turns, config=config.harness), + ) diff --git a/environments/mle_bench/pyproject.toml b/environments/mle_bench/pyproject.toml new file mode 100644 index 000000000..853b12bfe --- /dev/null +++ b/environments/mle_bench/pyproject.toml @@ -0,0 +1,27 @@ +[project] +name = "mle-bench" +version = "0.1.0" +description = "MLE-Bench competition-submission environment" +license = "Apache-2.0" +tags = ["mle", "kaggle", "coding", "benchmark"] +requires-python = ">=3.11" +dependencies = [ + "verifiers", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build] +include = ["mle_bench.py", "README.md", "pyproject.toml"] + +[project.entry-points."verifiers.environments"] +mle-bench = "mle_bench:load_environment" + +[tool.verifiers.eval] +num_examples = 7 +rollouts_per_example = 1 + +[tool.uv.sources] +verifiers = { path = "../..", editable = true } diff --git a/tests/test_mle_bench_environment.py b/tests/test_mle_bench_environment.py new file mode 100644 index 000000000..f424c1717 --- /dev/null +++ b/tests/test_mle_bench_environment.py @@ -0,0 +1,227 @@ +from __future__ import annotations + +import importlib +import sys +from pathlib import Path + +import verifiers.v1 as vf + + +def load_module(monkeypatch): + env_dir = Path(__file__).parents[1] / "environments" / "mle_bench" + monkeypatch.syspath_prepend(str(env_dir)) + sys.modules.pop("mle_bench", None) + return importlib.import_module("mle_bench") + + +class Result: + def __init__(self, exit_code: int, stdout: str = "", stderr: str = ""): + self.exit_code = exit_code + self.stdout = stdout + self.stderr = stderr + + +class RecordingSandbox: + def __init__(self, results: list[Result]): + self.results = list(results) + self.calls = [] + + async def execute(self, command: str, **kwargs): + self.calls.append({"command": command, **kwargs}) + return self.results.pop(0) + + +def test_mle_bench_loads_low_split_without_registry(monkeypatch): + module = load_module(monkeypatch) + monkeypatch.setattr(module, "load_registry_competition", lambda _id: None) + + env = module.load_environment(limit=2, image="local-mlebench") + rows = list(env.taskset.source()) + + assert isinstance(env, vf.Env) + assert isinstance(env.harness, vf.OpenCode) + assert env.taskset.taskset_id == "mle-bench" + assert [row["task_id"] for row in rows] == module.LOW_COMPETITIONS[:2] + assert rows[0]["sandbox"]["image"] == 
"local-mlebench" + assert rows[0]["sandbox"]["workdir"] == "/home" + assert rows[0]["program"]["env"]["AGENT_WORKDIR"] == "/home" + assert "/home/submission/submission.csv" in rows[0]["prompt"][0]["content"] + + +def test_mle_bench_prompt_uses_configured_paths(monkeypatch): + module = load_module(monkeypatch) + monkeypatch.setattr(module, "load_registry_competition", lambda _id: None) + + row = module.make_record( + "custom-competition", + "dev", + "mlebench-env", + "/workspace", + "/workspace/out/final.csv", + "/workspace/tools/check.sh", + ) + prompt = row["prompt"][0]["content"] + + assert "/workspace/data/description.md" in prompt + assert "/workspace/data" in prompt + assert "/workspace/out/final.csv" in prompt + assert "/workspace/tools/check.sh /workspace/out/final.csv" in prompt + assert "/home/submission/submission.csv" not in prompt + assert "/home/validate_submission.sh" not in prompt + + +def test_mle_bench_uses_registry_metadata_when_available(monkeypatch): + module = load_module(monkeypatch) + + def fake_registry(competition_id): + return { + "id": competition_id, + "name": "Example Competition", + "description": "Predict the label.", + "competition_type": "simple", + "sample_submission": "/data/sample_submission.csv", + "answers": "/data/test.csv", + } + + monkeypatch.setattr(module, "load_registry_competition", fake_registry) + row = module.make_record( + "example-competition", + "dev", + "mlebench-env", + "/home", + "/home/submission/submission.csv", + "/home/validate_submission.sh", + ) + + assert row["info"]["competition_type"] == "simple" + assert row["info"]["sample_submission"] == "/data/sample_submission.csv" + assert "Predict the label." in row["prompt"][0]["content"] + + +def test_mle_bench_grading_submission_row(monkeypatch): + module = load_module(monkeypatch) + monkeypatch.setattr(module, "load_registry_competition", lambda _id: None) + row = module.make_record( + "spaceship-titanic", + "dev", + "mlebench-env", + "/home", + "/home/submission/submission.csv", + "/home/validate_submission.sh", + ) + + assert module.grading_submission_row(row) == { + "competition_id": "spaceship-titanic", + "submission_path": "/home/submission/submission.csv", + } + assert module.grading_submission_jsonl(row) == ( + '{"competition_id": "spaceship-titanic", ' + '"submission_path": "/home/submission/submission.csv"}\n' + ) + + +def test_mle_bench_split_ids_reject_unknown_split(monkeypatch): + module = load_module(monkeypatch) + + try: + module.split_ids("missing") + except ValueError as exc: + assert "Unknown MLE-Bench split" in str(exc) + else: + raise AssertionError("expected ValueError") + + +async def test_mle_bench_valid_submission_records_validator_output(monkeypatch): + module = load_module(monkeypatch) + taskset = module.load_taskset(competition_ids=["spaceship-titanic"]) + task = vf.Task(list(taskset.source())[0]) + state = vf.State.for_task(task) + sandbox = RecordingSandbox([Result(0, "Submission is valid.")]) + state["_mle_bench_sandbox"] = sandbox + + reward = await module.valid_submission(task, state) + + assert reward == 1.0 + assert state["validation_exit_code"] == 0 + assert state["validation_stdout"] == "Submission is valid." 
+ assert sandbox.calls[0]["command"] == ( + "/home/validate_submission.sh /home/submission/submission.csv" + ) + assert sandbox.calls[0]["working_dir"] == "/home" + + +async def test_mle_bench_valid_submission_uses_configured_workdir(monkeypatch): + module = load_module(monkeypatch) + taskset = module.load_taskset( + competition_ids=["spaceship-titanic"], + workdir="/workspace", + submission_path="/workspace/submission/submission.csv", + validate_script="/workspace/validate_submission.sh", + ) + task = vf.Task(list(taskset.source())[0]) + state = vf.State.for_task(task) + sandbox = RecordingSandbox([Result(0, "Submission is valid.")]) + state["_mle_bench_sandbox"] = sandbox + + assert await module.valid_submission(task, state) == 1.0 + assert sandbox.calls[0]["command"] == ( + "/workspace/validate_submission.sh /workspace/submission/submission.csv" + ) + assert sandbox.calls[0]["working_dir"] == "/workspace" + + +async def test_mle_bench_invalid_submission_gets_zero_reward(monkeypatch): + module = load_module(monkeypatch) + taskset = module.load_taskset(competition_ids=["spaceship-titanic"]) + task = vf.Task(list(taskset.source())[0]) + state = vf.State.for_task(task) + state["_mle_bench_sandbox"] = RecordingSandbox([Result(1, "bad csv")]) + + assert await module.valid_submission(task, state) == 0.0 + assert state["validation_exit_code"] == 1 + + +async def test_mle_bench_invalid_stdout_does_not_match_valid(monkeypatch): + module = load_module(monkeypatch) + taskset = module.load_taskset(competition_ids=["spaceship-titanic"]) + task = vf.Task(list(taskset.source())[0]) + state = vf.State.for_task(task) + state["_mle_bench_sandbox"] = RecordingSandbox( + [Result(0, "Submission invalid! bad csv")] + ) + + assert await module.valid_submission(task, state) == 0.0 + + +def test_mle_bench_validator_accepts_exact_success_line(monkeypatch): + module = load_module(monkeypatch) + + assert module.validator_accepts("Submission is valid.\n") + assert not module.validator_accepts("Submission invalid! bad csv") + assert not module.validator_accepts("not valid") + + +async def test_mle_bench_submission_exists_metric(monkeypatch): + module = load_module(monkeypatch) + taskset = module.load_taskset(competition_ids=["spaceship-titanic"]) + task = vf.Task(list(taskset.source())[0]) + state = vf.State.for_task(task) + sandbox = RecordingSandbox([Result(0)]) + state["_mle_bench_sandbox"] = sandbox + + assert await module.submission_exists(task, state) == 1.0 + assert sandbox.calls[0]["command"] == "test -f /home/submission/submission.csv" + + +async def test_mle_bench_sandbox_metrics(monkeypatch): + module = load_module(monkeypatch) + taskset = module.load_taskset(competition_ids=["spaceship-titanic"]) + task = vf.Task(list(taskset.source())[0]) + state = vf.State.for_task(task) + sandbox = RecordingSandbox([Result(0), Result(1)]) + state["_mle_bench_sandbox"] = sandbox + + assert await module.submission_nonempty(task, state) == 1.0 + assert await module.validator_available(task, state) == 0.0 + assert sandbox.calls[0]["command"] == "test -s /home/submission/submission.csv" + assert sandbox.calls[1]["command"] == "test -x /home/validate_submission.sh"
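+
+
+async def test_mle_bench_metrics_default_to_zero_without_sandbox(monkeypatch):
+    # Illustrative sketch of the no-sandbox fallback: if setup never captured a
+    # sandbox handle, every metric and the reward return 0.0 instead of raising.
+    module = load_module(monkeypatch)
+    taskset = module.load_taskset(competition_ids=["spaceship-titanic"])
+    task = vf.Task(list(taskset.source())[0])
+    state = vf.State.for_task(task)
+
+    assert await module.submission_exists(task, state) == 0.0
+    assert await module.submission_nonempty(task, state) == 0.0
+    assert await module.validator_available(task, state) == 0.0
+    assert await module.valid_submission(task, state) == 0.0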