diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c918677..fa4b432 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -18,9 +18,9 @@ jobs:
         with:
           python-version: "3.12"
       - run: pip install ruff
-      - run: ruff check src/ scripts/
+      - run: ruff check src/ scripts/ tests/
 
-  test-import:
+  test:
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -30,8 +30,8 @@ jobs:
       - uses: actions/setup-python@v5
         with:
          python-version: ${{ matrix.python-version }}
-      - run: pip install -e .
-      - name: Smoke test
+      - run: pip install -e ".[dev]"
+      - name: Registry sanity
         run: |
           python -c "
           from gdb import BenchmarkRegistry
@@ -40,4 +40,7 @@ jobs:
           ids = reg.list_ids()
           assert len(ids) == 39, f'Expected 39 benchmarks, got {len(ids)}'
           "
-          python scripts/run_benchmarks.py --list
+      - name: pytest
+        run: pytest tests/ -v
+      - name: Legacy shim smoke
+        run: python scripts/run_benchmarks.py --list
diff --git a/pyproject.toml b/pyproject.toml
index 7037324..c914561 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,6 +76,7 @@ hub = [
 ]
 dev = [
     "ruff",
+    "pytest>=7",
 ]
 
 [tool.setuptools.packages.find]
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..63a00f8
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,19 @@
+"""Shared pytest fixtures for the GDB test suite."""
+
+from __future__ import annotations
+
+import pytest
+
+from gdb.registry import BenchmarkRegistry
+
+
+@pytest.fixture(scope="session")
+def registry() -> BenchmarkRegistry:
+    """A fully-discovered registry shared across the test session.
+
+    Discovery walks :mod:`gdb.tasks` once; tests that only need to read the
+    registry (``list``, ``get``, ``list_ids``) can reuse this fixture freely.
+    """
+    reg = BenchmarkRegistry()
+    reg.discover()
+    return reg
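
A note on the fixture above: because it is session-scoped, every test that takes
``registry`` shares one object, so tests must treat it as read-only. For tests
that do need to mutate, a function-scoped variant along these lines would work
(hypothetical, not part of this diff; it reuses only ``BenchmarkRegistry`` and
``discover()`` as shown in ``conftest.py``):

    import pytest

    from gdb.registry import BenchmarkRegistry


    @pytest.fixture()
    def fresh_registry() -> BenchmarkRegistry:
        # Function scope: each test gets a private instance it may mutate,
        # at the cost of re-running discovery for every such test.
        reg = BenchmarkRegistry()
        reg.discover()
        return reg
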
diff --git a/tests/test_cli_helpers.py b/tests/test_cli_helpers.py
new file mode 100644
index 0000000..21f9a63
--- /dev/null
+++ b/tests/test_cli_helpers.py
@@ -0,0 +1,256 @@
+"""Tests for pure-function helpers in :mod:`gdb.cli`.
+
+The CLI has a lot of orchestration code; these tests cover the small, pure
+pieces that sit under argparse and above the model/runner layers. They don't
+exercise any network, model loading, or benchmark execution.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+
+import pytest
+
+from gdb.cli import (
+    _parse_json_dict_arg,
+    _parse_model_spec,
+    _render_markdown_report,
+    _resolve_benchmark_ids,
+)
+
+# ---------------------------------------------------------------------------
+# _parse_model_spec
+# ---------------------------------------------------------------------------
+
+
+def test_parse_model_spec_basic():
+    name, provider, model_id = _parse_model_spec("openai:gpt-4o")
+    assert name == "openai:gpt-4o"
+    assert provider == "openai"
+    assert model_id == "gpt-4o"
+
+
+def test_parse_model_spec_with_alias():
+    name, provider, model_id = _parse_model_spec("myalias=openai:gpt-4o")
+    assert name == "myalias"
+    assert provider == "openai"
+    assert model_id == "gpt-4o"
+
+
+def test_parse_model_spec_strips_whitespace():
+    name, provider, model_id = _parse_model_spec(
+        " myalias = anthropic : claude-haiku-4-5 "
+    )
+    assert name == "myalias"
+    assert provider == "anthropic"
+    assert model_id == "claude-haiku-4-5"
+
+
+def test_parse_model_spec_custom_entrypoint_with_colon():
+    # model_id can itself contain colons (e.g. ``module.path:attr``) because
+    # we split only on the first ``:``. This is the shape used by the
+    # ``custom`` provider.
+    name, provider, model_id = _parse_model_spec("custom:my_pkg.wrapper:build")
+    assert provider == "custom"
+    assert model_id == "my_pkg.wrapper:build"
+    assert name == "custom:my_pkg.wrapper:build"
+
+
+def test_parse_model_spec_missing_colon_raises():
+    with pytest.raises(ValueError, match="Invalid --multi-models spec"):
+        _parse_model_spec("openai-gpt-4o")
+
+
+def test_parse_model_spec_unknown_provider_raises():
+    with pytest.raises(ValueError) as excinfo:
+        _parse_model_spec("bogus:some-model")
+    msg = str(excinfo.value)
+    assert "Unknown provider" in msg
+    # The error should enumerate the valid providers.
+    assert "openai" in msg
+    assert "anthropic" in msg
+
+
+# ---------------------------------------------------------------------------
+# _parse_json_dict_arg
+# ---------------------------------------------------------------------------
+
+
+def test_parse_json_dict_arg_none_returns_empty_dict():
+    assert _parse_json_dict_arg(None, field_name="x") == {}
+
+
+def test_parse_json_dict_arg_empty_string_returns_empty_dict():
+    assert _parse_json_dict_arg("", field_name="x") == {}
+    assert _parse_json_dict_arg(" ", field_name="x") == {}
+
+
+def test_parse_json_dict_arg_dict_is_passthrough():
+    d = {"foo": 1, "bar": [2, 3]}
+    assert _parse_json_dict_arg(d, field_name="x") is d
+
+
+def test_parse_json_dict_arg_json_string():
+    assert _parse_json_dict_arg('{"k": "v"}', field_name="x") == {"k": "v"}
+
+
+def test_parse_json_dict_arg_reads_file(tmp_path):
+    path = tmp_path / "init.json"
+    path.write_text('{"checkpoint": "/models/foo", "dtype": "bfloat16"}')
+    result = _parse_json_dict_arg(str(path), field_name="custom init kwargs")
+    assert result == {"checkpoint": "/models/foo", "dtype": "bfloat16"}
+
+
+def test_parse_json_dict_arg_rejects_non_dict_json():
+    with pytest.raises(ValueError, match="JSON object/dict"):
+        _parse_json_dict_arg("[1, 2, 3]", field_name="x")
+
+
+def test_parse_json_dict_arg_rejects_wrong_type():
+    with pytest.raises(ValueError, match="JSON object/dict"):
+        _parse_json_dict_arg(42, field_name="x")
+
+
+# ---------------------------------------------------------------------------
+# _resolve_benchmark_ids
+# ---------------------------------------------------------------------------
+
+
+def _ns(**kwargs) -> argparse.Namespace:
+    return argparse.Namespace(**kwargs)
+
+
+def test_resolve_benchmark_ids_from_suite(registry):
+    ids = _resolve_benchmark_ids(
+        _ns(suite="v0-smoke", benchmarks=None), registry
+    )
+    assert "category-1" in ids
+    assert "svg-1" in ids
+
+
+def test_resolve_benchmark_ids_from_explicit_list(registry):
+    ids = _resolve_benchmark_ids(
+        _ns(suite=None, benchmarks=["layout-4", "svg-1"]), registry
+    )
+    assert ids == ["layout-4", "svg-1"]
+
+
+def test_resolve_benchmark_ids_conflicting_args(registry):
+    with pytest.raises(SystemExit, match="either --suite or --benchmarks"):
+        _resolve_benchmark_ids(
+            _ns(suite="v0-all", benchmarks=["layout-4"]), registry
+        )
+
+
+def test_resolve_benchmark_ids_no_selection(registry):
+    with pytest.raises(SystemExit, match="One of --suite or --benchmarks"):
+        _resolve_benchmark_ids(_ns(suite=None, benchmarks=None), registry)
+
+
+def test_resolve_benchmark_ids_unknown_suite(registry):
+    with pytest.raises(SystemExit, match="Unknown suite"):
+        _resolve_benchmark_ids(_ns(suite="v99-bogus", benchmarks=None), registry)
+
+
+# ---------------------------------------------------------------------------
+# _render_markdown_report
+# ---------------------------------------------------------------------------
+
+
+def test_render_markdown_report_empty_results():
+    md = _render_markdown_report({"metadata": {"run_id": "abc"}, "results": {}})
+    assert md.startswith("# GDB run report")
+    assert "## Metadata" in md
+    assert "- **run_id**: abc" in md
+    assert "## Results" in md
+    assert "_(empty)_" in md
+
+
+def test_render_markdown_report_includes_table_header_and_rows():
+    report = {
+        "metadata": {},
+        "results": {
+            "category-1": {
+                "openai:gpt-4o": {
+                    "count": 10,
+                    "failure_rate": 0.1,
+                    "scores": {"accuracy": 0.8, "macro_f1": 0.75},
+                },
+            },
+        },
+    }
+    md = _render_markdown_report(report)
+    # Header row
+    assert "| Benchmark | Model | n | fail_rate | accuracy | macro_f1 |" in md
+    # Separator row
+    assert "|---|---|---|---|---|---|" in md
+    # Data row
+    assert "| category-1 | openai:gpt-4o | 10 | 10.0% | 0.8000 | 0.7500 |" in md
+
+
+def test_render_markdown_report_handles_missing_metric_as_em_dash():
+    # One benchmark reports accuracy; another reports only f1. Each column
+    # should render "—" when the metric is absent for a given row.
+    report = {
+        "metadata": {},
+        "results": {
+            "bench-a": {
+                "stub": {
+                    "count": 2,
+                    "failure_rate": 0.0,
+                    "scores": {"accuracy": 1.0},
+                },
+            },
+            "bench-b": {
+                "stub": {
+                    "count": 2,
+                    "failure_rate": 0.0,
+                    "scores": {"f1": 0.5},
+                },
+            },
+        },
+    }
+    md = _render_markdown_report(report)
+    assert "| bench-a | stub | 2 | 0.0% | 1.0000 | — |" in md
+    assert "| bench-b | stub | 2 | 0.0% | — | 0.5000 |" in md
+
+
+def test_render_markdown_report_missing_metadata_omits_section():
+    md = _render_markdown_report({"results": {}})
+    assert "## Metadata" not in md
+
+
+def test_render_markdown_report_output_is_parseable_markdown_table():
+    # Sanity: the header + separator + at least one row are present in the
+    # right order with consistent pipe counts.
+    report = {
+        "metadata": {"model": "stub"},
+        "results": {
+            "t1": {"stub": {"count": 1, "failure_rate": 0.0, "scores": {"x": 1.0}}}
+        },
+    }
+    lines = _render_markdown_report(report).splitlines()
+    table_lines = [line for line in lines if line.startswith("|")]
+    assert len(table_lines) >= 3  # header, separator, at least one row
+    pipe_counts = [line.count("|") for line in table_lines]
+    assert len(set(pipe_counts)) == 1  # all rows have the same column count
+
+
+# ---------------------------------------------------------------------------
+# Sanity: we can still JSON-roundtrip the report renderer's input contract
+# (guards against accidental changes to the expected dict shape).
+# ---------------------------------------------------------------------------
+
+
+def test_render_markdown_report_accepts_json_roundtripped_input():
+    report = {
+        "metadata": {"suite": "v0-smoke"},
+        "results": {
+            "category-1": {
+                "stub": {"count": 2, "failure_rate": 0.0, "scores": {"acc": 0.5}}
+            }
+        },
+    }
+    roundtripped = json.loads(json.dumps(report))
+    assert _render_markdown_report(roundtripped) == _render_markdown_report(report)
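
For readers reviewing the contract rather than the code: the ``_parse_model_spec``
tests above pin down a ``[alias=]provider:model_id`` grammar. Below is a minimal
sketch that satisfies those tests, assuming a provider whitelist like the one the
error message enumerates. ``KNOWN_PROVIDERS`` and ``parse_model_spec`` are
illustrative names, not the actual implementation in ``gdb.cli``:

    KNOWN_PROVIDERS = ("openai", "anthropic", "custom")  # assumed provider set


    def parse_model_spec(spec: str) -> tuple[str, str, str]:
        """Split ``[alias=]provider:model_id`` into (name, provider, model_id)."""
        alias, sep, rest = spec.partition("=")
        if not sep:  # no alias given
            alias, rest = "", spec
        alias, rest = alias.strip(), rest.strip()
        if ":" not in rest:
            raise ValueError(f"Invalid --multi-models spec: {spec!r}")
        # Split on the FIRST colon only, so model_id may itself contain
        # colons (the ``custom:module.path:attr`` shape).
        provider, _, model_id = rest.partition(":")
        provider, model_id = provider.strip(), model_id.strip()
        if provider not in KNOWN_PROVIDERS:
            raise ValueError(
                f"Unknown provider {provider!r}; expected one of {KNOWN_PROVIDERS}"
            )
        return alias or f"{provider}:{model_id}", provider, model_id

When no alias is supplied, the display name falls back to the normalized
``provider:model_id`` string, which is exactly what the basic and custom-provider
tests assert.
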
diff --git a/tests/test_cli_integration.py b/tests/test_cli_integration.py
new file mode 100644
index 0000000..5bafaf6
--- /dev/null
+++ b/tests/test_cli_integration.py
@@ -0,0 +1,63 @@
+"""End-to-end smoke tests for the ``gdb`` CLI.
+
+These are intentionally cheap: they invoke ``python -m gdb`` as a subprocess
+and check that a handful of read-only commands exit cleanly and print the
+expected headings. Pure helper logic is covered in
+:mod:`tests.test_cli_helpers`; anything that requires model inference lives
+in the ``gdb verify`` runtime smoke path (executed out of band).
+"""
+
+from __future__ import annotations
+
+import subprocess
+import sys
+
+
+def _run(*args: str) -> subprocess.CompletedProcess:
+    """Invoke ``python -m gdb`` with the given args."""
+    return subprocess.run(
+        [sys.executable, "-m", "gdb", *args],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+
+
+def test_version_flag_prints_lica_gdb_version():
+    result = _run("--version")
+    assert result.returncode == 0, result.stderr
+    assert result.stdout.strip().startswith("lica-gdb ")
+
+
+def test_help_lists_core_subcommands():
+    result = _run("--help")
+    assert result.returncode == 0, result.stderr
+    for subcmd in ("list", "info", "suites", "eval", "verify", "submit", "collect"):
+        assert subcmd in result.stdout, f"`{subcmd}` missing from --help output"
+
+
+def test_list_prints_39_benchmarks():
+    result = _run("list")
+    assert result.returncode == 0, result.stderr
+    # The exact integer is worth asserting: it's the number the paper and
+    # README both reference, and the CI smoke test also pins it.
+    assert "39 benchmark(s)" in result.stdout
+
+
+def test_suites_lists_all_v0_suites():
+    result = _run("suites")
+    assert result.returncode == 0, result.stderr
+    for suite in ("v0-all", "v0-smoke", "v0-understanding", "v0-generation"):
+        assert suite in result.stdout, f"suite `{suite}` missing from `gdb suites`"
+
+
+def test_info_reports_known_benchmark():
+    result = _run("info", "category-1")
+    assert result.returncode == 0, result.stderr
+    assert "category-1" in result.stdout
+    assert "understanding" in result.stdout.lower()
+
+
+def test_info_unknown_benchmark_exits_nonzero():
+    result = _run("info", "does-not-exist")
+    assert result.returncode != 0
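
Stitching together the strings asserted in ``tests/test_cli_helpers.py`` gives a
rough picture of what ``_render_markdown_report`` emits for a run with metadata
and one populated result row. This is a composite reconstructed from the
individual assertions, not captured from real output:

    # GDB run report

    ## Metadata

    - **run_id**: abc

    ## Results

    | Benchmark | Model | n | fail_rate | accuracy | macro_f1 |
    |---|---|---|---|---|---|
    | category-1 | openai:gpt-4o | 10 | 10.0% | 0.8000 | 0.7500 |
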
diff --git a/tests/test_suites.py b/tests/test_suites.py
new file mode 100644
index 0000000..0d811d8
--- /dev/null
+++ b/tests/test_suites.py
@@ -0,0 +1,105 @@
+"""Tests for :mod:`gdb.suites`.
+
+These tests use a real discovered registry so suite routing is validated
+against the actual task set. That means adding a task may also require a
+suite-expectation update here if it changes the understanding/generation
+balance; we consider that a feature rather than a bug.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from gdb.suites import describe_suite, list_suites, resolve_suite
+
+KNOWN_SUITES = {"v0-all", "v0-smoke", "v0-understanding", "v0-generation"}
+
+
+def test_list_suites_contains_all_known():
+    names = set(list_suites())
+    assert KNOWN_SUITES.issubset(names)
+
+
+def test_list_suites_is_sorted_and_unique():
+    names = list_suites()
+    assert names == sorted(set(names))
+
+
+def test_v0_smoke_returns_fixed_task_ids(registry):
+    # Static suite: must match the hardcoded list in gdb.suites. If this ever
+    # changes, the bundled verify fixture in src/gdb/_verify_data/ probably
+    # needs to change too.
+    expected = {
+        "category-1",
+        "layout-4",
+        "layout-5",
+        "typography-1",
+        "svg-1",
+        "template-1",
+    }
+    assert set(resolve_suite("v0-smoke", registry)) == expected
+
+
+def test_v0_all_matches_registry_contents(registry):
+    ids = resolve_suite("v0-all", registry)
+    assert set(ids) == {b.meta.id for b in registry.list()}
+    assert ids == sorted(ids)
+
+
+def test_v0_understanding_and_generation_partition_v0_all(registry):
+    u = set(resolve_suite("v0-understanding", registry))
+    g = set(resolve_suite("v0-generation", registry))
+    all_ = set(resolve_suite("v0-all", registry))
+    assert u | g == all_
+    assert u & g == set()
+
+
+def test_v0_understanding_is_nonempty_and_sorted(registry):
+    ids = resolve_suite("v0-understanding", registry)
+    assert ids
+    assert ids == sorted(ids)
+
+
+def test_v0_generation_is_nonempty_and_sorted(registry):
+    ids = resolve_suite("v0-generation", registry)
+    assert ids
+    assert ids == sorted(ids)
+
+
+def test_resolve_suite_returns_list_copy(registry):
+    # Mutating the returned list must not corrupt the cached static suite.
+    a = resolve_suite("v0-smoke", registry)
+    a.append("not-a-real-benchmark")
+    b = resolve_suite("v0-smoke", registry)
+    assert "not-a-real-benchmark" not in b
+
+
+def test_resolve_suite_unknown_raises_keyerror(registry):
+    with pytest.raises(KeyError) as excinfo:
+        resolve_suite("v99-all", registry)
+    msg = str(excinfo.value)
+    assert "v99-all" in msg
+    # The message should list the known suites to help users.
+    assert "v0-all" in msg
+    assert "v0-smoke" in msg
+
+
+def test_describe_suite_static_shape(registry):
+    info = describe_suite("v0-smoke", registry)
+    assert info["name"] == "v0-smoke"
+    assert info["kind"] == "static"
+    assert info["n_tasks"] == len(info["task_ids"])
+    assert info["n_tasks"] > 0
+
+
+def test_describe_suite_dynamic_shape(registry):
+    info = describe_suite("v0-all", registry)
+    assert info["name"] == "v0-all"
+    assert info["kind"] == "dynamic"
+    assert info["n_tasks"] == len(info["task_ids"])
+    assert info["n_tasks"] == len(list(registry.list()))
+
+
+def test_describe_suite_unknown_propagates_keyerror(registry):
+    with pytest.raises(KeyError):
+        describe_suite("nope", registry)
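
Taken together, the suite tests imply a small static/dynamic split inside
``gdb.suites``. A minimal sketch consistent with those tests follows; it is
hypothetical, ``STATIC`` and ``DYNAMIC`` are illustrative names, and only
``resolve_suite`` mirrors a real identifier from the diff:

    from typing import Callable, Dict, List

    # Static suites pin an explicit id list; dynamic suites are computed
    # from the registry at resolve time.
    STATIC: Dict[str, List[str]] = {
        "v0-smoke": [
            "category-1", "layout-4", "layout-5",
            "svg-1", "template-1", "typography-1",
        ],
    }
    DYNAMIC: Dict[str, Callable] = {
        "v0-all": lambda registry: sorted(b.meta.id for b in registry.list()),
    }


    def resolve_suite(name: str, registry) -> List[str]:
        if name in STATIC:
            return list(STATIC[name])  # fresh copy, so callers may mutate it
        if name in DYNAMIC:
            return DYNAMIC[name](registry)
        known = sorted([*STATIC, *DYNAMIC])
        raise KeyError(f"Unknown suite {name!r}; known suites: {known}")

Returning a copy for static suites is what ``test_resolve_suite_returns_list_copy``
guards, and embedding the known-suite list in the ``KeyError`` is what makes the
``v99-all`` error message actionable.
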