13 changes: 8 additions & 5 deletions .github/workflows/ci.yml
@@ -18,9 +18,9 @@ jobs:
         with:
           python-version: "3.12"
       - run: pip install ruff
-      - run: ruff check src/ scripts/
+      - run: ruff check src/ scripts/ tests/
 
-  test-import:
+  test:
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -30,8 +30,8 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
-      - run: pip install -e .
-      - name: Smoke test
+      - run: pip install -e ".[dev]"
+      - name: Registry sanity
         run: |
           python -c "
           from gdb import BenchmarkRegistry
@@ -40,4 +40,7 @@
           ids = reg.list_ids()
           assert len(ids) == 39, f'Expected 39 benchmarks, got {len(ids)}'
           "
-          python scripts/run_benchmarks.py --list
+      - name: pytest
+        run: pytest tests/ -v
+      - name: Legacy shim smoke
+        run: python scripts/run_benchmarks.py --list
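For reviewers who want to run the new Registry sanity step outside CI, a minimal local equivalent is sketched below. The import and the 39-benchmark assertion are copied from the workflow; the registry construction lines sit in the collapsed hunk above, so they are reconstructed here from tests/conftest.py and should be treated as an assumption.

# Local stand-in for the "Registry sanity" CI step; run after
# `pip install -e ".[dev]"`.
from gdb import BenchmarkRegistry

reg = BenchmarkRegistry()  # assumption: mirrors the fixture in tests/conftest.py
reg.discover()             # walks gdb.tasks once, per the conftest docstring
ids = reg.list_ids()
assert len(ids) == 39, f"Expected 39 benchmarks, got {len(ids)}"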
1 change: 1 addition & 0 deletions pyproject.toml
@@ -76,6 +76,7 @@ hub = [
 ]
 dev = [
     "ruff",
+    "pytest>=7",
 ]
 
 [tool.setuptools.packages.find]
Empty file added tests/__init__.py
19 changes: 19 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,19 @@
"""Shared pytest fixtures for the GDB test suite."""

from __future__ import annotations

import pytest

from gdb.registry import BenchmarkRegistry


@pytest.fixture(scope="session")
def registry() -> BenchmarkRegistry:
"""A fully-discovered registry shared across the test session.

Discovery walks :mod:`gdb.tasks` once; tests that only need to read the
registry (``list``, ``get``, ``list_ids``) can reuse this fixture freely.
"""
reg = BenchmarkRegistry()
reg.discover()
return reg
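Downstream tests consume this fixture by naming it as a parameter; pytest matches the name and injects the one session-wide instance. A hypothetical consumer (not part of this diff) would look like:

# Hypothetical example: pytest resolves the `registry` argument to the
# session-scoped fixture above, so discovery runs only once per session.
def test_registry_pins_benchmark_count(registry):
    assert len(registry.list_ids()) == 39  # the same count the CI step asserts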
256 changes: 256 additions & 0 deletions tests/test_cli_helpers.py
@@ -0,0 +1,256 @@
"""Tests for pure-function helpers in :mod:`gdb.cli`.

The CLI has a lot of orchestration code; these tests cover the small, pure
pieces that sit under argparse and above the model/runner layers. They don't
exercise any network, model loading, or benchmark execution.
"""

from __future__ import annotations

import argparse
import json

import pytest

from gdb.cli import (
    _parse_json_dict_arg,
    _parse_model_spec,
    _render_markdown_report,
    _resolve_benchmark_ids,
)

# ---------------------------------------------------------------------------
# _parse_model_spec
# ---------------------------------------------------------------------------


def test_parse_model_spec_basic():
    name, provider, model_id = _parse_model_spec("openai:gpt-4o")
    assert name == "openai:gpt-4o"
    assert provider == "openai"
    assert model_id == "gpt-4o"


def test_parse_model_spec_with_alias():
    name, provider, model_id = _parse_model_spec("myalias=openai:gpt-4o")
    assert name == "myalias"
    assert provider == "openai"
    assert model_id == "gpt-4o"


def test_parse_model_spec_strips_whitespace():
    name, provider, model_id = _parse_model_spec(
        " myalias = anthropic : claude-haiku-4-5 "
    )
    assert name == "myalias"
    assert provider == "anthropic"
    assert model_id == "claude-haiku-4-5"


def test_parse_model_spec_custom_entrypoint_with_colon():
    # model_id can itself contain colons (e.g. ``module.path:attr``) because
    # we split only on the first ``:``. This is the shape used by the
    # ``custom`` provider.
    name, provider, model_id = _parse_model_spec("custom:my_pkg.wrapper:build")
    assert provider == "custom"
    assert model_id == "my_pkg.wrapper:build"
    assert name == "custom:my_pkg.wrapper:build"


def test_parse_model_spec_missing_colon_raises():
    with pytest.raises(ValueError, match="Invalid --multi-models spec"):
        _parse_model_spec("openai-gpt-4o")


def test_parse_model_spec_unknown_provider_raises():
    with pytest.raises(ValueError) as excinfo:
        _parse_model_spec("bogus:some-model")
    msg = str(excinfo.value)
    assert "Unknown provider" in msg
    # The error should enumerate the valid providers.
    assert "openai" in msg
    assert "anthropic" in msg

# ---------------------------------------------------------------------------
# _parse_json_dict_arg
# ---------------------------------------------------------------------------


def test_parse_json_dict_arg_none_returns_empty_dict():
    assert _parse_json_dict_arg(None, field_name="x") == {}


def test_parse_json_dict_arg_empty_string_returns_empty_dict():
    assert _parse_json_dict_arg("", field_name="x") == {}
    assert _parse_json_dict_arg(" ", field_name="x") == {}


def test_parse_json_dict_arg_dict_is_passthrough():
    d = {"foo": 1, "bar": [2, 3]}
    assert _parse_json_dict_arg(d, field_name="x") is d


def test_parse_json_dict_arg_json_string():
    assert _parse_json_dict_arg('{"k": "v"}', field_name="x") == {"k": "v"}


def test_parse_json_dict_arg_reads_file(tmp_path):
    path = tmp_path / "init.json"
    path.write_text('{"checkpoint": "/models/foo", "dtype": "bfloat16"}')
    result = _parse_json_dict_arg(str(path), field_name="custom init kwargs")
    assert result == {"checkpoint": "/models/foo", "dtype": "bfloat16"}


def test_parse_json_dict_arg_rejects_non_dict_json():
    with pytest.raises(ValueError, match="JSON object/dict"):
        _parse_json_dict_arg("[1, 2, 3]", field_name="x")


def test_parse_json_dict_arg_rejects_wrong_type():
    with pytest.raises(ValueError, match="JSON object/dict"):
        _parse_json_dict_arg(42, field_name="x")


# ---------------------------------------------------------------------------
# _resolve_benchmark_ids
# ---------------------------------------------------------------------------


def _ns(**kwargs) -> argparse.Namespace:
    return argparse.Namespace(**kwargs)


def test_resolve_benchmark_ids_from_suite(registry):
    ids = _resolve_benchmark_ids(
        _ns(suite="v0-smoke", benchmarks=None), registry
    )
    assert "category-1" in ids
    assert "svg-1" in ids


def test_resolve_benchmark_ids_from_explicit_list(registry):
    ids = _resolve_benchmark_ids(
        _ns(suite=None, benchmarks=["layout-4", "svg-1"]), registry
    )
    assert ids == ["layout-4", "svg-1"]


def test_resolve_benchmark_ids_conflicting_args(registry):
    with pytest.raises(SystemExit, match="either --suite or --benchmarks"):
        _resolve_benchmark_ids(
            _ns(suite="v0-all", benchmarks=["layout-4"]), registry
        )


def test_resolve_benchmark_ids_no_selection(registry):
    with pytest.raises(SystemExit, match="One of --suite or --benchmarks"):
        _resolve_benchmark_ids(_ns(suite=None, benchmarks=None), registry)


def test_resolve_benchmark_ids_unknown_suite(registry):
    with pytest.raises(SystemExit, match="Unknown suite"):
        _resolve_benchmark_ids(_ns(suite="v99-bogus", benchmarks=None), registry)


# ---------------------------------------------------------------------------
# _render_markdown_report
# ---------------------------------------------------------------------------


def test_render_markdown_report_empty_results():
    md = _render_markdown_report({"metadata": {"run_id": "abc"}, "results": {}})
    assert md.startswith("# GDB run report")
    assert "## Metadata" in md
    assert "- **run_id**: abc" in md
    assert "## Results" in md
    assert "_(empty)_" in md


def test_render_markdown_report_includes_table_header_and_rows():
    report = {
        "metadata": {},
        "results": {
            "category-1": {
                "openai:gpt-4o": {
                    "count": 10,
                    "failure_rate": 0.1,
                    "scores": {"accuracy": 0.8, "macro_f1": 0.75},
                },
            },
        },
    }
    md = _render_markdown_report(report)
    # Header row
    assert "| Benchmark | Model | n | fail_rate | accuracy | macro_f1 |" in md
    # Separator row
    assert "|---|---|---|---|---|---|" in md
    # Data row
    assert "| category-1 | openai:gpt-4o | 10 | 10.0% | 0.8000 | 0.7500 |" in md


def test_render_markdown_report_handles_missing_metric_as_em_dash():
    # One benchmark reports accuracy; another reports only f1. Each column
    # should render "—" when the metric is absent for a given row.
    report = {
        "metadata": {},
        "results": {
            "bench-a": {
                "stub": {
                    "count": 2,
                    "failure_rate": 0.0,
                    "scores": {"accuracy": 1.0},
                },
            },
            "bench-b": {
                "stub": {
                    "count": 2,
                    "failure_rate": 0.0,
                    "scores": {"f1": 0.5},
                },
            },
        },
    }
    md = _render_markdown_report(report)
    assert "| bench-a | stub | 2 | 0.0% | 1.0000 | — |" in md
    assert "| bench-b | stub | 2 | 0.0% | — | 0.5000 |" in md


def test_render_markdown_report_missing_metadata_omits_section():
    md = _render_markdown_report({"results": {}})
    assert "## Metadata" not in md


def test_render_markdown_report_output_is_parseable_markdown_table():
    # Sanity: the header + separator + at least one row are present in the
    # right order with consistent pipe counts.
    report = {
        "metadata": {"model": "stub"},
        "results": {
            "t1": {"stub": {"count": 1, "failure_rate": 0.0, "scores": {"x": 1.0}}}
        },
    }
    lines = _render_markdown_report(report).splitlines()
    table_lines = [line for line in lines if line.startswith("|")]
    assert len(table_lines) >= 3  # header, separator, at least one row
    pipe_counts = [line.count("|") for line in table_lines]
    assert len(set(pipe_counts)) == 1  # all rows have the same column count


# ---------------------------------------------------------------------------
# Sanity: we can still JSON-roundtrip the report renderer's input contract
# (guards against accidental changes to the expected dict shape).
# ---------------------------------------------------------------------------


def test_render_markdown_report_accepts_json_roundtripped_input():
    report = {
        "metadata": {"suite": "v0-smoke"},
        "results": {
            "category-1": {
                "stub": {"count": 2, "failure_rate": 0.0, "scores": {"acc": 0.5}}
            }
        },
    }
    roundtripped = json.loads(json.dumps(report))
    assert _render_markdown_report(roundtripped) == _render_markdown_report(report)
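Taken together, these tests pin the input contract of _render_markdown_report: results[benchmark_id][model_name] maps to a dict with count, failure_rate, and a scores mapping of metric name to float. A minimal sketch of rendering a report by hand, reusing only shapes that appear in the fixtures above:

# Minimal sketch: build a report matching the contract pinned above and
# print the rendered markdown (title, metadata bullets, results table).
from gdb.cli import _render_markdown_report

report = {
    "metadata": {"suite": "v0-smoke", "run_id": "local-demo"},
    "results": {
        "category-1": {
            "stub": {"count": 2, "failure_rate": 0.0, "scores": {"acc": 0.5}},
        },
    },
}
print(_render_markdown_report(report))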
63 changes: 63 additions & 0 deletions tests/test_cli_integration.py
@@ -0,0 +1,63 @@
"""End-to-end smoke tests for the ``gdb`` CLI.

These are intentionally cheap: they invoke ``python -m gdb`` as a subprocess
and check that a handful of read-only commands exit cleanly and print the
expected headings. Deeper logic coverage belongs in
:mod:`tests.test_cli_helpers` (pure functions); anything that requires model
inference belongs in the ``gdb verify`` runtime smoke path (executed out of
band).
"""

from __future__ import annotations

import subprocess
import sys


def _run(*args: str) -> subprocess.CompletedProcess:
    """Invoke ``python -m gdb`` with the given args."""
    return subprocess.run(
        [sys.executable, "-m", "gdb", *args],
        capture_output=True,
        text=True,
        check=False,
    )


def test_version_flag_prints_lica_gdb_version():
    result = _run("--version")
    assert result.returncode == 0, result.stderr
    assert result.stdout.strip().startswith("lica-gdb ")


def test_help_lists_core_subcommands():
    result = _run("--help")
    assert result.returncode == 0, result.stderr
    for subcmd in ("list", "info", "suites", "eval", "verify", "submit", "collect"):
        assert subcmd in result.stdout, f"`{subcmd}` missing from --help output"


def test_list_prints_39_benchmarks():
    result = _run("list")
    assert result.returncode == 0, result.stderr
    # The exact integer is worth asserting: it's the number the paper and
    # README both reference, and the CI smoke test also pins it.
    assert "39 benchmark(s)" in result.stdout


def test_suites_lists_all_v0_suites():
    result = _run("suites")
    assert result.returncode == 0, result.stderr
    for suite in ("v0-all", "v0-smoke", "v0-understanding", "v0-generation"):
        assert suite in result.stdout, f"suite `{suite}` missing from `gdb suites`"


def test_info_reports_known_benchmark():
    result = _run("info", "category-1")
    assert result.returncode == 0, result.stderr
    assert "category-1" in result.stdout
    assert "understanding" in result.stdout.lower()


def test_info_unknown_benchmark_exits_nonzero():
    result = _run("info", "does-not-exist")
    assert result.returncode != 0