diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c918677..fa4b432 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -18,9 +18,9 @@ jobs:
         with:
           python-version: "3.12"
       - run: pip install ruff
-      - run: ruff check src/ scripts/
+      - run: ruff check src/ scripts/ tests/
 
-  test-import:
+  test:
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -30,8 +30,8 @@ jobs:
       - uses: actions/setup-python@v5
         with:
          python-version: ${{ matrix.python-version }}
-      - run: pip install -e .
-      - name: Smoke test
+      - run: pip install -e ".[dev]"
+      - name: Registry sanity
         run: |
           python -c "
           from gdb import BenchmarkRegistry
@@ -40,4 +40,7 @@ jobs:
           ids = reg.list_ids()
           assert len(ids) == 39, f'Expected 39 benchmarks, got {len(ids)}'
           "
-          python scripts/run_benchmarks.py --list
+      - name: pytest
+        run: pytest tests/ -v
+      - name: Legacy shim smoke
+        run: python scripts/run_benchmarks.py --list
diff --git a/pyproject.toml b/pyproject.toml
index 7037324..c914561 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,6 +76,7 @@ hub = [
 ]
 dev = [
     "ruff",
+    "pytest>=7",
 ]
 
 [tool.setuptools.packages.find]
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..63a00f8
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,19 @@
+"""Shared pytest fixtures for the GDB test suite."""
+
+from __future__ import annotations
+
+import pytest
+
+from gdb.registry import BenchmarkRegistry
+
+
+@pytest.fixture(scope="session")
+def registry() -> BenchmarkRegistry:
+    """A fully-discovered registry shared across the test session.
+
+    Discovery walks :mod:`gdb.tasks` once; tests that only need to read the
+    registry (``list``, ``get``, ``list_ids``) can reuse this fixture freely.
+    """
+    reg = BenchmarkRegistry()
+    reg.discover()
+    return reg
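
A note on the fixture above: because it is session-scoped, every test that takes
``registry`` shares one object, so tests must treat it as read-only. For tests
that do need to mutate, a function-scoped variant along these lines would work
(hypothetical, not part of this diff; it reuses only ``BenchmarkRegistry`` and
``discover()`` as shown in ``conftest.py``):

    import pytest

    from gdb.registry import BenchmarkRegistry


    @pytest.fixture()
    def fresh_registry() -> BenchmarkRegistry:
        # Function scope: each test gets a private instance it may mutate,
        # at the cost of re-running discovery for every such test.
        reg = BenchmarkRegistry()
        reg.discover()
        return reg
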
diff --git a/tests/test_cli_helpers.py b/tests/test_cli_helpers.py
new file mode 100644
index 0000000..21f9a63
--- /dev/null
+++ b/tests/test_cli_helpers.py
@@ -0,0 +1,256 @@
+"""Tests for pure-function helpers in :mod:`gdb.cli`.
+
+The CLI has a lot of orchestration code; these tests cover the small, pure
+pieces that sit under argparse and above the model/runner layers. They don't
+exercise any network, model loading, or benchmark execution.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+
+import pytest
+
+from gdb.cli import (
+    _parse_json_dict_arg,
+    _parse_model_spec,
+    _render_markdown_report,
+    _resolve_benchmark_ids,
+)
+
+# ---------------------------------------------------------------------------
+# _parse_model_spec
+# ---------------------------------------------------------------------------
+
+
+def test_parse_model_spec_basic():
+    name, provider, model_id = _parse_model_spec("openai:gpt-4o")
+    assert name == "openai:gpt-4o"
+    assert provider == "openai"
+    assert model_id == "gpt-4o"
+
+
+def test_parse_model_spec_with_alias():
+    name, provider, model_id = _parse_model_spec("myalias=openai:gpt-4o")
+    assert name == "myalias"
+    assert provider == "openai"
+    assert model_id == "gpt-4o"
+
+
+def test_parse_model_spec_strips_whitespace():
+    name, provider, model_id = _parse_model_spec(
+        " myalias = anthropic : claude-haiku-4-5 "
+    )
+    assert name == "myalias"
+    assert provider == "anthropic"
+    assert model_id == "claude-haiku-4-5"
+
+
+def test_parse_model_spec_custom_entrypoint_with_colon():
+    # model_id can itself contain colons (e.g. ``module.path:attr``) because
+    # we split only on the first ``:``. This is the shape used by the
+    # ``custom`` provider.
+    name, provider, model_id = _parse_model_spec("custom:my_pkg.wrapper:build")
+    assert provider == "custom"
+    assert model_id == "my_pkg.wrapper:build"
+    assert name == "custom:my_pkg.wrapper:build"
+
+
+def test_parse_model_spec_missing_colon_raises():
+    with pytest.raises(ValueError, match="Invalid --multi-models spec"):
+        _parse_model_spec("openai-gpt-4o")
+
+
+def test_parse_model_spec_unknown_provider_raises():
+    with pytest.raises(ValueError) as excinfo:
+        _parse_model_spec("bogus:some-model")
+    msg = str(excinfo.value)
+    assert "Unknown provider" in msg
+    # The error should enumerate the valid providers.
+    assert "openai" in msg
+    assert "anthropic" in msg
+
+
+# ---------------------------------------------------------------------------
+# _parse_json_dict_arg
+# ---------------------------------------------------------------------------
+
+
+def test_parse_json_dict_arg_none_returns_empty_dict():
+    assert _parse_json_dict_arg(None, field_name="x") == {}
+
+
+def test_parse_json_dict_arg_empty_string_returns_empty_dict():
+    assert _parse_json_dict_arg("", field_name="x") == {}
+    assert _parse_json_dict_arg(" ", field_name="x") == {}
+
+
+def test_parse_json_dict_arg_dict_is_passthrough():
+    d = {"foo": 1, "bar": [2, 3]}
+    assert _parse_json_dict_arg(d, field_name="x") is d
+
+
+def test_parse_json_dict_arg_json_string():
+    assert _parse_json_dict_arg('{"k": "v"}', field_name="x") == {"k": "v"}
+
+
+def test_parse_json_dict_arg_reads_file(tmp_path):
+    path = tmp_path / "init.json"
+    path.write_text('{"checkpoint": "/models/foo", "dtype": "bfloat16"}')
+    result = _parse_json_dict_arg(str(path), field_name="custom init kwargs")
+    assert result == {"checkpoint": "/models/foo", "dtype": "bfloat16"}
+
+
+def test_parse_json_dict_arg_rejects_non_dict_json():
+    with pytest.raises(ValueError, match="JSON object/dict"):
+        _parse_json_dict_arg("[1, 2, 3]", field_name="x")
+
+
+def test_parse_json_dict_arg_rejects_wrong_type():
+    with pytest.raises(ValueError, match="JSON object/dict"):
+        _parse_json_dict_arg(42, field_name="x")
+
+
+# ---------------------------------------------------------------------------
+# _resolve_benchmark_ids
+# ---------------------------------------------------------------------------
+
+
+def _ns(**kwargs) -> argparse.Namespace:
+    return argparse.Namespace(**kwargs)
+
+
+def test_resolve_benchmark_ids_from_suite(registry):
+    ids = _resolve_benchmark_ids(
+        _ns(suite="v0-smoke", benchmarks=None), registry
+    )
+    assert "category-1" in ids
+    assert "svg-1" in ids
+
+
+def test_resolve_benchmark_ids_from_explicit_list(registry):
+    ids = _resolve_benchmark_ids(
+        _ns(suite=None, benchmarks=["layout-4", "svg-1"]), registry
+    )
+    assert ids == ["layout-4", "svg-1"]
+
+
+def test_resolve_benchmark_ids_conflicting_args(registry):
+    with pytest.raises(SystemExit, match="either --suite or --benchmarks"):
+        _resolve_benchmark_ids(
+            _ns(suite="v0-all", benchmarks=["layout-4"]), registry
+        )
+
+
+def test_resolve_benchmark_ids_no_selection(registry):
+    with pytest.raises(SystemExit, match="One of --suite or --benchmarks"):
+        _resolve_benchmark_ids(_ns(suite=None, benchmarks=None), registry)
+
+
+def test_resolve_benchmark_ids_unknown_suite(registry):
+    with pytest.raises(SystemExit, match="Unknown suite"):
+        _resolve_benchmark_ids(_ns(suite="v99-bogus", benchmarks=None), registry)
+
+
+# ---------------------------------------------------------------------------
+# _render_markdown_report
+# ---------------------------------------------------------------------------
+
+
+def test_render_markdown_report_empty_results():
+    md = _render_markdown_report({"metadata": {"run_id": "abc"}, "results": {}})
+    assert md.startswith("# GDB run report")
+    assert "## Metadata" in md
+    assert "- **run_id**: abc" in md
+    assert "## Results" in md
+    assert "_(empty)_" in md
+
+
+def test_render_markdown_report_includes_table_header_and_rows():
+    report = {
+        "metadata": {},
+        "results": {
+            "category-1": {
+                "openai:gpt-4o": {
+                    "count": 10,
+                    "failure_rate": 0.1,
+                    "scores": {"accuracy": 0.8, "macro_f1": 0.75},
+                },
+            },
+        },
+    }
+    md = _render_markdown_report(report)
+    # Header row
+    assert "| Benchmark | Model | n | fail_rate | accuracy | macro_f1 |" in md
+    # Separator row
+    assert "|---|---|---|---|---|---|" in md
+    # Data row
+    assert "| category-1 | openai:gpt-4o | 10 | 10.0% | 0.8000 | 0.7500 |" in md
+
+
+def test_render_markdown_report_handles_missing_metric_as_em_dash():
+    # One benchmark reports accuracy; another reports only f1. Each column
+    # should render "—" when the metric is absent for a given row.
+    report = {
+        "metadata": {},
+        "results": {
+            "bench-a": {
+                "stub": {
+                    "count": 2,
+                    "failure_rate": 0.0,
+                    "scores": {"accuracy": 1.0},
+                },
+            },
+            "bench-b": {
+                "stub": {
+                    "count": 2,
+                    "failure_rate": 0.0,
+                    "scores": {"f1": 0.5},
+                },
+            },
+        },
+    }
+    md = _render_markdown_report(report)
+    assert "| bench-a | stub | 2 | 0.0% | 1.0000 | — |" in md
+    assert "| bench-b | stub | 2 | 0.0% | — | 0.5000 |" in md
+
+
+def test_render_markdown_report_missing_metadata_omits_section():
+    md = _render_markdown_report({"results": {}})
+    assert "## Metadata" not in md
+
+
+def test_render_markdown_report_output_is_parseable_markdown_table():
+    # Sanity: the header + separator + at least one row are present in the
+    # right order with consistent pipe counts.
+    report = {
+        "metadata": {"model": "stub"},
+        "results": {
+            "t1": {"stub": {"count": 1, "failure_rate": 0.0, "scores": {"x": 1.0}}}
+        },
+    }
+    lines = _render_markdown_report(report).splitlines()
+    table_lines = [line for line in lines if line.startswith("|")]
+    assert len(table_lines) >= 3  # header, separator, at least one row
+    pipe_counts = [line.count("|") for line in table_lines]
+    assert len(set(pipe_counts)) == 1  # all rows have the same column count
+
+
+# ---------------------------------------------------------------------------
+# Sanity: we can still JSON-roundtrip the report renderer's input contract
+# (guards against accidental changes to the expected dict shape).
+# ---------------------------------------------------------------------------
+
+
+def test_render_markdown_report_accepts_json_roundtripped_input():
+    report = {
+        "metadata": {"suite": "v0-smoke"},
+        "results": {
+            "category-1": {
+                "stub": {"count": 2, "failure_rate": 0.0, "scores": {"acc": 0.5}}
+            }
+        },
+    }
+    roundtripped = json.loads(json.dumps(report))
+    assert _render_markdown_report(roundtripped) == _render_markdown_report(report)
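
For readers reviewing the contract rather than the code: the ``_parse_model_spec``
tests above pin down a ``[alias=]provider:model_id`` grammar. Below is a minimal
sketch that satisfies those tests, assuming a provider whitelist like the one the
error message enumerates. ``KNOWN_PROVIDERS`` and ``parse_model_spec`` are
illustrative names, not the actual implementation in ``gdb.cli``:

    KNOWN_PROVIDERS = ("openai", "anthropic", "custom")  # assumed provider set


    def parse_model_spec(spec: str) -> tuple[str, str, str]:
        """Split ``[alias=]provider:model_id`` into (name, provider, model_id)."""
        alias, sep, rest = spec.partition("=")
        if not sep:  # no alias given
            alias, rest = "", spec
        alias, rest = alias.strip(), rest.strip()
        if ":" not in rest:
            raise ValueError(f"Invalid --multi-models spec: {spec!r}")
        # Split on the FIRST colon only, so model_id may itself contain
        # colons (the ``custom:module.path:attr`` shape).
        provider, _, model_id = rest.partition(":")
        provider, model_id = provider.strip(), model_id.strip()
        if provider not in KNOWN_PROVIDERS:
            raise ValueError(
                f"Unknown provider {provider!r}; expected one of {KNOWN_PROVIDERS}"
            )
        return alias or f"{provider}:{model_id}", provider, model_id

When no alias is supplied, the display name falls back to the normalized
``provider:model_id`` string, which is exactly what the basic and custom-provider
tests assert.
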
diff --git a/tests/test_cli_integration.py b/tests/test_cli_integration.py
new file mode 100644
index 0000000..5bafaf6
--- /dev/null
+++ b/tests/test_cli_integration.py
@@ -0,0 +1,63 @@
+"""End-to-end smoke tests for the ``gdb`` CLI.
+
+These are intentionally cheap: they invoke ``python -m gdb`` as a subprocess
+and check that a handful of read-only commands exit cleanly and print the
+expected headings. Pure helper logic is covered in
+:mod:`tests.test_cli_helpers`; anything that requires model inference lives
+in the ``gdb verify`` runtime smoke path (executed out of band).
+"""
+
+from __future__ import annotations
+
+import subprocess
+import sys
+
+
+def _run(*args: str) -> subprocess.CompletedProcess:
+    """Invoke ``python -m gdb`` with the given args."""
+    return subprocess.run(
+        [sys.executable, "-m", "gdb", *args],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+
+
+def test_version_flag_prints_lica_gdb_version():
+    result = _run("--version")
+    assert result.returncode == 0, result.stderr
+    assert result.stdout.strip().startswith("lica-gdb ")
+
+
+def test_help_lists_core_subcommands():
+    result = _run("--help")
+    assert result.returncode == 0, result.stderr
+    for subcmd in ("list", "info", "suites", "eval", "verify", "submit", "collect"):
+        assert subcmd in result.stdout, f"`{subcmd}` missing from --help output"
+
+
+def test_list_prints_39_benchmarks():
+    result = _run("list")
+    assert result.returncode == 0, result.stderr
+    # The exact integer is worth asserting: it's the number the paper and
+    # README both reference, and the CI smoke test also pins it.
+    assert "39 benchmark(s)" in result.stdout
+
+
+def test_suites_lists_all_v0_suites():
+    result = _run("suites")
+    assert result.returncode == 0, result.stderr
+    for suite in ("v0-all", "v0-smoke", "v0-understanding", "v0-generation"):
+        assert suite in result.stdout, f"suite `{suite}` missing from `gdb suites`"
+
+
+def test_info_reports_known_benchmark():
+    result = _run("info", "category-1")
+    assert result.returncode == 0, result.stderr
+    assert "category-1" in result.stdout
+    assert "understanding" in result.stdout.lower()
+
+
+def test_info_unknown_benchmark_exits_nonzero():
+    result = _run("info", "does-not-exist")
+    assert result.returncode != 0
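
Stitching together the strings asserted in ``tests/test_cli_helpers.py`` gives a
rough picture of what ``_render_markdown_report`` emits for a run with metadata
and one populated result row. This is a composite reconstructed from the
individual assertions, not captured from real output:

    # GDB run report

    ## Metadata

    - **run_id**: abc

    ## Results

    | Benchmark | Model | n | fail_rate | accuracy | macro_f1 |
    |---|---|---|---|---|---|
    | category-1 | openai:gpt-4o | 10 | 10.0% | 0.8000 | 0.7500 |
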
diff --git a/tests/test_suites.py b/tests/test_suites.py
new file mode 100644
index 0000000..0d811d8
--- /dev/null
+++ b/tests/test_suites.py
@@ -0,0 +1,105 @@
+"""Tests for :mod:`gdb.suites`.
+
+These tests use a real discovered registry so suite routing is validated
+against the actual task set. That means adding a task may also require a
+suite-expectation update here if it changes the understanding/generation
+balance; we consider that a feature rather than a bug.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from gdb.suites import describe_suite, list_suites, resolve_suite
+
+KNOWN_SUITES = {"v0-all", "v0-smoke", "v0-understanding", "v0-generation"}
+
+
+def test_list_suites_contains_all_known():
+    names = set(list_suites())
+    assert KNOWN_SUITES.issubset(names)
+
+
+def test_list_suites_is_sorted_and_unique():
+    names = list_suites()
+    assert names == sorted(set(names))
+
+
+def test_v0_smoke_returns_fixed_task_ids(registry):
+    # Static suite: must match the hardcoded list in gdb.suites. If this ever
+    # changes, the bundled verify fixture in src/gdb/_verify_data/ probably
+    # needs to change too.
+    expected = {
+        "category-1",
+        "layout-4",
+        "layout-5",
+        "typography-1",
+        "svg-1",
+        "template-1",
+    }
+    assert set(resolve_suite("v0-smoke", registry)) == expected
+
+
+def test_v0_all_matches_registry_contents(registry):
+    ids = resolve_suite("v0-all", registry)
+    assert set(ids) == {b.meta.id for b in registry.list()}
+    assert ids == sorted(ids)
+
+
+def test_v0_understanding_and_generation_partition_v0_all(registry):
+    u = set(resolve_suite("v0-understanding", registry))
+    g = set(resolve_suite("v0-generation", registry))
+    all_ = set(resolve_suite("v0-all", registry))
+    assert u | g == all_
+    assert u & g == set()
+
+
+def test_v0_understanding_is_nonempty_and_sorted(registry):
+    ids = resolve_suite("v0-understanding", registry)
+    assert ids
+    assert ids == sorted(ids)
+
+
+def test_v0_generation_is_nonempty_and_sorted(registry):
+    ids = resolve_suite("v0-generation", registry)
+    assert ids
+    assert ids == sorted(ids)
+
+
+def test_resolve_suite_returns_list_copy(registry):
+    # Mutating the returned list must not corrupt the cached static suite.
+    a = resolve_suite("v0-smoke", registry)
+    a.append("not-a-real-benchmark")
+    b = resolve_suite("v0-smoke", registry)
+    assert "not-a-real-benchmark" not in b
+
+
+def test_resolve_suite_unknown_raises_keyerror(registry):
+    with pytest.raises(KeyError) as excinfo:
+        resolve_suite("v99-all", registry)
+    msg = str(excinfo.value)
+    assert "v99-all" in msg
+    # The message should list the known suites to help users.
+    assert "v0-all" in msg
+    assert "v0-smoke" in msg
+
+
+def test_describe_suite_static_shape(registry):
+    info = describe_suite("v0-smoke", registry)
+    assert info["name"] == "v0-smoke"
+    assert info["kind"] == "static"
+    assert info["n_tasks"] == len(info["task_ids"])
+    assert info["n_tasks"] > 0
+
+
+def test_describe_suite_dynamic_shape(registry):
+    info = describe_suite("v0-all", registry)
+    assert info["name"] == "v0-all"
+    assert info["kind"] == "dynamic"
+    assert info["n_tasks"] == len(info["task_ids"])
+    assert info["n_tasks"] == len(list(registry.list()))
+
+
+def test_describe_suite_unknown_propagates_keyerror(registry):
+    with pytest.raises(KeyError):
+        describe_suite("nope", registry)
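
Taken together, the suite tests imply a small static/dynamic split inside
``gdb.suites``. A minimal sketch consistent with those tests follows; it is
hypothetical, ``STATIC`` and ``DYNAMIC`` are illustrative names, and only
``resolve_suite`` mirrors a real identifier from the diff:

    from typing import Callable, Dict, List

    # Static suites pin an explicit id list; dynamic suites are computed
    # from the registry at resolve time.
    STATIC: Dict[str, List[str]] = {
        "v0-smoke": [
            "category-1", "layout-4", "layout-5",
            "svg-1", "template-1", "typography-1",
        ],
    }
    DYNAMIC: Dict[str, Callable] = {
        "v0-all": lambda registry: sorted(b.meta.id for b in registry.list()),
    }


    def resolve_suite(name: str, registry) -> List[str]:
        if name in STATIC:
            return list(STATIC[name])  # fresh copy, so callers may mutate it
        if name in DYNAMIC:
            return DYNAMIC[name](registry)
        known = sorted([*STATIC, *DYNAMIC])
        raise KeyError(f"Unknown suite {name!r}; known suites: {known}")

Returning a copy for static suites is what ``test_resolve_suite_returns_list_copy``
guards, and embedding the known-suite list in the ``KeyError`` is what makes the
``v99-all`` error message actionable.
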