feat: Add pytest integration (#25)

AbhiPrasad · web-flow · commit 97da34df8474 · 2026-03-06T12:44:47.000-05:00
diff --git a/py/noxfile.py b/py/noxfile.py
@@ -77,6 +77,7 @@
 GOOGLE_ADK_VERSIONS = (LATEST, "1.14.1")
 # temporalio 1.19.0+ requires Python >= 3.10; skip Python 3.9 entirely
 TEMPORAL_VERSIONS = (LATEST, "1.20.0", "1.19.0")
+PYTEST_VERSIONS = (LATEST, "8.4.2")
 
 
 @nox.session()
@@ -254,6 +255,14 @@ def test_cli(session):
     _run_tests(session, "braintrust/devserver/test_server_integration.py")
 
 
+@nox.session()
+@nox.parametrize("version", PYTEST_VERSIONS, ids=PYTEST_VERSIONS)
+def test_pytest_plugin(session, version):
+    """Run the Braintrust pytest plugin tests against one pytest version."""
+    _install_test_deps(session)
+    # pytest is installed after the shared test dependencies; presumably this
+    # pins the parametrized version over whatever they pulled in (verify
+    # against ``_install``).
+    _install(session, "pytest", version)
+    plugin_tests = f"{WRAPPER_DIR}/pytest_plugin/test_plugin.py"
+    _run_tests(session, plugin_tests)
+
+
 @nox.session()
 def test_otel(session):
     """Test OtelExporter with OpenTelemetry installed."""
@@ -390,6 +399,11 @@ def _run_tests(session, test_path, ignore_path="", ignore_paths=None, env=None):
         # Run the tests in the src directory
         test_args = [
             "pytest",
+            # Disable the braintrust pytest plugin (registered via pytest11 entry
+            # point) to avoid ImportPathMismatchError when the installed package
+            # and the source tree both contain braintrust/conftest.py.
+            "-p",
+            "no:braintrust",
             f"src/{test_path}",
         ]
         for path in paths_to_ignore:
diff --git a/py/setup.py b/py/setup.py
@@ -58,7 +58,10 @@
     packages=setuptools.find_packages(where="src"),
     package_data={"braintrust": ["py.typed"]},
     python_requires=">=3.10.0",
-    entry_points={"console_scripts": ["braintrust = braintrust.cli.__main__:main"]},
+    entry_points={
+        "console_scripts": ["braintrust = braintrust.cli.__main__:main"],
+        "pytest11": ["braintrust = braintrust.wrappers.pytest_plugin.plugin"],
+    },
     install_requires=install_requires,
     extras_require=extras_require,
 )
diff --git a/py/src/braintrust/wrappers/pytest_plugin/README.md b/py/src/braintrust/wrappers/pytest_plugin/README.md
@@ -0,0 +1,114 @@
+# Braintrust Pytest Plugin
+
+Automatically track pytest test results as [Braintrust](https://www.braintrust.dev) experiments. Mark tests with `@pytest.mark.braintrust`, run with `--braintrust`, and get experiment tracking with spans, pass/fail scores, and custom metrics.
+
+## Requirements
+
+- Python >= 3.10
+- pytest >= 8
+
+## Installation
+
+The plugin is included with the Braintrust SDK and auto-registers via the `pytest11` entry point:
+
+```bash
+pip install braintrust
+```
+
+## Quick Start
+
+```python
+import pytest
+
+@pytest.mark.braintrust
+def test_my_llm(braintrust_span):
+    result = my_llm("hello")
+    braintrust_span.log(input={"query": "hello"}, output=result)
+    assert "greeting" in result
+```
+
+Run with:
+
+```bash
+pytest --braintrust --braintrust-project="my-project"
+```
+
+## Configuration
+
+### CLI Options
+
+| Option                    | Env Var                 | Description                               |
+| ------------------------- | ----------------------- | ----------------------------------------- |
+| `--braintrust`            | —                       | Enable experiment tracking (required)     |
+| `--braintrust-project`    | `BRAINTRUST_PROJECT`    | Project name (default: test module name)  |
+| `--braintrust-experiment` | `BRAINTRUST_EXPERIMENT` | Experiment name (default: auto-generated) |
+| `--braintrust-api-key`    | `BRAINTRUST_API_KEY`    | API key for Braintrust                    |
+| `--braintrust-no-summary` | —                       | Suppress terminal summary                 |
+
+### Marker kwargs
+
+```python
+@pytest.mark.braintrust(
+    project="my-project",      # Override project for this test/class
+    input={"query": "hello"},  # Static input data
+    expected={"answer": "hi"}, # Expected output
+    tags=["regression"],       # Tags for the span
+    metadata={"model": "gpt-4"},  # Additional metadata
+)
+```
+
+## Logging Data
+
+The `braintrust_span` fixture is a standard [`Span`](https://www.braintrust.dev/docs/reference/python#span) object. Use `braintrust_span.log()` to record data:
+
+```python
+@pytest.mark.braintrust
+def test_example(braintrust_span):
+    braintrust_span.log(
+        input={"query": "hello"},
+        output={"response": "world"},
+        expected={"response": "world"},
+        scores={"accuracy": 0.95},
+        metadata={"model": "gpt-4"},
+    )
+```
+
+When `--braintrust` is not passed, or the test is not marked with `@pytest.mark.braintrust`, the fixture returns a no-op span that silently discards all logged data, so tests still pass normally.
+
+## Experiment Grouping
+
+By default, one experiment is created per test module. Override with:
+
+- **CLI**: `--braintrust-project="name"` (applies to all tests)
+- **Marker**: `@pytest.mark.braintrust(project="name")` (per-test or per-class)
+
+## Data-Driven Tests
+
+Parametrized test arguments are automatically logged as input:
+
+```python
+@pytest.mark.braintrust
+@pytest.mark.parametrize("query,expected_answer", [
+    ("2+2?", "4"),
+    ("Capital of France?", "Paris"),
+])
+def test_qa(braintrust_span, query, expected_answer):
+    result = my_llm(query)
+    braintrust_span.log(output=result)
+```
+
+Each parametrized case becomes a separate span with `input: {"query": "2+2?", "expected_answer": "4"}`.
+
+If you provide `input` via the marker, it takes precedence over auto-logged arguments.
+
+## Viewing Results
+
+After a test run, the terminal shows an experiment summary:
+
+```
+=========================SUMMARY=========================
+95.50% 'pass'  score
+
+See results for my-test-2026-03-02T15:30:00 at https://www.braintrust.dev/app/...
+```
+
+Click the URL to view detailed results in the Braintrust UI.
diff --git a/py/src/braintrust/wrappers/pytest_plugin/__init__.py b/py/src/braintrust/wrappers/pytest_plugin/__init__.py
diff --git a/py/src/braintrust/wrappers/pytest_plugin/plugin.py b/py/src/braintrust/wrappers/pytest_plugin/plugin.py
@@ -0,0 +1,217 @@
+"""Braintrust pytest plugin.
+
+Automatically tracks test results as Braintrust experiments when tests are
+decorated with ``@pytest.mark.braintrust`` and run with ``--braintrust``.
+
+The plugin is registered via the ``pytest11`` entry point in ``setup.py``
+so it auto-loads when braintrust is installed.
+"""
+
+from __future__ import annotations
+
+import traceback
+from typing import TYPE_CHECKING, Any
+
+import pytest
+
+if TYPE_CHECKING:
+    from braintrust.logger import Span
+
+# ---------------------------------------------------------------------------
+# Marker registration & CLI options (always active)
+# ---------------------------------------------------------------------------
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    """Register the ``--braintrust*`` command-line options.
+
+    Every option is added to a dedicated "braintrust" option group.  Boolean
+    switches use ``store_true`` and default to ``False``; the value-carrying
+    options use ``store`` and default to ``None``.
+    """
+    switch = {"action": "store_true", "default": False}
+    value = {"action": "store", "default": None}
+
+    # (option name, argparse behaviour, help text), in registration order.
+    option_specs = (
+        (
+            "--braintrust",
+            switch,
+            "Enable Braintrust experiment tracking for @pytest.mark.braintrust tests.",
+        ),
+        (
+            "--braintrust-project",
+            value,
+            "Override the Braintrust project name for all tests (env: BRAINTRUST_PROJECT).",
+        ),
+        (
+            "--braintrust-experiment",
+            value,
+            "Override the experiment name (env: BRAINTRUST_EXPERIMENT).",
+        ),
+        (
+            "--braintrust-api-key",
+            value,
+            "Braintrust API key (env: BRAINTRUST_API_KEY).",
+        ),
+        (
+            "--braintrust-no-summary",
+            switch,
+            "Suppress the experiment summary at the end of the session.",
+        ),
+    )
+
+    group = parser.getgroup("braintrust", "Braintrust experiment tracking")
+    for option_name, behaviour, help_text in option_specs:
+        group.addoption(option_name, help=help_text, **behaviour)
+
+
+def pytest_configure(config: pytest.Config) -> None:
+    """Register the ``braintrust`` marker and, if enabled, the tracking plugin.
+
+    The marker line is added on every run.  The hook implementation class is
+    only instantiated and registered (under the name ``braintrust-plugin``)
+    when ``--braintrust`` was passed.
+    """
+    # Registered unconditionally so ``@pytest.mark.braintrust`` is a known
+    # marker even when tracking is switched off.
+    config.addinivalue_line(
+        "markers",
+        "braintrust: mark test for Braintrust experiment tracking. "
+        "Optional kwargs: project, input, expected, tags, metadata.",
+    )
+
+    if not config.getoption("--braintrust", default=False):
+        return
+
+    import os
+
+    # A key given on the command line is exported through the environment;
+    # a key that is already in the environment is left untouched.
+    cli_api_key = config.getoption("--braintrust-api-key")
+    if cli_api_key:
+        os.environ["BRAINTRUST_API_KEY"] = cli_api_key
+
+    config.pluginmanager.register(BraintrustPytestPlugin(config), "braintrust-plugin")
+
+
+# ---------------------------------------------------------------------------
+# braintrust_span fixture (always available)
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def braintrust_span(request: pytest.FixtureRequest) -> Span:
+    """Provide the Braintrust span attached to the requesting test.
+
+    The span is read from the ``_braintrust_span`` attribute of the test
+    node.  When the node has no such attribute, ``NOOP_SPAN`` from
+    ``braintrust.logger`` is returned instead, so tests can call the fixture
+    unconditionally.
+    """
+    from braintrust.logger import NOOP_SPAN
+
+    try:
+        return request.node._braintrust_span
+    except AttributeError:
+        return NOOP_SPAN
+
+
+# ---------------------------------------------------------------------------
+# Plugin class (registered only when --braintrust is passed)
+# ---------------------------------------------------------------------------
+
+
+class BraintrustPytestPlugin:
+    """Core hook implementation — registered on the pluginmanager only when
+    ``--braintrust`` is active.
+
+    For every test carrying the ``braintrust`` marker the plugin starts a span
+    during setup, logs a ``pass`` score (plus the failure text, if any) during
+    teardown, and summarizes and flushes every experiment it created when the
+    session finishes.
+    """
+
+    def __init__(self, config: pytest.Config) -> None:
+        """Read the project/experiment overrides from *config* or the environment.
+
+        The command-line value wins; ``BRAINTRUST_PROJECT`` and
+        ``BRAINTRUST_EXPERIMENT`` are the fallbacks advertised in the option
+        help text.  Empty values are treated as "not set".
+        """
+        import os
+
+        self._config = config
+        self._cli_project: str | None = (
+            config.getoption("--braintrust-project") or os.environ.get("BRAINTRUST_PROJECT") or None
+        )
+        self._cli_experiment: str | None = (
+            config.getoption("--braintrust-experiment") or os.environ.get("BRAINTRUST_EXPERIMENT") or None
+        )
+
+        # Keyed by experiment group key (module path or project override).
+        self.experiments: dict[str, Any] = {}
+
+    # -- helpers ------------------------------------------------------------
+
+    def _get_experiment_key(self, item: pytest.Item) -> str:
+        """Determine the experiment grouping key for *item*.
+
+        Precedence: global project override, then the ``project`` kwarg of the
+        closest ``braintrust`` marker, then the name of the item's module, and
+        finally the file part of the node id.
+        """
+        # Global override (command line or environment) applies to every test.
+        if self._cli_project:
+            return self._cli_project
+
+        # Per-test / per-class marker override.
+        marker = item.get_closest_marker("braintrust")
+        if marker and marker.kwargs.get("project"):
+            return marker.kwargs["project"]
+
+        # Default: module name.  ``getattr`` keeps items that expose no
+        # ``module`` attribute from raising AttributeError here.
+        module = getattr(item, "module", None)
+        if module:
+            return module.__name__
+        return item.nodeid.split("::")[0]
+
+    def _get_or_create_experiment(self, key: str) -> Any:
+        """Return the experiment cached under *key*, creating it on first use.
+
+        The key doubles as the project name passed to ``braintrust.init``.
+        """
+        if key not in self.experiments:
+            import braintrust
+
+            self.experiments[key] = braintrust.init(project=key, experiment=self._cli_experiment)
+        return self.experiments[key]
+
+    def _collect_auto_input(self, item: pytest.Item) -> dict[str, Any] | None:
+        """Return the item's parametrize arguments as a dict, or ``None`` when
+        the item has no ``callspec``."""
+        if hasattr(item, "callspec"):
+            return dict(item.callspec.params)
+        return None
+
+    # -- hooks --------------------------------------------------------------
+
+    @pytest.hookimpl(tryfirst=True)
+    def pytest_runtest_setup(self, item: pytest.Item) -> None:
+        """Start a span for a ``braintrust``-marked test and log its static data.
+
+        Unmarked tests are ignored.  Marker ``input`` takes precedence over the
+        auto-collected parametrize arguments.
+        """
+        marker = item.get_closest_marker("braintrust")
+        if marker is None:
+            return
+
+        from braintrust.span_types import SpanTypeAttribute
+
+        key = self._get_experiment_key(item)
+        exp = self._get_or_create_experiment(key)
+
+        span = exp.start_span(name=item.name, type=SpanTypeAttribute.EVAL)
+
+        # Attach and activate the span before logging anything: if the log
+        # call below raises, teardown can still find, end and unset the span
+        # instead of leaving it open.
+        item._braintrust_span = span  # type: ignore[attr-defined]
+        item._braintrust_experiment_key = key  # type: ignore[attr-defined]
+        span.set_current()
+
+        # Collect all static marker data into a single log call.
+        marker_kwargs: dict[str, Any] = {}
+
+        marker_input = marker.kwargs.get("input")
+        if marker_input is not None:
+            marker_kwargs["input"] = marker_input
+        else:
+            auto_input = self._collect_auto_input(item)
+            if auto_input is not None:
+                marker_kwargs["input"] = auto_input
+
+        for field in ("expected", "metadata", "tags"):
+            value = marker.kwargs.get(field)
+            if value is not None:
+                marker_kwargs[field] = value
+
+        if marker_kwargs:
+            span.log(**marker_kwargs)
+
+    @pytest.hookimpl(hookwrapper=True)
+    def pytest_runtest_makereport(self, item: pytest.Item, call: pytest.CallInfo) -> None:  # type: ignore[type-arg]
+        """Remember the report that teardown should score the test from.
+
+        The call-phase report is always kept.  A failed setup report is kept
+        too, so that the failure text is logged rather than only a zero score.
+        """
+        outcome = yield
+        report = outcome.get_result()
+        if call.when == "call" or (call.when == "setup" and report.failed):
+            item._braintrust_report = report  # type: ignore[attr-defined]
+
+    @pytest.hookimpl(trylast=True)
+    def pytest_runtest_teardown(self, item: pytest.Item) -> None:
+        """Log the ``pass`` score (and failure text) and close the test's span.
+
+        A test without a stored report, or whose report did not pass, scores
+        ``0.0``.  The span is ended and unset even if logging raises.
+        """
+        span: Span | None = getattr(item, "_braintrust_span", None)
+        if span is None:
+            return
+
+        report = getattr(item, "_braintrust_report", None)
+        passed = report.passed if report else False
+
+        result: dict[str, Any] = {"scores": {"pass": 1.0 if passed else 0.0}}
+        if report and report.failed:
+            result["error"] = report.longreprtext if hasattr(report, "longreprtext") else str(report.longrepr)
+
+        try:
+            span.log(**result)
+        finally:
+            # Always close the span and restore the previous current span so
+            # a logging error cannot leak this span into later tests.
+            try:
+                span.end()
+            finally:
+                span.unset_current()
+
+    def pytest_sessionfinish(self, session: pytest.Session) -> None:
+        """Summarize and flush every experiment created during the session.
+
+        Errors are printed rather than raised, and a failing summary does not
+        prevent the flush attempt.  The summary is printed unless
+        ``--braintrust-no-summary`` was given.
+        """
+        show_summary = not self._config.getoption("--braintrust-no-summary", default=False)
+
+        for exp in self.experiments.values():
+            try:
+                summary = exp.summarize()
+                if show_summary:
+                    print(summary)
+            except Exception:
+                traceback.print_exc()
+
+            # Flush separately so that a summary failure cannot skip it.
+            try:
+                exp.flush()
+            except Exception:
+                traceback.print_exc()
diff --git a/py/src/braintrust/wrappers/pytest_plugin/test_plugin.py b/py/src/braintrust/wrappers/pytest_plugin/test_plugin.py