Skip to content

Commit 97da34d

Browse files
authored
feat: Add pytest integration (#25)
1 parent fdd0202 commit 97da34d

File tree

6 files changed

+742
-1
lines changed

6 files changed

+742
-1
lines changed

py/noxfile.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@
7777
GOOGLE_ADK_VERSIONS = (LATEST, "1.14.1")
7878
# temporalio 1.19.0+ requires Python >= 3.10; skip Python 3.9 entirely
7979
TEMPORAL_VERSIONS = (LATEST, "1.20.0", "1.19.0")
80+
PYTEST_VERSIONS = (LATEST, "8.4.2")
8081

8182

8283
@nox.session()
@@ -254,6 +255,14 @@ def test_cli(session):
254255
_run_tests(session, "braintrust/devserver/test_server_integration.py")
255256

256257

258+
@nox.session()
@nox.parametrize("version", PYTEST_VERSIONS, ids=PYTEST_VERSIONS)
def test_pytest_plugin(session, version):
    """Run the pytest plugin test file once per entry in PYTEST_VERSIONS."""
    plugin_tests = f"{WRAPPER_DIR}/pytest_plugin/test_plugin.py"

    # Install the shared test dependencies first, then the parametrized
    # pytest version.
    _install_test_deps(session)
    _install(session, "pytest", version)

    _run_tests(session, plugin_tests)
264+
265+
257266
@nox.session()
258267
def test_otel(session):
259268
"""Test OtelExporter with OpenTelemetry installed."""
@@ -390,6 +399,11 @@ def _run_tests(session, test_path, ignore_path="", ignore_paths=None, env=None):
390399
# Run the tests in the src directory
391400
test_args = [
392401
"pytest",
402+
# Disable the braintrust pytest plugin (registered via pytest11 entry
403+
# point) to avoid ImportPathMismatchError when the installed package
404+
# and the source tree both contain braintrust/conftest.py.
405+
"-p",
406+
"no:braintrust",
393407
f"src/{test_path}",
394408
]
395409
for path in paths_to_ignore:

py/setup.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,10 @@
5858
packages=setuptools.find_packages(where="src"),
5959
package_data={"braintrust": ["py.typed"]},
6060
python_requires=">=3.10.0",
61-
entry_points={"console_scripts": ["braintrust = braintrust.cli.__main__:main"]},
61+
entry_points={
62+
"console_scripts": ["braintrust = braintrust.cli.__main__:main"],
63+
"pytest11": ["braintrust = braintrust.wrappers.pytest_plugin.plugin"],
64+
},
6265
install_requires=install_requires,
6366
extras_require=extras_require,
6467
)
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# Braintrust Pytest Plugin
2+
3+
Automatically track pytest test results as [Braintrust](https://www.braintrust.dev) experiments. Mark tests with `@pytest.mark.braintrust`, run with `--braintrust`, and get experiment tracking with spans, pass/fail scores, and custom metrics.
4+
5+
## Requirements
6+
7+
- Python >= 3.10
8+
- pytest >= 8
9+
10+
## Installation
11+
12+
The plugin is included with the Braintrust SDK and auto-registers via the `pytest11` entry point:
13+
14+
```bash
15+
pip install braintrust
16+
```
17+
18+
## Quick Start
19+
20+
```python
21+
import pytest
22+
23+
@pytest.mark.braintrust
24+
def test_my_llm(braintrust_span):
25+
result = my_llm("hello")
26+
braintrust_span.log(input={"query": "hello"}, output=result)
27+
assert "greeting" in result
28+
```
29+
30+
Run with:
31+
32+
```bash
33+
pytest --braintrust --braintrust-project="my-project"
34+
```
35+
36+
## Configuration
37+
38+
### CLI Options
39+
40+
| Option | Env Var | Description |
41+
| ------------------------- | ----------------------- | ----------------------------------------- |
42+
| `--braintrust`            | —                       | Enable experiment tracking (required)     |
43+
| `--braintrust-project` | `BRAINTRUST_PROJECT` | Project name (default: test module name) |
44+
| `--braintrust-experiment` | `BRAINTRUST_EXPERIMENT` | Experiment name (default: auto-generated) |
45+
| `--braintrust-api-key` | `BRAINTRUST_API_KEY` | API key for Braintrust |
46+
| `--braintrust-no-summary` | —                       | Suppress terminal summary                 |
47+
48+
### Marker kwargs
49+
50+
```python
51+
@pytest.mark.braintrust(
52+
project="my-project", # Override project for this test/class
53+
input={"query": "hello"}, # Static input data
54+
expected={"answer": "hi"}, # Expected output
55+
tags=["regression"], # Tags for the span
56+
metadata={"model": "gpt-4"}, # Additional metadata
57+
)
58+
```
59+
60+
## Logging Data
61+
62+
The `braintrust_span` fixture is a standard [`Span`](https://www.braintrust.dev/docs/reference/python#span) object. Use `span.log()` to record data:
63+
64+
```python
65+
def test_example(braintrust_span):
66+
braintrust_span.log(
67+
input={"query": "hello"},
68+
output={"response": "world"},
69+
expected={"response": "world"},
70+
scores={"accuracy": 0.95},
71+
metadata={"model": "gpt-4"},
72+
)
73+
```
74+
75+
When `--braintrust` is not passed, the fixture returns a no-op span that silently discards all logged data, so tests still pass normally.
76+
77+
## Experiment Grouping
78+
79+
By default, one experiment is created per test module. Override with:
80+
81+
- **CLI**: `--braintrust-project="name"` (applies to all tests)
82+
- **Marker**: `@pytest.mark.braintrust(project="name")` (per-test or per-class)
83+
84+
## Data-Driven Tests
85+
86+
Parametrized test arguments are automatically logged as input:
87+
88+
```python
89+
@pytest.mark.braintrust
90+
@pytest.mark.parametrize("query,expected_answer", [
91+
("2+2?", "4"),
92+
("Capital of France?", "Paris"),
93+
])
94+
def test_qa(braintrust_span, query, expected_answer):
95+
result = my_llm(query)
96+
braintrust_span.log(output=result)
97+
```
98+
99+
Each parametrized case becomes a separate span with `input: {"query": "2+2?", "expected_answer": "4"}`.
100+
101+
If you provide `input` via the marker, it takes precedence over auto-logged arguments.
102+
103+
## Viewing Results
104+
105+
After a test run, the terminal shows an experiment summary:
106+
107+
```
108+
=========================SUMMARY=========================
109+
95.50% 'pass' score
110+
111+
See results for my-test-2026-03-02T15:30:00 at https://www.braintrust.dev/app/...
112+
```
113+
114+
Click the URL to view detailed results in the Braintrust UI.

py/src/braintrust/wrappers/pytest_plugin/__init__.py

Whitespace-only changes.
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
"""Braintrust pytest plugin.
2+
3+
Automatically tracks test results as Braintrust experiments when tests are
4+
decorated with ``@pytest.mark.braintrust`` and run with ``--braintrust``.
5+
6+
The plugin is registered via the ``pytest11`` entry point in ``setup.py``
7+
so it auto-loads when braintrust is installed.
8+
"""
9+
10+
from __future__ import annotations
11+
12+
import traceback
13+
from typing import TYPE_CHECKING, Any
14+
15+
import pytest
16+
17+
if TYPE_CHECKING:
18+
from braintrust.logger import Span
19+
20+
# ---------------------------------------------------------------------------
21+
# Marker registration & CLI options (always active)
22+
# ---------------------------------------------------------------------------
23+
24+
25+
def pytest_addoption(parser: pytest.Parser) -> None:
    """Register the ``--braintrust*`` command-line options.

    All options are added to a dedicated "braintrust" option group. The two
    boolean flags default to ``False``; the value-taking options default to
    ``None``.
    """
    option_group = parser.getgroup("braintrust", "Braintrust experiment tracking")

    # (flag, argparse action, default, help text) -- registered in this order.
    option_specs = (
        (
            "--braintrust",
            "store_true",
            False,
            "Enable Braintrust experiment tracking for @pytest.mark.braintrust tests.",
        ),
        (
            "--braintrust-project",
            "store",
            None,
            "Override the Braintrust project name for all tests (env: BRAINTRUST_PROJECT).",
        ),
        (
            "--braintrust-experiment",
            "store",
            None,
            "Override the experiment name (env: BRAINTRUST_EXPERIMENT).",
        ),
        (
            "--braintrust-api-key",
            "store",
            None,
            "Braintrust API key (env: BRAINTRUST_API_KEY).",
        ),
        (
            "--braintrust-no-summary",
            "store_true",
            False,
            "Suppress the experiment summary at the end of the session.",
        ),
    )

    for flag, action, default, help_text in option_specs:
        option_group.addoption(flag, action=action, default=default, help=help_text)
57+
58+
59+
def pytest_configure(config: pytest.Config) -> None:
    """Register the ``braintrust`` marker and, if enabled, the tracking plugin.

    The marker line is added unconditionally so ``@pytest.mark.braintrust``
    never produces an "unknown marker" warning. The hook-implementing
    ``BraintrustPytestPlugin`` is only registered when ``--braintrust`` is set.
    """
    marker_description = (
        "braintrust: mark test for Braintrust experiment tracking. "
        "Optional kwargs: project, input, expected, tags, metadata."
    )
    config.addinivalue_line("markers", marker_description)

    if not config.getoption("--braintrust", default=False):
        return

    import os

    # The CLI flag wins over the environment; whichever key is found is
    # written back to the environment.
    resolved_key = config.getoption("--braintrust-api-key")
    if not resolved_key:
        resolved_key = os.environ.get("BRAINTRUST_API_KEY")
    if resolved_key:
        os.environ["BRAINTRUST_API_KEY"] = resolved_key

    config.pluginmanager.register(BraintrustPytestPlugin(config), "braintrust-plugin")
77+
78+
79+
# ---------------------------------------------------------------------------
80+
# braintrust_span fixture (always available)
81+
# ---------------------------------------------------------------------------
82+
83+
84+
@pytest.fixture
def braintrust_span(request: pytest.FixtureRequest) -> Span:
    """Provide the :class:`~braintrust.logger.Span` attached to this test.

    The span is read from the ``_braintrust_span`` attribute of the
    requesting test node. If the node carries no such attribute,
    ``NOOP_SPAN`` from ``braintrust.logger`` is returned instead.
    """
    from braintrust.logger import NOOP_SPAN

    test_node = request.node
    try:
        return test_node._braintrust_span
    except AttributeError:
        # No span was attached to this node; fall back to the no-op span.
        return NOOP_SPAN
94+
95+
96+
# ---------------------------------------------------------------------------
97+
# Plugin class (registered only when --braintrust is passed)
98+
# ---------------------------------------------------------------------------
99+
100+
101+
class BraintrustPytestPlugin:
    """Hook implementations that record marked tests as Braintrust spans.

    Registered on the plugin manager only when ``--braintrust`` is active.
    For every test carrying the ``braintrust`` marker, a span is started in
    the setup phase, the call-phase report is captured, and a ``pass`` score
    (plus the failure text, if any) is logged in the teardown phase.
    """

    def __init__(self, config: pytest.Config) -> None:
        """Read project/experiment overrides from the CLI or the environment.

        Args:
            config: The active pytest configuration object.
        """
        import os

        self._config = config

        # The option help advertises BRAINTRUST_PROJECT and
        # BRAINTRUST_EXPERIMENT as fallbacks, so honor them when the flag is
        # absent. Empty strings are normalized to None.
        self._cli_project: str | None = (
            config.getoption("--braintrust-project") or os.environ.get("BRAINTRUST_PROJECT") or None
        )
        self._cli_experiment: str | None = (
            config.getoption("--braintrust-experiment") or os.environ.get("BRAINTRUST_EXPERIMENT") or None
        )

        # Keyed by experiment group key (module path or project override).
        self.experiments: dict[str, Any] = {}

    # -- helpers ------------------------------------------------------------

    def _get_experiment_key(self, item: pytest.Item) -> str:
        """Determine the experiment grouping key for *item*.

        Precedence: global project override, then the ``project`` kwarg of the
        closest ``braintrust`` marker, then the test module's name, and finally
        the file part of the node id.
        """
        # Global override applies to every test.
        if self._cli_project:
            return self._cli_project

        # Per-test / per-class marker override.
        marker = item.get_closest_marker("braintrust")
        if marker and marker.kwargs.get("project"):
            return marker.kwargs["project"]

        # Default: module name. Use getattr so that an item without a
        # ``module`` attribute falls through to the nodeid instead of raising
        # AttributeError.
        module = getattr(item, "module", None)
        if module:
            return module.__name__
        return item.nodeid.split("::")[0]

    def _get_or_create_experiment(self, key: str) -> Any:
        """Return the experiment for *key*, creating it on first use."""
        if key not in self.experiments:
            import braintrust

            self.experiments[key] = braintrust.init(project=key, experiment=self._cli_experiment)
        return self.experiments[key]

    def _collect_auto_input(self, item: pytest.Item) -> dict[str, Any] | None:
        """Return the item's parametrize arguments as a dict, or ``None``."""
        if hasattr(item, "callspec"):
            return dict(item.callspec.params)
        return None

    # -- hooks --------------------------------------------------------------

    @pytest.hookimpl(tryfirst=True)
    def pytest_runtest_setup(self, item: pytest.Item) -> None:
        """Start a span for a marked test and log its static marker data."""
        marker = item.get_closest_marker("braintrust")
        if marker is None:
            return

        from braintrust.span_types import SpanTypeAttribute

        key = self._get_experiment_key(item)
        exp = self._get_or_create_experiment(key)

        span = exp.start_span(name=item.name, type=SpanTypeAttribute.EVAL)

        # Collect all static marker data into a single log call.
        marker_kwargs: dict[str, Any] = {}

        # An explicit marker ``input`` takes precedence over parametrize args.
        marker_input = marker.kwargs.get("input")
        auto_input = self._collect_auto_input(item)
        if marker_input is not None:
            marker_kwargs["input"] = marker_input
        elif auto_input is not None:
            marker_kwargs["input"] = auto_input

        for field in ("expected", "metadata", "tags"):
            value = marker.kwargs.get(field)
            if value is not None:
                marker_kwargs[field] = value

        if marker_kwargs:
            span.log(**marker_kwargs)

        item._braintrust_span = span  # type: ignore[attr-defined]
        item._braintrust_experiment_key = key  # type: ignore[attr-defined]

        span.set_current()

    @pytest.hookimpl(hookwrapper=True)
    def pytest_runtest_makereport(self, item: pytest.Item, call: pytest.CallInfo) -> None:  # type: ignore[type-arg]
        """Stash the call-phase report on the item for use during teardown."""
        outcome = yield
        report = outcome.get_result()
        if call.when == "call":
            item._braintrust_report = report  # type: ignore[attr-defined]

    @pytest.hookimpl(trylast=True)
    def pytest_runtest_teardown(self, item: pytest.Item) -> None:
        """Log the pass/fail score for the item's span, then close the span."""
        span: Span | None = getattr(item, "_braintrust_span", None)
        if span is None:
            return

        try:
            # With no call-phase report stored, the test counts as not passed.
            report = getattr(item, "_braintrust_report", None)
            passed = report.passed if report else False

            span.log(scores={"pass": 1.0 if passed else 0.0})

            if report and report.failed:
                error_str = report.longreprtext if hasattr(report, "longreprtext") else str(report.longrepr)
                span.log(error=error_str)
        finally:
            # Always close and deactivate the span, even if logging raised,
            # so it does not stay current for later tests.
            try:
                span.end()
            finally:
                span.unset_current()

    def pytest_sessionfinish(self, session: pytest.Session) -> None:
        """Summarize and flush every experiment created during the session.

        Failures are printed rather than raised. The summary is printed
        unless ``--braintrust-no-summary`` was given.
        """
        show_summary = not self._config.getoption("--braintrust-no-summary", default=False)

        for exp in self.experiments.values():
            try:
                summary = exp.summarize()
                if show_summary:
                    print(summary)
            except Exception:
                traceback.print_exc()

            # Flush separately so a failed summary does not skip the flush.
            try:
                exp.flush()
            except Exception:
                traceback.print_exc()

0 commit comments

Comments
 (0)