|
| 1 | +"""Braintrust pytest plugin. |
| 2 | +
|
| 3 | +Automatically tracks test results as Braintrust experiments when tests are |
| 4 | +decorated with ``@pytest.mark.braintrust`` and run with ``--braintrust``. |
| 5 | +
|
| 6 | +The plugin is registered via the ``pytest11`` entry point in ``setup.py`` |
| 7 | +so it auto-loads when braintrust is installed. |
| 8 | +""" |
| 9 | + |
| 10 | +from __future__ import annotations |
| 11 | + |
| 12 | +import traceback |
| 13 | +from typing import TYPE_CHECKING, Any |
| 14 | + |
| 15 | +import pytest |
| 16 | + |
| 17 | +if TYPE_CHECKING: |
| 18 | + from braintrust.logger import Span |
| 19 | + |
| 20 | +# --------------------------------------------------------------------------- |
| 21 | +# Marker registration & CLI options (always active) |
| 22 | +# --------------------------------------------------------------------------- |
| 23 | + |
| 24 | + |
def pytest_addoption(parser: pytest.Parser) -> None:
    """Register the ``--braintrust*`` command-line options.

    All options are added to a dedicated ``braintrust`` help group. Boolean
    switches default to ``False``; value-carrying options default to ``None``.
    """
    # Shared keyword sets for the two kinds of option used below.
    switch = {"action": "store_true", "default": False}
    value = {"action": "store", "default": None}

    # (flag, base kwargs, help text) in the order they appear in ``--help``.
    option_specs = (
        (
            "--braintrust",
            switch,
            "Enable Braintrust experiment tracking for @pytest.mark.braintrust tests.",
        ),
        (
            "--braintrust-project",
            value,
            "Override the Braintrust project name for all tests (env: BRAINTRUST_PROJECT).",
        ),
        (
            "--braintrust-experiment",
            value,
            "Override the experiment name (env: BRAINTRUST_EXPERIMENT).",
        ),
        (
            "--braintrust-api-key",
            value,
            "Braintrust API key (env: BRAINTRUST_API_KEY).",
        ),
        (
            "--braintrust-no-summary",
            switch,
            "Suppress the experiment summary at the end of the session.",
        ),
    )

    options = parser.getgroup("braintrust", "Braintrust experiment tracking")
    for flag, base_kwargs, help_text in option_specs:
        options.addoption(flag, help=help_text, **base_kwargs)
| 57 | + |
| 58 | + |
def pytest_configure(config: pytest.Config) -> None:
    """Register the ``braintrust`` marker and, if requested, the plugin.

    The marker line is added unconditionally so ``@pytest.mark.braintrust``
    never produces an "unknown marker" warning. Only when ``--braintrust`` is
    set does this export the API key (CLI value first, then the existing
    ``BRAINTRUST_API_KEY`` environment value) and register a
    ``BraintrustPytestPlugin`` instance under the name ``braintrust-plugin``.
    """
    marker_description = (
        "braintrust: mark test for Braintrust experiment tracking. "
        "Optional kwargs: project, input, expected, tags, metadata."
    )
    config.addinivalue_line("markers", marker_description)

    # Nothing else to do unless tracking was explicitly switched on.
    if not config.getoption("--braintrust", default=False):
        return

    from os import environ

    resolved_key = config.getoption("--braintrust-api-key") or environ.get("BRAINTRUST_API_KEY")
    if resolved_key:
        # Publish the key through the environment for code that reads it there.
        environ["BRAINTRUST_API_KEY"] = resolved_key

    config.pluginmanager.register(BraintrustPytestPlugin(config), "braintrust-plugin")
| 77 | + |
| 78 | + |
| 79 | +# --------------------------------------------------------------------------- |
| 80 | +# braintrust_span fixture (always available) |
| 81 | +# --------------------------------------------------------------------------- |
| 82 | + |
| 83 | + |
@pytest.fixture
def braintrust_span(request: pytest.FixtureRequest) -> Span:
    """Provide the :class:`~braintrust.logger.Span` attached to the current test.

    The span is read from the ``_braintrust_span`` attribute of the requesting
    node. If the node has no such attribute (for example when ``--braintrust``
    is not active), ``NOOP_SPAN`` from ``braintrust.logger`` is returned
    instead.
    """
    from braintrust.logger import NOOP_SPAN

    try:
        return request.node._braintrust_span
    except AttributeError:
        # No span was attached to this node: hand back the no-op span.
        return NOOP_SPAN
| 94 | + |
| 95 | + |
| 96 | +# --------------------------------------------------------------------------- |
| 97 | +# Plugin class (registered only when --braintrust is passed) |
| 98 | +# --------------------------------------------------------------------------- |
| 99 | + |
| 100 | + |
class BraintrustPytestPlugin:
    """Core hook implementation, registered on the plugin manager only when
    ``--braintrust`` is active.

    For every test carrying the ``braintrust`` marker the plugin opens a span
    on a lazily created experiment during setup, records the pass/fail score
    (and failure text) during teardown, and summarizes/flushes every
    experiment when the session finishes.
    """

    def __init__(self, config: pytest.Config) -> None:
        """Read project/experiment overrides from the CLI, then the environment.

        Args:
            config: The active pytest configuration object.
        """
        import os  # local import: the module does not import ``os`` at file level

        self._config = config
        # The option help advertises BRAINTRUST_PROJECT / BRAINTRUST_EXPERIMENT
        # as fallbacks, so honour them when the flag is absent. An explicit
        # CLI value always wins; empty strings are treated as "not set".
        self._cli_project: str | None = (
            config.getoption("--braintrust-project") or os.environ.get("BRAINTRUST_PROJECT") or None
        )
        self._cli_experiment: str | None = (
            config.getoption("--braintrust-experiment") or os.environ.get("BRAINTRUST_EXPERIMENT") or None
        )

        # Keyed by experiment group key (module path or project override).
        self.experiments: dict[str, Any] = {}

    # -- helpers ------------------------------------------------------------

    def _get_experiment_key(self, item: pytest.Item) -> str:
        """Determine the experiment grouping key for *item*.

        Precedence: global project override, then the ``project`` kwarg of the
        closest ``braintrust`` marker, then the test module's ``__name__``,
        and finally the file part of the node id.
        """
        # Global override applies to every test.
        if self._cli_project:
            return self._cli_project

        # Per-test / per-class marker override.
        marker = item.get_closest_marker("braintrust")
        marker_project = marker.kwargs.get("project") if marker else None
        if marker_project:
            return marker_project

        # Default: module name. Not every item exposes ``module``, so look it
        # up defensively and fall back to the file portion of the node id.
        module = getattr(item, "module", None)
        if module:
            return module.__name__
        return item.nodeid.split("::")[0]

    def _get_or_create_experiment(self, key: str) -> Any:
        """Return the experiment cached for *key*, creating it on first use.

        New experiments are created with ``braintrust.init`` using *key* as
        the project and the experiment-name override (possibly ``None``).
        """
        if key not in self.experiments:
            import braintrust

            self.experiments[key] = braintrust.init(project=key, experiment=self._cli_experiment)
        return self.experiments[key]

    def _collect_auto_input(self, item: pytest.Item) -> dict[str, Any] | None:
        """Return a copy of the item's parametrize arguments, or ``None``.

        ``None`` is returned when the item has no ``callspec`` attribute.
        """
        callspec = getattr(item, "callspec", None)
        if callspec is None:
            return None
        return dict(callspec.params)

    # -- hooks --------------------------------------------------------------

    @pytest.hookimpl(tryfirst=True)
    def pytest_runtest_setup(self, item: pytest.Item) -> None:
        """Open a span for a marked test and make it the current span.

        Unmarked tests are ignored. Static marker data (``input``,
        ``expected``, ``metadata``, ``tags``) is logged in a single call; when
        the marker supplies no ``input`` the parametrize arguments are used.
        The span and its experiment key are stored on *item* for the
        ``braintrust_span`` fixture and the teardown hook.
        """
        marker = item.get_closest_marker("braintrust")
        if marker is None:
            return

        from braintrust.span_types import SpanTypeAttribute

        key = self._get_experiment_key(item)
        exp = self._get_or_create_experiment(key)

        span = exp.start_span(name=item.name, type=SpanTypeAttribute.EVAL)

        # Collect all static marker data into a single log call.
        marker_kwargs: dict[str, Any] = {}

        # An explicit marker input takes precedence over parametrize args.
        marker_input = marker.kwargs.get("input")
        auto_input = self._collect_auto_input(item)
        if marker_input is not None:
            marker_kwargs["input"] = marker_input
        elif auto_input is not None:
            marker_kwargs["input"] = auto_input

        for field in ("expected", "metadata", "tags"):
            value = marker.kwargs.get(field)
            if value is not None:
                marker_kwargs[field] = value

        if marker_kwargs:
            span.log(**marker_kwargs)

        item._braintrust_span = span  # type: ignore[attr-defined]
        item._braintrust_experiment_key = key  # type: ignore[attr-defined]

        span.set_current()

    @pytest.hookimpl(hookwrapper=True)
    def pytest_runtest_makereport(self, item: pytest.Item, call: pytest.CallInfo) -> None:  # type: ignore[type-arg]
        """Remember the report that decides the test's outcome.

        The ``call``-phase report is always kept. A *failed* ``setup``-phase
        report is kept as well, because no ``call`` phase follows it and the
        teardown hook would otherwise record a zero score with no error text.
        """
        outcome = yield
        report = outcome.get_result()
        if call.when == "call" or (call.when == "setup" and report.failed):
            item._braintrust_report = report  # type: ignore[attr-defined]

    @pytest.hookimpl(trylast=True)
    def pytest_runtest_teardown(self, item: pytest.Item) -> None:
        """Log the pass score (and failure text), then close the span.

        Items without a stored span are ignored. The span is always ended and
        unset as current, even if logging raises.
        """
        span: Span | None = getattr(item, "_braintrust_span", None)
        if span is None:
            return

        report = getattr(item, "_braintrust_report", None)
        try:
            # NOTE(review): a test that is skipped after the setup hook ran has
            # no passing report and is therefore scored 0.0 -- confirm that
            # this is the intended treatment of skips.
            passed = report.passed if report else False
            span.log(scores={"pass": 1.0 if passed else 0.0})

            if report and report.failed:
                error_str = report.longreprtext if hasattr(report, "longreprtext") else str(report.longrepr)
                span.log(error=error_str)
        finally:
            # Never leave the span open or installed as current.
            try:
                span.end()
            finally:
                span.unset_current()

    def pytest_sessionfinish(self, session: pytest.Session) -> None:
        """Summarize and flush every experiment created during the session.

        The summary is printed unless ``--braintrust-no-summary`` was given.
        Reporting is best-effort: errors are printed as tracebacks instead of
        propagating, and a failing ``summarize()`` no longer prevents the
        experiment from being flushed.
        """
        show_summary = not self._config.getoption("--braintrust-no-summary", default=False)

        for exp in self.experiments.values():
            try:
                summary = exp.summarize()
                if show_summary:
                    print(summary)
            except Exception:
                traceback.print_exc()

            # Flush independently so buffered data is still sent when the
            # summary step failed.
            try:
                exp.flush()
            except Exception:
                traceback.print_exc()
0 commit comments