Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
98 commits
Select commit Hold shift + click to select a range
9d8f149
kind of works
Nov 9, 2025
95c21c8
trace scoring
Nov 9, 2025
4036939
better api
Nov 10, 2025
d02704a
more changes
Nov 11, 2025
942b49c
better api
Nov 11, 2025
64e1070
Merge branch 'main' into alex/trace-in-scorer
Nov 20, 2025
e734a6e
make trace context flush before fetching
Nov 21, 2025
355cf55
Merge branch 'main' into alex/trace-in-scorer
Dec 1, 2025
fda9d29
rename
Dec 1, 2025
fcd8f91
jsdoc
Dec 1, 2025
f0f93af
Merge branch 'main' into alex/trace-in-scorer
Dec 15, 2025
b13ee06
cache v1
Dec 16, 2025
c0035f6
tmp file
Dec 16, 2025
77b26a9
flag
Dec 16, 2025
59ee9b8
bump vers to fix test
Dec 16, 2025
4c1bc79
major bump
Dec 16, 2025
2eeeb46
disable local cache
Dec 16, 2025
30e5217
otel support?
Dec 16, 2025
f67f9e8
turn off cache when otel is used
Dec 16, 2025
7807091
remove console.log
Dec 17, 2025
bc236ef
sensible new version
Dec 17, 2025
9e864e2
Merge branch 'main' into alex/trace-in-scorer
Dec 17, 2025
efea2cb
fix build
Dec 17, 2025
b5d117b
try to fix web builds
Dec 17, 2025
a34741d
don't pass trace to scoring args
Dec 17, 2025
59f6c7b
Merge branch 'main' into alex/trace-in-scorer
Dec 19, 2025
b1dc350
pass state into the trace object
Dec 26, 2025
f67aba5
get passed in state
Dec 26, 2025
dcc7dd7
make cache writes not block
Dec 26, 2025
7cf3a5a
remove trace re-export
Dec 26, 2025
0300f07
fix test
Dec 26, 2025
469fa20
extend object fetcher
Dec 26, 2025
5295cd4
refactors for objects
Dec 26, 2025
b5b3511
state argument for init function
Dec 29, 2025
c314138
forgot doc
Dec 29, 2025
8eb6d49
Merge branch 'main' into alex/trace-in-scorer
CLowbrow Dec 29, 2025
3ff1c5d
evaluator doesn't always have state
Dec 29, 2025
b252963
console
Dec 29, 2025
cd7d04e
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Dec 30, 2025
3b4653a
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Dec 31, 2025
56c5bf9
export trace
ankrgyl Dec 31, 2025
38dcd00
convert Trace to be an interface
ankrgyl Jan 1, 2026
087e2a2
export more stuff
ankrgyl Jan 1, 2026
ddccd08
fix
ankrgyl Jan 2, 2026
5ca164f
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 3, 2026
4c789a0
use span_attributes.purpose
ankrgyl Jan 3, 2026
ca8b861
fix
ankrgyl Jan 3, 2026
0fc8400
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 9, 2026
c5e43fc
only turn cache on for evals
Jan 10, 2026
863b5e6
get rid of syncwrite
Jan 12, 2026
927df14
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 12, 2026
7992b74
factor out cached span fetcher
ankrgyl Jan 13, 2026
e721910
spancache cleanup
Jan 13, 2026
98ae957
init cache in inner eval method
Jan 13, 2026
db9ad2a
handle parallel evals
Jan 13, 2026
763dab6
belt and suspenders
Jan 13, 2026
3fca8aa
remove logs
Jan 13, 2026
6324d80
use mergeDict
Jan 13, 2026
53e6380
cleanup
Jan 13, 2026
ebf9524
don't crash
Jan 14, 2026
91dcaa1
PYTHON
Jan 14, 2026
bc1f473
fix bundler test
cpinn Jan 14, 2026
f7a8beb
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 15, 2026
9cec760
Merge branch 'main' into alex/trace-in-scorer-python
ankrgyl Jan 15, 2026
d2226f7
ag fixes
ankrgyl Jan 14, 2026
d00c30c
fix invoke
ankrgyl Jan 14, 2026
a53b21a
snapshot
ankrgyl Jan 15, 2026
d0c375c
Revert "snapshot"
ankrgyl Jan 15, 2026
9eb327e
Revert "fix invoke"
ankrgyl Jan 15, 2026
c27c538
fix json serializability
ankrgyl Jan 15, 2026
25322d2
Merge branch 'alex/trace-in-scorer' into both-trace-scorers
ankrgyl Jan 15, 2026
62e8955
Merge branch 'alex/trace-in-scorer-python' into both-trace-scorers
ankrgyl Jan 15, 2026
c53ff50
rename config fields
Jan 15, 2026
e83fcb1
Merge branch 'main' into alex/trace-in-scorer
Jan 15, 2026
e43c61e
Merge branch 'caitlin/fix-bundler-tes' into alex/trace-in-scorer
Jan 15, 2026
f2ae38f
Merge branch 'main' into alex/trace-in-scorer
Jan 15, 2026
c505f2a
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 16, 2026
26e0658
Merge branch 'main' into alex/trace-in-scorer-python
ankrgyl Jan 16, 2026
61ba617
Merge branch 'alex/trace-in-scorer' into both-trace-scorers
ankrgyl Jan 16, 2026
28cfbe8
Merge branch 'alex/trace-in-scorer-python' into both-trace-scorers
ankrgyl Jan 16, 2026
1314573
wip otel stuff
Jan 16, 2026
67abadc
add a toJSON method
ankrgyl Jan 16, 2026
a6d1dd4
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 16, 2026
499e6fc
Merge branch 'main' into alex/trace-in-scorer-python
ankrgyl Jan 16, 2026
c630564
Merge branch 'alex/trace-in-scorer' into both-trace-scorers
ankrgyl Jan 16, 2026
d74224f
Merge branch 'alex/trace-in-scorer-python' into both-trace-scorers
ankrgyl Jan 16, 2026
5c18fe4
init-dataset-with-id (#1276)
ankrgyl Jan 16, 2026
e71bf48
Merge branch 'main' into alex/trace-in-scorer-python
ankrgyl Jan 17, 2026
b99f266
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 17, 2026
4245dae
Merge branch 'alex/trace-in-scorer' into both-trace-scorers
ankrgyl Jan 17, 2026
c62769e
Merge branch 'alex/trace-in-scorer-python' into both-trace-scorers
ankrgyl Jan 17, 2026
4a935ca
Merge branch 'main' into alex/trace-in-scorer-python
ankrgyl Jan 20, 2026
5d5ab97
Merge branch 'main' into alex/trace-in-scorer-python
Jan 21, 2026
a7b13f8
fix python
Jan 21, 2026
8bb623d
python updates
Jan 21, 2026
7d3cad9
Merge branch 'main' into alex/trace-in-scorer-python
Jan 21, 2026
c97bf75
handle playground logs in py
Jan 21, 2026
a815eb5
lint
Jan 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion integrations/openai-agents-js/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,8 @@ export class OpenAIAgentsTraceProcessor {
if (!data.metrics.completion_tokens && usage.completionTokens)
data.metrics.completion_tokens = usage.completionTokens;
if (usage.input_tokens_details?.cached_tokens != null)
data.metrics.prompt_cached_tokens = usage.input_tokens_details.cached_tokens;
data.metrics.prompt_cached_tokens =
usage.input_tokens_details.cached_tokens;
}

return data;
Expand Down
43 changes: 33 additions & 10 deletions integrations/openai-agents-js/src/openai-agents-integration.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -908,7 +908,7 @@ describe(
output_tokens: 50,
total_tokens: 150,
input_tokens_details: {
cached_tokens: 80, // check for this later
cached_tokens: 80, // check for this later
},
},
},
Expand All @@ -934,9 +934,17 @@ describe(
const metrics = (responseSpanLog as any).metrics;
assert.ok(metrics, "Response span should have metrics");
assert.equal(metrics.prompt_tokens, 100, "Should have prompt_tokens");
assert.equal(metrics.completion_tokens, 50, "Should have completion_tokens");
assert.equal(
metrics.completion_tokens,
50,
"Should have completion_tokens",
);
assert.equal(metrics.tokens, 150, "Should have total tokens");
assert.equal(metrics.prompt_cached_tokens, 80, "Should extract cached_tokens to prompt_cached_tokens");
assert.equal(
metrics.prompt_cached_tokens,
80,
"Should extract cached_tokens to prompt_cached_tokens",
);
});

test("Response span handles zero cached tokens correctly", async () => {
Expand Down Expand Up @@ -965,7 +973,7 @@ describe(
input_tokens: 100,
output_tokens: 50,
input_tokens_details: {
cached_tokens: 0, // Zero is a valid value
cached_tokens: 0, // Zero is a valid value
},
},
},
Expand All @@ -977,7 +985,9 @@ describe(
await processor.onSpanEnd(responseSpan);

const spans = await backgroundLogger.drain();
const responseSpanLog = spans.find((s: any) => s.span_attributes?.type === "llm");
const responseSpanLog = spans.find(
(s: any) => s.span_attributes?.type === "llm",
);
const metrics = (responseSpanLog as any).metrics;

// Zero should be logged, not skipped
Expand Down Expand Up @@ -1024,11 +1034,16 @@ describe(
await processor.onSpanEnd(responseSpan);

const spans = await backgroundLogger.drain();
const responseSpanLog = spans.find((s: any) => s.span_attributes?.type === "llm");
const responseSpanLog = spans.find(
(s: any) => s.span_attributes?.type === "llm",
);
const metrics = (responseSpanLog as any).metrics;

// Should not have prompt_cached_tokens if not present in usage
assert.isUndefined(metrics.prompt_cached_tokens, "Should not add prompt_cached_tokens if not in usage");
assert.isUndefined(
metrics.prompt_cached_tokens,
"Should not add prompt_cached_tokens if not in usage",
);
});

test("Generation span extracts cached tokens from usage", async () => {
Expand Down Expand Up @@ -1060,7 +1075,7 @@ describe(
output_tokens: 75,
total_tokens: 275,
input_tokens_details: {
cached_tokens: 150, // Test Generation span extraction
cached_tokens: 150, // Test Generation span extraction
},
},
},
Expand All @@ -1080,8 +1095,16 @@ describe(
const metrics = (generationSpanLog as any).metrics;
assert.ok(metrics, "Generation span should have metrics");
assert.equal(metrics.prompt_tokens, 200, "Should have prompt_tokens");
assert.equal(metrics.completion_tokens, 75, "Should have completion_tokens");
assert.equal(metrics.prompt_cached_tokens, 150, "Should extract cached_tokens from Generation span");
assert.equal(
metrics.completion_tokens,
75,
"Should have completion_tokens",
);
assert.equal(
metrics.prompt_cached_tokens,
150,
"Should extract cached_tokens from Generation span",
);
});
},
);
60 changes: 56 additions & 4 deletions py/examples/evals/eval_example.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,64 @@
import json

from braintrust import Eval

NUM_EXAMPLES = 10


def exact_match_scorer(input, output, expected):
    """Score 1.0 when ``output`` equals ``expected``, otherwise 0.0.

    A missing expectation (``expected is None``) always scores 0.0, even
    when ``output`` is also ``None``. ``input`` is accepted but not used.
    """
    is_match = expected is not None and output == expected
    return 1.0 if is_match else 0.0
async def exact_match_scorer(input, output, expected, trace=None):
"""Async scorer that prints trace spans."""
score = 0.0
if expected is not None:
score = 1.0 if output == expected else 0.0

if trace:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As a user, why or when would `trace` ever be `None`?

print("\n" + "="*80)
print(f"🔍 TRACE INFO for input: {input}")
print("="*80)

# Print trace configuration
config = trace.get_configuration()
print(f"\n📋 Configuration:")
print(f" Object Type: {config.get('objectType')}")
print(f" Object ID: {config.get('objectId')}")
print(f" Root Span: {config.get('rootSpanId')}")

# Fetch and print spans
try:
spans = await trace.get_spans()
print(f"\n✨ Found {len(spans)} spans:")
print("-"*80)

for i, span in enumerate(spans, 1):
print(f"\n Span {i}:")
print(f" ID: {span.span_id}")
span_type = span.span_attributes.get('type', 'N/A') if span.span_attributes else 'N/A'
span_name = span.span_attributes.get('name', 'N/A') if span.span_attributes else 'N/A'
print(f" Type: {span_type}")
print(f" Name: {span_name}")

if span.input:
input_str = json.dumps(span.input)
if len(input_str) > 100:
input_str = input_str[:100] + "..."
print(f" Input: {input_str}")
if span.output:
output_str = json.dumps(span.output)
if len(output_str) > 100:
output_str = output_str[:100] + "..."
print(f" Output: {output_str}")
if span.metadata:
print(f" Metadata: {list(span.metadata.keys())}")

print("\n" + "="*80 + "\n")
except Exception as e:
print(f"\n⚠️ Error fetching spans: {e}")
import traceback
traceback.print_exc()
else:
print(f"⚠️ No trace available for input: {input}")

return score


def data_fn():
Expand Down
99 changes: 98 additions & 1 deletion py/src/braintrust/framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -1280,6 +1280,29 @@ async def _run_evaluator_internal(
filters: list[Filter],
stream: Callable[[SSEProgressEvent], None] | None = None,
state: BraintrustState | None = None,
):
# Start span cache for this eval (it's disabled by default to avoid temp files outside of evals)
if state is None:
from braintrust.logger import _internal_get_global_state

state = _internal_get_global_state()

state.span_cache.start()
try:
return await _run_evaluator_internal_impl(experiment, evaluator, position, filters, stream, state)
finally:
# Clean up disk-based span cache after eval completes and stop caching
state.span_cache.dispose()
state.span_cache.stop()


async def _run_evaluator_internal_impl(
experiment,
evaluator: Evaluator,
position: int | None,
filters: list[Filter],
stream: Callable[[SSEProgressEvent], None] | None = None,
state: BraintrustState | None = None,
):
event_loop = asyncio.get_event_loop()

Expand All @@ -1290,11 +1313,13 @@ async def await_or_run_scorer(root_span, scorer, name, **kwargs):
{**parent_propagated},
{"span_attributes": {"purpose": "scorer"}},
)
# Strip trace from logged input - it's internal plumbing that shouldn't appear in spans
logged_input = {k: v for k, v in kwargs.items() if k != "trace"}
with root_span.start_span(
name=name,
span_attributes={"type": SpanTypeAttribute.SCORE, "purpose": "scorer"},
propagated_event=merged_propagated,
input=dict(**kwargs),
input=logged_input,
) as span:
score = scorer
if hasattr(scorer, "eval_async"):
Expand Down Expand Up @@ -1415,6 +1440,77 @@ def report_progress(event: TaskProgressEvent):
tags = hooks.tags if hooks.tags else None
root_span.log(output=output, metadata=metadata, tags=tags)

# Create trace object for scorers
from braintrust.trace import LocalTrace
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we have a circular dependency here, so I'd move this import to the top of the file.


async def ensure_spans_flushed():
    """Flush pending spans before a scorer reads them back through the trace.

    The flush target is chosen from the enclosing scope: the experiment's
    state if an experiment is present, otherwise the supplied ``state``,
    otherwise the module-level ``braintrust.logger.flush``. The chosen
    flush runs in the event loop's default executor. When ``state`` is
    set, OTEL spans are flushed as well via ``state.flush_otel()``.
    """
    # Select the flush callable once instead of repeating the executor call
    # in every branch.
    if experiment:
        blocking_flush = experiment.state.flush
    elif state:
        blocking_flush = state.flush
    else:
        from braintrust.logger import flush as blocking_flush

    # This is a coroutine, so a loop is guaranteed to be running;
    # get_running_loop() is the documented accessor in that situation.
    # NOTE(review): flush is presumably blocking, hence the executor - confirm.
    await asyncio.get_running_loop().run_in_executor(None, blocking_flush)

    # Also flush OTEL spans if registered
    if state:
        await state.flush_otel()

experiment_id = None
if experiment:
try:
experiment_id = experiment.id
except:
experiment_id = None
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: this could be simplified to `experiment_id = getattr(experiment, 'id', None)`.


trace = None
if state or experiment:
# Get the state to use
trace_state = state
if not trace_state and experiment:
trace_state = experiment.state
if not trace_state:
# Fall back to global state
from braintrust.logger import _internal_get_global_state

trace_state = _internal_get_global_state()

# Access root_span_id from the concrete SpanImpl instance
# The Span interface doesn't expose this but SpanImpl has it
root_span_id_value = getattr(root_span, "root_span_id", root_span.id)

# Check if there's a parent in the context to determine object_type and object_id
from braintrust.span_identifier_v3 import SpanComponentsV3, span_object_type_v3_to_typed_string

parent_str = trace_state.current_parent.get()
parent_components = None
if parent_str:
try:
parent_components = SpanComponentsV3.from_str(parent_str)
except Exception:
# If parsing fails, parent_components stays None
pass

# Determine object_type and object_id based on parent or experiment
if parent_components:
trace_object_type = span_object_type_v3_to_typed_string(parent_components.object_type)
trace_object_id = parent_components.object_id or ""
else:
trace_object_type = "experiment"
trace_object_id = experiment_id or ""

trace = LocalTrace(
object_type=trace_object_type,
object_id=trace_object_id,
root_span_id=root_span_id_value,
ensure_spans_flushed=ensure_spans_flushed,
state=trace_state,
)

score_promises = [
asyncio.create_task(
await_or_run_scorer(
Expand All @@ -1426,6 +1522,7 @@ def report_progress(event: TaskProgressEvent):
"expected": datum.expected,
"metadata": metadata,
"output": output,
"trace": trace,
},
)
)
Expand Down
4 changes: 3 additions & 1 deletion py/src/braintrust/functions/invoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from sseclient import SSEClient

from .._generated_types import FunctionTypeEnum
from ..logger import Exportable, get_span_parent_object, login, proxy_conn
from ..logger import Exportable, _internal_get_global_state, get_span_parent_object, login, proxy_conn
from ..util import response_raise_for_status
from .constants import INVOKE_API_VERSION
from .stream import BraintrustInvokeError, BraintrustStream
Expand Down Expand Up @@ -243,6 +243,8 @@ def init_function(project_name: str, slug: str, version: str | None = None):
:param version: Optional version of the function to use. Defaults to latest.
:return: A function that can be used as a task or scorer.
"""
# Disable span cache since remote function spans won't be in the local cache
_internal_get_global_state().span_cache.disable()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may be too indirect. If the remote function's spans are not in the cache, what is the harm in checking the cache anyway?


def f(*args: Any, **kwargs: Any) -> Any:
if len(args) > 0:
Expand Down
61 changes: 61 additions & 0 deletions py/src/braintrust/functions/test_invoke.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Tests for the invoke module, particularly init_function."""


from braintrust.functions.invoke import init_function
from braintrust.logger import _internal_get_global_state, _internal_reset_global_state


class TestInitFunction:
"""Tests for init_function."""

def setup_method(self):
    """Give each test a fresh global Braintrust state before it runs."""
    _internal_reset_global_state()

def teardown_method(self):
    """Reset the global Braintrust state again once the test has finished."""
    _internal_reset_global_state()

def test_init_function_disables_span_cache(self):
"""Test that init_function disables the span cache."""
state = _internal_get_global_state()

# Cache should be disabled by default (it's only enabled during evals)
assert state.span_cache.disabled is True

# Enable the cache (simulating what happens during eval)
state.span_cache.start()
assert state.span_cache.disabled is False

# Call init_function
f = init_function("test-project", "test-function")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is a bit odd that loading a function causes the span cache to be disabled.


# Cache should now be disabled (init_function explicitly disables it)
assert state.span_cache.disabled is True
assert f.__name__ == "init_function-test-project-test-function-latest"

def test_init_function_with_version(self):
    """An explicit version shows up as the suffix of the returned function's name."""
    versioned_fn = init_function("my-project", "my-scorer", version="v1")
    expected_name = "init_function-my-project-my-scorer-v1"
    assert versioned_fn.__name__ == expected_name

def test_init_function_without_version_uses_latest(self):
    """Omitting the version yields a function name that ends in 'latest'."""
    unversioned_fn = init_function("my-project", "my-scorer")
    expected_name = "init_function-my-project-my-scorer-latest"
    assert unversioned_fn.__name__ == expected_name

def test_init_function_permanently_disables_cache(self):
    """After init_function runs, a later start() must not re-enable the span cache."""
    global_state = _internal_get_global_state()

    # Precondition: starting the cache turns it on.
    global_state.span_cache.start()
    assert global_state.span_cache.disabled is False

    # init_function switches the cache off...
    init_function("test-project", "test-function")
    assert global_state.span_cache.disabled is True

    # ...and a subsequent start() leaves it off.
    global_state.span_cache.start()
    assert global_state.span_cache.disabled is True
Loading
Loading