braintrustdata · CLowbrow · Jan 21, 2026 · Nov 9, 2025 · Nov 9, 2025 · Nov 10, 2025
diff --git a/integrations/openai-agents-js/src/index.ts b/integrations/openai-agents-js/src/index.ts
@@ -382,7 +382,8 @@ export class OpenAIAgentsTraceProcessor {
       if (!data.metrics.completion_tokens && usage.completionTokens)
         data.metrics.completion_tokens = usage.completionTokens;
       if (usage.input_tokens_details?.cached_tokens != null)
-        data.metrics.prompt_cached_tokens = usage.input_tokens_details.cached_tokens;
+        data.metrics.prompt_cached_tokens =
+          usage.input_tokens_details.cached_tokens;
     }
 
     return data;

diff --git a/integrations/openai-agents-js/src/openai-agents-integration.test.ts b/integrations/openai-agents-js/src/openai-agents-integration.test.ts
@@ -908,7 +908,7 @@ describe(
               output_tokens: 50,
               total_tokens: 150,
               input_tokens_details: {
-                cached_tokens: 80,  // check for this later
+                cached_tokens: 80, // check for this later
               },
             },
           },
@@ -934,9 +934,17 @@ describe(
       const metrics = (responseSpanLog as any).metrics;
       assert.ok(metrics, "Response span should have metrics");
       assert.equal(metrics.prompt_tokens, 100, "Should have prompt_tokens");
-      assert.equal(metrics.completion_tokens, 50, "Should have completion_tokens");
+      assert.equal(
+        metrics.completion_tokens,
+        50,
+        "Should have completion_tokens",
+      );
       assert.equal(metrics.tokens, 150, "Should have total tokens");
-      assert.equal(metrics.prompt_cached_tokens, 80, "Should extract cached_tokens to prompt_cached_tokens");
+      assert.equal(
+        metrics.prompt_cached_tokens,
+        80,
+        "Should extract cached_tokens to prompt_cached_tokens",
+      );
     });
 
     test("Response span handles zero cached tokens correctly", async () => {
@@ -965,7 +973,7 @@ describe(
               input_tokens: 100,
               output_tokens: 50,
               input_tokens_details: {
-                cached_tokens: 0,  // Zero is a valid value
+                cached_tokens: 0, // Zero is a valid value
               },
             },
           },
@@ -977,7 +985,9 @@ describe(
       await processor.onSpanEnd(responseSpan);
 
       const spans = await backgroundLogger.drain();
-      const responseSpanLog = spans.find((s: any) => s.span_attributes?.type === "llm");
+      const responseSpanLog = spans.find(
+        (s: any) => s.span_attributes?.type === "llm",
+      );
       const metrics = (responseSpanLog as any).metrics;
 
       // Zero should be logged, not skipped
@@ -1024,11 +1034,16 @@ describe(
       await processor.onSpanEnd(responseSpan);
 
       const spans = await backgroundLogger.drain();
-      const responseSpanLog = spans.find((s: any) => s.span_attributes?.type === "llm");
+      const responseSpanLog = spans.find(
+        (s: any) => s.span_attributes?.type === "llm",
+      );
       const metrics = (responseSpanLog as any).metrics;
 
       // Should not have prompt_cached_tokens if not present in usage
-      assert.isUndefined(metrics.prompt_cached_tokens, "Should not add prompt_cached_tokens if not in usage");
+      assert.isUndefined(
+        metrics.prompt_cached_tokens,
+        "Should not add prompt_cached_tokens if not in usage",
+      );
     });
 
     test("Generation span extracts cached tokens from usage", async () => {
@@ -1060,7 +1075,7 @@ describe(
             output_tokens: 75,
             total_tokens: 275,
             input_tokens_details: {
-              cached_tokens: 150,  // Test Generation span extraction
+              cached_tokens: 150, // Test Generation span extraction
             },
           },
         },
@@ -1080,8 +1095,16 @@ describe(
       const metrics = (generationSpanLog as any).metrics;
       assert.ok(metrics, "Generation span should have metrics");
       assert.equal(metrics.prompt_tokens, 200, "Should have prompt_tokens");
-      assert.equal(metrics.completion_tokens, 75, "Should have completion_tokens");
-      assert.equal(metrics.prompt_cached_tokens, 150, "Should extract cached_tokens from Generation span");
+      assert.equal(
+        metrics.completion_tokens,
+        75,
+        "Should have completion_tokens",
+      );
+      assert.equal(
+        metrics.prompt_cached_tokens,
+        150,
+        "Should extract cached_tokens from Generation span",
+      );
     });
   },
 );
diff --git a/py/examples/evals/eval_example.py b/py/examples/evals/eval_example.py
@@ -1,12 +1,64 @@
+import json
+
 from braintrust import Eval
 
 NUM_EXAMPLES = 10
 
 
-def exact_match_scorer(input, output, expected):
-    if expected is None:
-        return 0.0
-    return 1.0 if output == expected else 0.0
+async def exact_match_scorer(input, output, expected, trace=None):
+    """Score exact matches and print the spans of the trace, if one is given.
+
+    Returns 1.0 when `expected` is not None and equals `output`, otherwise 0.0.
+    Errors raised while fetching or printing spans are reported with a traceback
+    and do not change the score; `trace.get_configuration()` errors propagate.
+    """
+    rule = "=" * 80
+    score = 1.0 if expected is not None and output == expected else 0.0
+
+    def preview(value, limit=100):
+        # JSON-encode a span field, clipping anything longer than `limit` characters.
+        text = json.dumps(value)
+        return text if len(text) <= limit else text[:limit] + "..."
+
+    if not trace:
+        print(f"⚠️  No trace available for input: {input}")
+        return score
+
+    print("\n" + rule)
+    print(f"🔍 TRACE INFO for input: {input}")
+    print(rule)
+
+    config = trace.get_configuration()
+    print("\n📋 Configuration:")
+    print(f"  Object Type: {config.get('objectType')}")
+    print(f"  Object ID:   {config.get('objectId')}")
+    print(f"  Root Span:   {config.get('rootSpanId')}")
+
+    try:
+        spans = await trace.get_spans()
+        print(f"\n✨ Found {len(spans)} spans:")
+        print("-" * 80)
+
+        for index, span in enumerate(spans, 1):
+            print(f"\n  Span {index}:")
+            print(f"    ID:         {span.span_id}")
+            attrs = span.span_attributes or {}
+            print(f"    Type:       {attrs.get('type', 'N/A')}")
+            print(f"    Name:       {attrs.get('name', 'N/A')}")
+            if span.input:
+                print(f"    Input:      {preview(span.input)}")
+            if span.output:
+                print(f"    Output:     {preview(span.output)}")
+            if span.metadata:
+                print(f"    Metadata:   {list(span.metadata.keys())}")
+
+        print("\n" + rule + "\n")
+    except Exception as e:
+        print(f"\n⚠️  Error fetching spans: {e}")
+        import traceback
+        traceback.print_exc()
+
+    return score
 
 
 def data_fn():

diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py
@@ -1280,6 +1280,29 @@ async def _run_evaluator_internal(
     filters: list[Filter],
     stream: Callable[[SSEProgressEvent], None] | None = None,
     state: BraintrustState | None = None,
+):
+    # Start span cache for this eval (it's disabled by default to avoid temp files outside of evals)
+    if state is None:
+        from braintrust.logger import _internal_get_global_state
+
+        state = _internal_get_global_state()
+
+    state.span_cache.start()
+    try:
+        return await _run_evaluator_internal_impl(experiment, evaluator, position, filters, stream, state)
+    finally:
+        # Clean up disk-based span cache after eval completes and stop caching
+        state.span_cache.dispose()
+        state.span_cache.stop()
+
+
+async def _run_evaluator_internal_impl(
+    experiment,
+    evaluator: Evaluator,
+    position: int | None,
+    filters: list[Filter],
+    stream: Callable[[SSEProgressEvent], None] | None = None,
+    state: BraintrustState | None = None,
 ):
     event_loop = asyncio.get_event_loop()
 
@@ -1290,11 +1313,13 @@ async def await_or_run_scorer(root_span, scorer, name, **kwargs):
             {**parent_propagated},
             {"span_attributes": {"purpose": "scorer"}},
         )
+        # Strip trace from logged input - it's internal plumbing that shouldn't appear in spans
+        logged_input = {k: v for k, v in kwargs.items() if k != "trace"}
         with root_span.start_span(
             name=name,
             span_attributes={"type": SpanTypeAttribute.SCORE, "purpose": "scorer"},
             propagated_event=merged_propagated,
-            input=dict(**kwargs),
+            input=logged_input,
         ) as span:
             score = scorer
             if hasattr(scorer, "eval_async"):
@@ -1415,6 +1440,77 @@ def report_progress(event: TaskProgressEvent):
                 tags = hooks.tags if hooks.tags else None
                 root_span.log(output=output, metadata=metadata, tags=tags)
 
+                # Create trace object for scorers
+                from braintrust.trace import LocalTrace
+
+                async def ensure_spans_flushed():
+                    """Flush native Braintrust spans via the loop's executor, then OTEL spans."""
+                    loop = asyncio.get_event_loop()
+                    if experiment:
+                        # An experiment, when present, is flushed through its own state.
+                        await loop.run_in_executor(None, lambda: experiment.state.flush())
+                    elif state:
+                        await loop.run_in_executor(None, lambda: state.flush())
+                    else:
+                        from braintrust.logger import flush as flush_logger
+
+                        await loop.run_in_executor(None, flush_logger)
+
+                    # Also flush OTEL spans; this step is skipped when `state` is falsy.
+                    if state:
+                        await state.flush_otel()
+
+                experiment_id = None
+                if experiment:
+                    try:
+                        experiment_id = experiment.id
+                    except Exception:  # not bare: let KeyboardInterrupt/SystemExit/cancellation propagate
+                        experiment_id = None
+
+                trace = None
+                if state or experiment:
+                    # Get the state to use
+                    trace_state = state
+                    if not trace_state and experiment:
+                        trace_state = experiment.state
+                    if not trace_state:
+                        # Fall back to global state
+                        from braintrust.logger import _internal_get_global_state
+
+                        trace_state = _internal_get_global_state()
+
+                    # Access root_span_id from the concrete SpanImpl instance
+                    # The Span interface doesn't expose this but SpanImpl has it
+                    root_span_id_value = getattr(root_span, "root_span_id", root_span.id)
+
+                    # Check if there's a parent in the context to determine object_type and object_id
+                    from braintrust.span_identifier_v3 import SpanComponentsV3, span_object_type_v3_to_typed_string
+
+                    parent_str = trace_state.current_parent.get()
+                    parent_components = None
+                    if parent_str:
+                        try:
+                            parent_components = SpanComponentsV3.from_str(parent_str)
+                        except Exception:
+                            # If parsing fails, parent_components stays None
+                            pass
+
+                    # Determine object_type and object_id based on parent or experiment
+                    if parent_components:
+                        trace_object_type = span_object_type_v3_to_typed_string(parent_components.object_type)
+                        trace_object_id = parent_components.object_id or ""
+                    else:
+                        trace_object_type = "experiment"
+                        trace_object_id = experiment_id or ""
+
+                    trace = LocalTrace(
+                        object_type=trace_object_type,
+                        object_id=trace_object_id,
+                        root_span_id=root_span_id_value,
+                        ensure_spans_flushed=ensure_spans_flushed,
+                        state=trace_state,
+                    )
+
                 score_promises = [
                     asyncio.create_task(
                         await_or_run_scorer(
@@ -1426,6 +1522,7 @@ def report_progress(event: TaskProgressEvent):
                                 "expected": datum.expected,
                                 "metadata": metadata,
                                 "output": output,
+                                "trace": trace,
                             },
                         )
                     )

diff --git a/py/src/braintrust/functions/invoke.py b/py/src/braintrust/functions/invoke.py
@@ -3,7 +3,7 @@
 from sseclient import SSEClient
 
 from .._generated_types import FunctionTypeEnum
-from ..logger import Exportable, get_span_parent_object, login, proxy_conn
+from ..logger import Exportable, _internal_get_global_state, get_span_parent_object, login, proxy_conn
 from ..util import response_raise_for_status
 from .constants import INVOKE_API_VERSION
 from .stream import BraintrustInvokeError, BraintrustStream
@@ -243,6 +243,8 @@ def init_function(project_name: str, slug: str, version: str | None = None):
     :param version: Optional version of the function to use. Defaults to latest.
     :return: A function that can be used as a task or scorer.
     """
+    # Disable span cache since remote function spans won't be in the local cache
+    _internal_get_global_state().span_cache.disable()
 
     def f(*args: Any, **kwargs: Any) -> Any:
         if len(args) > 0:

diff --git a/py/src/braintrust/functions/test_invoke.py b/py/src/braintrust/functions/test_invoke.py
@@ -0,0 +1,61 @@
+"""Tests for the invoke module, particularly init_function."""
+
+
+from braintrust.functions.invoke import init_function
+from braintrust.logger import _internal_get_global_state, _internal_reset_global_state
+
+
+class TestInitFunction:
+    """Checks the callable built by init_function and its effect on the span cache."""
+
+    def setup_method(self):
+        """Give every test a freshly reset global state."""
+        _internal_reset_global_state()
+
+    def teardown_method(self):
+        """Reset the global state again so nothing leaks into later tests."""
+        _internal_reset_global_state()
+
+    def test_init_function_disables_span_cache(self):
+        """init_function turns a started span cache back off."""
+        cache = _internal_get_global_state().span_cache
+
+        # After a reset the cache starts out disabled.
+        assert cache.disabled is True
+
+        # start() enables it, which is what the eval runner does before an eval.
+        cache.start()
+        assert cache.disabled is False
+
+        # Building the remote function must switch it off again.
+        fn = init_function("test-project", "test-function")
+
+        # The cache is off and the callable is named after project, slug and version.
+        assert cache.disabled is True
+        assert fn.__name__ == "init_function-test-project-test-function-latest"
+
+    def test_init_function_with_version(self):
+        """An explicit version becomes the last component of the callable's name."""
+        fn = init_function("my-project", "my-scorer", version="v1")
+        assert fn.__name__ == "init_function-my-project-my-scorer-v1"
+
+    def test_init_function_without_version_uses_latest(self):
+        """Without a version the callable's name ends in 'latest'."""
+        fn = init_function("my-project", "my-scorer")
+        assert fn.__name__ == "init_function-my-project-my-scorer-latest"
+
+    def test_init_function_permanently_disables_cache(self):
+        """Once init_function has disabled the cache, start() no longer enables it."""
+        cache = _internal_get_global_state().span_cache
+
+        # Begin from an enabled cache.
+        cache.start()
+        assert cache.disabled is False
+
+        # init_function disables it...
+        init_function("test-project", "test-function")
+        assert cache.disabled is True
+
+        # ...and a later start() leaves it disabled.
+        cache.start()
+        assert cache.disabled is True