From 9d8f149bc1a3e3ab86003b44d1e793086336180e Mon Sep 17 00:00:00 2001 From: Alex Z Date: Sun, 9 Nov 2025 11:24:41 -0800 Subject: [PATCH 01/65] kind of works --- js/src/exports-node.ts | 1 + js/src/framework.ts | 22 ++++++++++ js/src/scorer-context.ts | 90 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+) create mode 100644 js/src/scorer-context.ts diff --git a/js/src/exports-node.ts b/js/src/exports-node.ts index 05b858387..b15e012ad 100644 --- a/js/src/exports-node.ts +++ b/js/src/exports-node.ts @@ -22,6 +22,7 @@ export { ReporterBody, Reporter, SpanContext, + ScorerContext, buildLocalSummary, reportFailures, runEvaluator, diff --git a/js/src/framework.ts b/js/src/framework.ts index 253c914ad..1b3b3d4dd 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -15,6 +15,8 @@ import chalk from "chalk"; import pluralize from "pluralize"; import { GenericFunction } from "./framework-types"; import { CodeFunction, CodePrompt } from "./framework2"; +import { ScorerContext } from "./scorer-context"; +export { ScorerContext } from "./scorer-context"; import { BaseMetadata, BraintrustState, @@ -161,6 +163,7 @@ export type EvalScorerArgs< Metadata extends BaseMetadata = DefaultMetadataType, > = EvalCase & { output: Output; + scorerContext?: ScorerContext; }; export type OneOrMoreScores = Score | number | null | Array; @@ -873,6 +876,18 @@ async function runEvaluatorInternal( ); progressReporter.start(evaluator.evalName, dataWithTrials.length); + + const experimentIdPromise: Promise | undefined = + experiment + ? (async () => { + try { + return await experiment.id; + } catch { + return undefined; + } + })() + : undefined; + interface EvalResult { // eslint-disable-next-line @typescript-eslint/no-explicit-any input: any; @@ -924,6 +939,12 @@ async function runEvaluatorInternal( }; const callback = async (rootSpan: Span) => { + const scorerContext = new ScorerContext({ + experimentId: experimentIdPromise + ? 
await experimentIdPromise + : undefined, + rootSpanId: rootSpan.rootSpanId, + }); let metadata: Record = { ...("metadata" in datum ? datum.metadata : {}), }; @@ -991,6 +1012,7 @@ async function runEvaluatorInternal( expected: "expected" in datum ? datum.expected : undefined, metadata, output, + scorerContext, }; const scoreResults = await Promise.all( evaluator.scores.map(async (score, score_idx) => { diff --git a/js/src/scorer-context.ts b/js/src/scorer-context.ts new file mode 100644 index 000000000..768b75d14 --- /dev/null +++ b/js/src/scorer-context.ts @@ -0,0 +1,90 @@ +import { _internalGetGlobalState } from "./logger"; + +const MAX_FETCH_RETRIES = 10; +const INITIAL_RETRY_DELAY_MS = 200; + +const sleep = (ms: number) => + new Promise((resolve) => { + setTimeout(resolve, ms); + }); + +export interface ScorerContextOptions { + experimentId?: string; + logsId?: string; + rootSpanId: string; +} + +/** + * Carries identifying information about the evaluation so scorers can perform + * richer logging or side effects. Additional behavior will be layered on top + * of this skeleton class later. + */ +export class ScorerContext { + // Store values privately so future helper methods can expose them safely. + private readonly experimentId?: string; + private readonly logsId?: string; + private readonly rootSpanId: string; + + constructor({ experimentId, logsId, rootSpanId }: ScorerContextOptions) { + console.log("Creating ScorerContext"); + this.experimentId = experimentId; + this.logsId = logsId; + this.rootSpanId = rootSpanId; + } + + getConfiguration() { + return { + experimentId: this.experimentId, + logsId: this.logsId, + rootSpanId: this.rootSpanId, + }; + } + + /** + * Fetch all rows for this root span from its parent experiment. + * Returns an empty array when no experiment is associated with the context. 
+ */ + async fetchRootSpanRows(): Promise { + if (!this.experimentId) { + return []; + } + + const state = _internalGetGlobalState(); + if (!state) { + return []; + } + + await state.login({}); + + const query = ` + from: experiment('${this.experimentId}') + | filter: root_span_id = '${this.rootSpanId}' + | select: * + `; + + for (let attempt = 0; attempt < MAX_FETCH_RETRIES; attempt++) { + const response = await state.apiConn().post( + "btql", + { + query, + use_columnstore: false, + brainstore_realtime: false, + }, + { headers: { "Accept-Encoding": "gzip" } }, + ); + + const payload = await response.json(); + const rows = payload?.data ?? []; + console.log(rows.length); + if (rows.length > 0 || attempt === MAX_FETCH_RETRIES - 1) { + return rows; + } + + const backoff = + INITIAL_RETRY_DELAY_MS * Math.pow(2, Math.min(attempt, 3)); + await sleep(backoff); + } + + return []; + } +} From 95c21c8dec58d34443a95822e82c7c068c912893 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Sun, 9 Nov 2025 11:31:47 -0800 Subject: [PATCH 02/65] trace scoring --- js/src/scorer-context.ts | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/js/src/scorer-context.ts b/js/src/scorer-context.ts index 768b75d14..e92a1451f 100644 --- a/js/src/scorer-context.ts +++ b/js/src/scorer-context.ts @@ -1,7 +1,7 @@ import { _internalGetGlobalState } from "./logger"; -const MAX_FETCH_RETRIES = 10; -const INITIAL_RETRY_DELAY_MS = 200; +const MAX_FETCH_RETRIES = 8; +const INITIAL_RETRY_DELAY_MS = 250; const sleep = (ms: number) => new Promise((resolve) => { @@ -26,7 +26,6 @@ export class ScorerContext { private readonly rootSpanId: string; constructor({ experimentId, logsId, rootSpanId }: ScorerContextOptions) { - console.log("Creating ScorerContext"); this.experimentId = experimentId; this.logsId = logsId; this.rootSpanId = rootSpanId; @@ -75,8 +74,13 @@ export class ScorerContext { const payload = await response.json(); const rows = payload?.data ?? 
[]; - console.log(rows.length); - if (rows.length > 0 || attempt === MAX_FETCH_RETRIES - 1) { + const freshness = payload?.freshness_state; + const isFresh = + freshness?.last_processed_xact_id != null && + freshness?.last_processed_xact_id === + freshness?.last_considered_xact_id; + + if ((rows.length > 0 && isFresh) || attempt === MAX_FETCH_RETRIES - 1) { return rows; } From 4036939cd5a3f133461def633908f66ecb9c3330 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Mon, 10 Nov 2025 14:55:52 -0800 Subject: [PATCH 03/65] better api --- js/src/scorer-context.ts | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/js/src/scorer-context.ts b/js/src/scorer-context.ts index e92a1451f..a1326d097 100644 --- a/js/src/scorer-context.ts +++ b/js/src/scorer-context.ts @@ -43,7 +43,7 @@ export class ScorerContext { * Fetch all rows for this root span from its parent experiment. * Returns an empty array when no experiment is associated with the context. */ - async fetchRootSpanRows(): Promise { + async getSpans({ spanType }: { spanType: string }): Promise { if (!this.experimentId) { return []; } @@ -57,7 +57,7 @@ export class ScorerContext { const query = ` from: experiment('${this.experimentId}') - | filter: root_span_id = '${this.rootSpanId}' + | filter: root_span_id = '${this.rootSpanId}' ${spanType ? 
`AND span_attributes.type = '${spanType}'` : ""} | select: * `; @@ -67,7 +67,7 @@ export class ScorerContext { { query, use_columnstore: false, - brainstore_realtime: false, + brainstore_realtime: true, }, { headers: { "Accept-Encoding": "gzip" } }, ); @@ -81,7 +81,16 @@ export class ScorerContext { freshness?.last_considered_xact_id; if ((rows.length > 0 && isFresh) || attempt === MAX_FETCH_RETRIES - 1) { - return rows; + return rows + .filter((row: any) => row.span_attributes?.type !== "score") + .map((row: any) => ({ + input: row.input, + output: row.output, + metadata: row.metadata, + span_id: row.span_id, + span_parents: row.span_parents, + span_attributes: row.span_attributes, + })); } const backoff = From d02704a4539dae862220a2d3106fbc03b6d119cf Mon Sep 17 00:00:00 2001 From: Alex Z Date: Mon, 10 Nov 2025 16:27:58 -0800 Subject: [PATCH 04/65] more changes --- js/src/scorer-context.ts | 70 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 2 deletions(-) diff --git a/js/src/scorer-context.ts b/js/src/scorer-context.ts index a1326d097..41d9f383a 100644 --- a/js/src/scorer-context.ts +++ b/js/src/scorer-context.ts @@ -1,4 +1,5 @@ import { _internalGetGlobalState } from "./logger"; +import { createHash } from "node:crypto"; const MAX_FETCH_RETRIES = 8; const INITIAL_RETRY_DELAY_MS = 250; @@ -14,6 +15,22 @@ export interface ScorerContextOptions { rootSpanId: string; } +function isObject(value: any): value is Record { + return value !== null && typeof value === "object" && !Array.isArray(value); +} + +const getMessageHash = ( + message: any, + hashCache: Map, +): string => { + const messageString = JSON.stringify(message); + const hashString = createHash("md5").update(messageString).digest("hex"); + + // Cache the result + hashCache.set(messageString, hashString); + return hashString; +}; + /** * Carries identifying information about the evaluation so scorers can perform * richer logging or side effects. 
Additional behavior will be layered on top @@ -43,7 +60,7 @@ export class ScorerContext { * Fetch all rows for this root span from its parent experiment. * Returns an empty array when no experiment is associated with the context. */ - async getSpans({ spanType }: { spanType: string }): Promise { + async getSpans({ spanType }: { spanType?: string } = {}): Promise { if (!this.experimentId) { return []; } @@ -59,6 +76,7 @@ export class ScorerContext { from: experiment('${this.experimentId}') | filter: root_span_id = '${this.rootSpanId}' ${spanType ? `AND span_attributes.type = '${spanType}'` : ""} | select: * + | sort: _xact_id asc `; for (let attempt = 0; attempt < MAX_FETCH_RETRIES; attempt++) { @@ -66,7 +84,7 @@ export class ScorerContext { "btql", { query, - use_columnstore: false, + use_columnstore: true, brainstore_realtime: true, }, { headers: { "Accept-Encoding": "gzip" } }, @@ -100,4 +118,52 @@ export class ScorerContext { return []; } + + async getThread() { + const spans = await this.getSpans({ spanType: "llm" }); + const hashCache = new Map(); + const messages: any[] = []; + const hashes = new Set(); + const addMessage = ( + rawMessage: any, + { skipDedupe = false }: { skipDedupe?: boolean } = {}, + ) => { + if (!isObject(rawMessage)) { + return; + } + const message = { ...rawMessage }; + const messageHash = getMessageHash(message, hashCache); + if (!skipDedupe && hashes.has(messageHash)) { + return; + } + messages.push(message); + hashes.add(messageHash); + }; + for (const span of spans) { + if (span.input instanceof Array) { + for (const message of span.input) { + addMessage(message); + } + } else if (isObject(span.input)) { + addMessage(span.input); + } else if (typeof span.input === "string") { + addMessage({ role: "user", content: span.input }); + } + + // Always include outputs + if (span.output instanceof Array) { + for (const message of span.output) { + addMessage(message, { skipDedupe: true }); + } + } else if (isObject(span.output)) { + 
addMessage(span.output, { skipDedupe: true }); + } else if (typeof span.output === "string") { + addMessage( + { role: "assistant", content: span.output }, + { skipDedupe: true }, + ); + } + } + return messages; + } } From 942b49ca3250f5de452c53011ba27ba1d831e8d7 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Mon, 10 Nov 2025 16:32:59 -0800 Subject: [PATCH 05/65] better api --- js/src/scorer-context.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/js/src/scorer-context.ts b/js/src/scorer-context.ts index 41d9f383a..348902bd9 100644 --- a/js/src/scorer-context.ts +++ b/js/src/scorer-context.ts @@ -60,7 +60,7 @@ export class ScorerContext { * Fetch all rows for this root span from its parent experiment. * Returns an empty array when no experiment is associated with the context. */ - async getSpans({ spanType }: { spanType?: string } = {}): Promise { + async getSpans({ spanType }: { spanType?: string[] } = {}): Promise { if (!this.experimentId) { return []; } @@ -74,7 +74,7 @@ export class ScorerContext { const query = ` from: experiment('${this.experimentId}') - | filter: root_span_id = '${this.rootSpanId}' ${spanType ? `AND span_attributes.type = '${spanType}'` : ""} + | filter: root_span_id = '${this.rootSpanId}' ${spanType ? 
`AND span_attributes.type IN ${JSON.stringify(spanType)}` : ""} | select: * | sort: _xact_id asc `; @@ -120,7 +120,7 @@ export class ScorerContext { } async getThread() { - const spans = await this.getSpans({ spanType: "llm" }); + const spans = await this.getSpans({ spanType: ["llm"] }); const hashCache = new Map(); const messages: any[] = []; const hashes = new Set(); From e734a6e63f293ceab7b8a77651c09364e84697a4 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Thu, 20 Nov 2025 16:24:22 -0800 Subject: [PATCH 06/65] make trace context flush before fetching --- js/src/framework.ts | 13 +++++++++++++ js/src/scorer-context.ts | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/js/src/framework.ts b/js/src/framework.ts index 1b3b3d4dd..c34f89226 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -939,11 +939,24 @@ async function runEvaluatorInternal( }; const callback = async (rootSpan: Span) => { + const ensureSpansFlushed = async () => { + if (experiment) { + await flush({ state: experiment.loggingState }); + return; + } + if (evaluator.state) { + await flush({ state: evaluator.state }); + return; + } + await flush(); + }; + const scorerContext = new ScorerContext({ experimentId: experimentIdPromise ? await experimentIdPromise : undefined, rootSpanId: rootSpan.rootSpanId, + ensureSpansFlushed, }); let metadata: Record = { ...("metadata" in datum ? 
datum.metadata : {}), diff --git a/js/src/scorer-context.ts b/js/src/scorer-context.ts index 348902bd9..142f334e9 100644 --- a/js/src/scorer-context.ts +++ b/js/src/scorer-context.ts @@ -13,6 +13,7 @@ export interface ScorerContextOptions { experimentId?: string; logsId?: string; rootSpanId: string; + ensureSpansFlushed?: () => Promise; } function isObject(value: any): value is Record { @@ -41,11 +42,20 @@ export class ScorerContext { private readonly experimentId?: string; private readonly logsId?: string; private readonly rootSpanId: string; - - constructor({ experimentId, logsId, rootSpanId }: ScorerContextOptions) { + private readonly ensureSpansFlushed?: () => Promise; + private spansFlushed = false; + private spansFlushPromise: Promise | null = null; + + constructor({ + experimentId, + logsId, + rootSpanId, + ensureSpansFlushed, + }: ScorerContextOptions) { this.experimentId = experimentId; this.logsId = logsId; this.rootSpanId = rootSpanId; + this.ensureSpansFlushed = ensureSpansFlushed; } getConfiguration() { @@ -65,6 +75,8 @@ export class ScorerContext { return []; } + await this.ensureSpansReady(); + const state = _internalGetGlobalState(); if (!state) { return []; @@ -166,4 +178,24 @@ export class ScorerContext { } return messages; } + + private async ensureSpansReady() { + if (this.spansFlushed || !this.ensureSpansFlushed) { + return; + } + + if (!this.spansFlushPromise) { + this.spansFlushPromise = this.ensureSpansFlushed().then( + () => { + this.spansFlushed = true; + }, + (err) => { + this.spansFlushPromise = null; + throw err; + }, + ); + } + + await this.spansFlushPromise; + } } From fda9d29b91b2f61dbf98d7d535978cf220f0ee42 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Mon, 1 Dec 2025 13:35:14 -0800 Subject: [PATCH 07/65] rename --- js/src/exports-node.ts | 2 +- js/src/framework.ts | 10 +++++----- js/src/{scorer-context.ts => trace.ts} | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) rename js/src/{scorer-context.ts => trace.ts} (98%) diff 
--git a/js/src/exports-node.ts b/js/src/exports-node.ts index c88e607c4..35e5923db 100644 --- a/js/src/exports-node.ts +++ b/js/src/exports-node.ts @@ -22,7 +22,7 @@ export { ReporterBody, Reporter, SpanContext, - ScorerContext, + Trace, buildLocalSummary, reportFailures, runEvaluator, diff --git a/js/src/framework.ts b/js/src/framework.ts index bc37b5cb6..8e9e09d8d 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -15,8 +15,8 @@ import chalk from "chalk"; import pluralize from "pluralize"; import { GenericFunction } from "./framework-types"; import { CodeFunction, CodePrompt } from "./framework2"; -import { ScorerContext } from "./scorer-context"; -export { ScorerContext } from "./scorer-context"; +import { Trace } from "./trace"; +export { Trace } from "./trace"; import { BaseMetadata, BraintrustState, @@ -163,7 +163,7 @@ export type EvalScorerArgs< Metadata extends BaseMetadata = DefaultMetadataType, > = EvalCase & { output: Output; - scorerContext?: ScorerContext; + trace?: Trace; }; export type OneOrMoreScores = Score | number | null | Array; @@ -991,7 +991,7 @@ async function runEvaluatorInternal( await flush(); }; - const scorerContext = new ScorerContext({ + const trace = new Trace({ experimentId: experimentIdPromise ? await experimentIdPromise : undefined, @@ -1065,7 +1065,7 @@ async function runEvaluatorInternal( expected: "expected" in datum ? 
datum.expected : undefined, metadata, output, - scorerContext, + trace, }; const scoreResults = await Promise.all( evaluator.scores.map(async (score, score_idx) => { diff --git a/js/src/scorer-context.ts b/js/src/trace.ts similarity index 98% rename from js/src/scorer-context.ts rename to js/src/trace.ts index 142f334e9..5a602e5f4 100644 --- a/js/src/scorer-context.ts +++ b/js/src/trace.ts @@ -9,7 +9,7 @@ const sleep = (ms: number) => setTimeout(resolve, ms); }); -export interface ScorerContextOptions { +export interface TraceOptions { experimentId?: string; logsId?: string; rootSpanId: string; @@ -37,7 +37,7 @@ const getMessageHash = ( * richer logging or side effects. Additional behavior will be layered on top * of this skeleton class later. */ -export class ScorerContext { +export class Trace { // Store values privately so future helper methods can expose them safely. private readonly experimentId?: string; private readonly logsId?: string; @@ -51,7 +51,7 @@ export class ScorerContext { logsId, rootSpanId, ensureSpansFlushed, - }: ScorerContextOptions) { + }: TraceOptions) { this.experimentId = experimentId; this.logsId = logsId; this.rootSpanId = rootSpanId; From fcd8f91a4c74663653daf34a1c0a84a2cf8d07e3 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Mon, 1 Dec 2025 13:38:54 -0800 Subject: [PATCH 08/65] jsdoc --- js/src/trace.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/js/src/trace.ts b/js/src/trace.ts index 5a602e5f4..0ab540486 100644 --- a/js/src/trace.ts +++ b/js/src/trace.ts @@ -131,6 +131,11 @@ export class Trace { return []; } + /** + * Fetch the thread of messages for this trace. + * + * @experimental This method is experimental and may change in the future. 
+ */ async getThread() { const spans = await this.getSpans({ spanType: ["llm"] }); const hashCache = new Map(); From b13ee0636c5651f6f09b04aca31065ededa5af6b Mon Sep 17 00:00:00 2001 From: Alex Z Date: Mon, 15 Dec 2025 17:09:04 -0800 Subject: [PATCH 09/65] cache v1 --- js/src/logger.ts | 15 +++ js/src/span-cache.test.ts | 166 ++++++++++++++++++++++++++++++++ js/src/span-cache.ts | 192 ++++++++++++++++++++++++++++++++++++++ js/src/trace.ts | 35 ++++++- 4 files changed, 406 insertions(+), 2 deletions(-) create mode 100644 js/src/span-cache.test.ts create mode 100644 js/src/span-cache.ts diff --git a/js/src/logger.ts b/js/src/logger.ts index 10f906b5e..08657ebb2 100644 --- a/js/src/logger.ts +++ b/js/src/logger.ts @@ -93,6 +93,7 @@ import { } from "./util"; import { lintTemplate } from "./mustache-utils"; import { prettifyXact } from "../util/index"; +import { SpanCache, CachedSpan } from "./span-cache"; // Context management interfaces export interface ContextParentSpanIds { @@ -557,6 +558,7 @@ export class BraintrustState { private _proxyConn: HTTPConnection | null = null; public promptCache: PromptCache; + public spanCache: SpanCache; private _idGenerator: IDGenerator | null = null; private _contextManager: ContextManager | null = null; @@ -595,6 +597,7 @@ export class BraintrustState { }) : undefined; this.promptCache = new PromptCache({ memoryCache, diskCache }); + this.spanCache = new SpanCache(); } public resetLoginInfo() { @@ -5602,6 +5605,18 @@ export class SpanImpl implements Span { throw new Error("Tags can only be logged to the root span"); } + // Write to local span cache for scorer access + const cachedSpan: CachedSpan = { + input: partialRecord.input, + output: partialRecord.output, + metadata: partialRecord.metadata as Record | undefined, + span_id: this._spanId, + span_parents: this._spanParents, + span_attributes: + partialRecord.span_attributes as CachedSpan["span_attributes"], + }; + this._state.spanCache.write(this._rootSpanId, this._spanId, 
cachedSpan); + const computeRecord = async () => ({ ...partialRecord, ...Object.fromEntries( diff --git a/js/src/span-cache.test.ts b/js/src/span-cache.test.ts new file mode 100644 index 000000000..924e1f268 --- /dev/null +++ b/js/src/span-cache.test.ts @@ -0,0 +1,166 @@ +import { describe, expect, test, beforeEach, vi } from "vitest"; +import { SpanCache } from "./span-cache"; + +describe("SpanCache", () => { + let cache: SpanCache; + + beforeEach(() => { + cache = new SpanCache(); + }); + + describe("write and read", () => { + test("should store and retrieve spans by rootSpanId", () => { + const rootSpanId = "root-123"; + const span1 = { + span_id: "span-1", + input: { text: "hello" }, + output: { response: "world" }, + }; + const span2 = { + span_id: "span-2", + input: { text: "foo" }, + output: { response: "bar" }, + }; + + cache.write(rootSpanId, span1.span_id, span1); + cache.write(rootSpanId, span2.span_id, span2); + + const spans = cache.getByRootSpanId(rootSpanId); + expect(spans).toHaveLength(2); + expect(spans).toContainEqual(span1); + expect(spans).toContainEqual(span2); + }); + + test("should return undefined for unknown rootSpanId", () => { + const spans = cache.getByRootSpanId("nonexistent"); + expect(spans).toBeUndefined(); + }); + + test("should merge span data on subsequent writes to same spanId", () => { + const rootSpanId = "root-123"; + const spanId = "span-1"; + + cache.write(rootSpanId, spanId, { + span_id: spanId, + input: { text: "hello" }, + }); + + cache.write(rootSpanId, spanId, { + span_id: spanId, + output: { response: "world" }, + }); + + const spans = cache.getByRootSpanId(rootSpanId); + expect(spans).toHaveLength(1); + expect(spans![0]).toEqual({ + span_id: spanId, + input: { text: "hello" }, + output: { response: "world" }, + }); + }); + + test("should merge metadata objects", () => { + const rootSpanId = "root-123"; + const spanId = "span-1"; + + cache.write(rootSpanId, spanId, { + span_id: spanId, + metadata: { key1: "value1" }, 
+ }); + + cache.write(rootSpanId, spanId, { + span_id: spanId, + metadata: { key2: "value2" }, + }); + + const spans = cache.getByRootSpanId(rootSpanId); + expect(spans![0].metadata).toEqual({ + key1: "value1", + key2: "value2", + }); + }); + }); + + describe("has", () => { + test("should return true when rootSpanId exists", () => { + cache.write("root-123", "span-1", { span_id: "span-1" }); + expect(cache.has("root-123")).toBe(true); + }); + + test("should return false when rootSpanId does not exist", () => { + expect(cache.has("nonexistent")).toBe(false); + }); + }); + + describe("clear", () => { + test("should remove spans for a specific rootSpanId", () => { + cache.write("root-1", "span-1", { span_id: "span-1" }); + cache.write("root-2", "span-2", { span_id: "span-2" }); + + cache.clear("root-1"); + + expect(cache.has("root-1")).toBe(false); + expect(cache.has("root-2")).toBe(true); + }); + }); + + describe("clearAll", () => { + test("should remove all cached spans", () => { + cache.write("root-1", "span-1", { span_id: "span-1" }); + cache.write("root-2", "span-2", { span_id: "span-2" }); + + cache.clearAll(); + + expect(cache.size).toBe(0); + }); + }); + + describe("eviction", () => { + test("should evict oldest entries when maxRootSpans is exceeded", () => { + const smallCache = new SpanCache({ maxRootSpans: 2 }); + + smallCache.write("root-1", "span-1", { span_id: "span-1" }); + smallCache.write("root-2", "span-2", { span_id: "span-2" }); + smallCache.write("root-3", "span-3", { span_id: "span-3" }); + + expect(smallCache.size).toBe(2); + expect(smallCache.has("root-1")).toBe(false); // Oldest evicted + expect(smallCache.has("root-2")).toBe(true); + expect(smallCache.has("root-3")).toBe(true); + }); + }); + + describe("TTL expiration", () => { + test("should return undefined for expired entries", () => { + vi.useFakeTimers(); + + const shortTtlCache = new SpanCache({ ttlMs: 1000 }); // 1 second TTL + shortTtlCache.write("root-1", "span-1", { span_id: 
"span-1" }); + + expect(shortTtlCache.has("root-1")).toBe(true); + + // Advance time past TTL + vi.advanceTimersByTime(2000); + + expect(shortTtlCache.has("root-1")).toBe(false); + expect(shortTtlCache.getByRootSpanId("root-1")).toBeUndefined(); + + vi.useRealTimers(); + }); + }); + + describe("size", () => { + test("should return the number of root spans", () => { + expect(cache.size).toBe(0); + + cache.write("root-1", "span-1", { span_id: "span-1" }); + expect(cache.size).toBe(1); + + cache.write("root-1", "span-2", { span_id: "span-2" }); // Same root + expect(cache.size).toBe(1); + + cache.write("root-2", "span-3", { span_id: "span-3" }); // Different root + expect(cache.size).toBe(2); + }); + }); +}); diff --git a/js/src/span-cache.ts b/js/src/span-cache.ts new file mode 100644 index 000000000..6cf317e6f --- /dev/null +++ b/js/src/span-cache.ts @@ -0,0 +1,192 @@ +/** + * SpanCache provides a local in-memory cache for span data, allowing + * scorers to read spans without making server round-trips when possible. + * + * Spans are indexed by rootSpanId, matching the query pattern used by + * Trace.getSpans(). + */ + +export interface CachedSpan { + input?: unknown; + output?: unknown; + metadata?: Record; + span_id: string; + span_parents?: string[]; + span_attributes?: { + name?: string; + type?: string; + [key: string]: unknown; + }; +} + +export interface SpanCacheOptions { + /** + * Maximum number of root spans to cache. When exceeded, oldest entries + * are evicted. Defaults to 1000. + */ + maxRootSpans?: number; + + /** + * Time-to-live in milliseconds. Cached spans older than this are + * considered stale. Defaults to 300000 (5 minutes). + */ + ttlMs?: number; +} + +interface CacheEntry { + spans: Map; + createdAt: number; +} + +const DEFAULT_MAX_ROOT_SPANS = 1000; +const DEFAULT_TTL_MS = 5 * 60 * 1000; // 5 minutes + +/** + * Local cache for span data, keyed by rootSpanId. 
+ * + * This cache is used by Trace.getSpans() to avoid server round-trips + * when fetching span data that was just logged locally. + */ +export class SpanCache { + private cache: Map = new Map(); + private readonly maxRootSpans: number; + private readonly ttlMs: number; + + constructor(options?: SpanCacheOptions) { + this.maxRootSpans = options?.maxRootSpans ?? DEFAULT_MAX_ROOT_SPANS; + this.ttlMs = options?.ttlMs ?? DEFAULT_TTL_MS; + } + + /** + * Write or update a span in the cache. + * + * @param rootSpanId The root span ID that groups this span + * @param spanId The unique ID of this span + * @param data The span data to cache + */ + write(rootSpanId: string, spanId: string, data: CachedSpan): void { + let entry = this.cache.get(rootSpanId); + + if (!entry) { + // Evict oldest entries if at capacity + this.evictIfNeeded(); + + entry = { + spans: new Map(), + createdAt: Date.now(), + }; + this.cache.set(rootSpanId, entry); + } + + // Merge with existing span data if present + const existing = entry.spans.get(spanId); + if (existing) { + entry.spans.set(spanId, this.mergeSpanData(existing, data)); + } else { + entry.spans.set(spanId, data); + } + } + + /** + * Get all cached spans for a given rootSpanId. + * + * @param rootSpanId The root span ID to look up + * @returns Array of cached spans, or undefined if not in cache or expired + */ + getByRootSpanId(rootSpanId: string): CachedSpan[] | undefined { + const entry = this.cache.get(rootSpanId); + + if (!entry) { + return undefined; + } + + // Check TTL + if (Date.now() - entry.createdAt > this.ttlMs) { + this.cache.delete(rootSpanId); + return undefined; + } + + return Array.from(entry.spans.values()); + } + + /** + * Check if a rootSpanId has cached data. 
+ */ + has(rootSpanId: string): boolean { + const entry = this.cache.get(rootSpanId); + if (!entry) return false; + + // Check TTL + if (Date.now() - entry.createdAt > this.ttlMs) { + this.cache.delete(rootSpanId); + return false; + } + + return true; + } + + /** + * Clear all cached spans for a given rootSpanId. + * Useful for explicit cleanup after scoring completes. + */ + clear(rootSpanId: string): void { + this.cache.delete(rootSpanId); + } + + /** + * Clear all cached data. + */ + clearAll(): void { + this.cache.clear(); + } + + /** + * Get the number of root spans currently cached. + */ + get size(): number { + return this.cache.size; + } + + private evictIfNeeded(): void { + if (this.cache.size < this.maxRootSpans) { + return; + } + + // Find and remove the oldest entry + let oldestKey: string | undefined; + let oldestTime = Infinity; + + for (const [key, entry] of this.cache) { + if (entry.createdAt < oldestTime) { + oldestTime = entry.createdAt; + oldestKey = key; + } + } + + if (oldestKey) { + this.cache.delete(oldestKey); + } + } + + private mergeSpanData( + existing: CachedSpan, + incoming: CachedSpan, + ): CachedSpan { + // Merge strategy: incoming values override existing ONLY if defined. + // Undefined values in incoming should not overwrite existing values. + return { + span_id: incoming.span_id, + span_parents: incoming.span_parents ?? existing.span_parents, + input: incoming.input !== undefined ? incoming.input : existing.input, + output: incoming.output !== undefined ? incoming.output : existing.output, + metadata: + existing.metadata || incoming.metadata + ? { ...existing.metadata, ...incoming.metadata } + : undefined, + span_attributes: + existing.span_attributes || incoming.span_attributes + ? 
{ ...existing.span_attributes, ...incoming.span_attributes } + : undefined, + }; + } +} diff --git a/js/src/trace.ts b/js/src/trace.ts index 0ab540486..f7c809841 100644 --- a/js/src/trace.ts +++ b/js/src/trace.ts @@ -69,19 +69,50 @@ export class Trace { /** * Fetch all rows for this root span from its parent experiment. * Returns an empty array when no experiment is associated with the context. + * + * First checks the local span cache for recently logged spans, then falls + * back to BTQL API if not found in cache. */ async getSpans({ spanType }: { spanType?: string[] } = {}): Promise { if (!this.experimentId) { return []; } - await this.ensureSpansReady(); - const state = _internalGetGlobalState(); if (!state) { return []; } + // Try local cache first + const cachedSpans = state.spanCache.getByRootSpanId(this.rootSpanId); + cachedSpans && cachedSpans.forEach((span) => console.log(span)); + if (cachedSpans && cachedSpans.length > 0) { + let spans = cachedSpans.filter( + (span) => span.span_attributes?.type !== "score", + ); + + // Apply spanType filter if specified + if (spanType && spanType.length > 0) { + spans = spans.filter((span) => + spanType.includes(span.span_attributes?.type ?? 
""), + ); + } + + return spans.map((span) => ({ + input: span.input, + output: span.output, + metadata: span.metadata, + span_id: span.span_id, + span_parents: span.span_parents, + span_attributes: span.span_attributes, + })); + } + + console.log("Cache miss - falling back to BTQL"); + + // Cache miss - fall back to BTQL + await this.ensureSpansReady(); + await state.login({}); const query = ` From c0035f6624a63fe93d8ae0644e74eb4aa20d1c0f Mon Sep 17 00:00:00 2001 From: Alex Z Date: Mon, 15 Dec 2025 17:34:43 -0800 Subject: [PATCH 10/65] tmp file --- js/src/framework.ts | 3 + js/src/logger.ts | 2 +- js/src/span-cache.test.ts | 87 ++++++-------- js/src/span-cache.ts | 239 ++++++++++++++++++++++++-------------- 4 files changed, 191 insertions(+), 140 deletions(-) diff --git a/js/src/framework.ts b/js/src/framework.ts index 2773997fa..2ceca9f0c 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -36,6 +36,7 @@ import { traced, withCurrent, withParent, + _internalGetGlobalState, } from "./logger"; import { BarProgressReporter, ProgressReporter } from "./progress"; import { isEmpty, InternalAbortError } from "./util"; @@ -729,6 +730,8 @@ export async function Eval< } } finally { progressReporter.stop(); + // Clean up disk-based span cache after eval completes + _internalGetGlobalState()?.spanCache?.dispose(); } } diff --git a/js/src/logger.ts b/js/src/logger.ts index 08657ebb2..5aa9c7f81 100644 --- a/js/src/logger.ts +++ b/js/src/logger.ts @@ -5615,7 +5615,7 @@ export class SpanImpl implements Span { span_attributes: partialRecord.span_attributes as CachedSpan["span_attributes"], }; - this._state.spanCache.write(this._rootSpanId, this._spanId, cachedSpan); + this._state.spanCache.writeSync(this._rootSpanId, this._spanId, cachedSpan); const computeRecord = async () => ({ ...partialRecord, diff --git a/js/src/span-cache.test.ts b/js/src/span-cache.test.ts index 924e1f268..54ae6a8c3 100644 --- a/js/src/span-cache.test.ts +++ b/js/src/span-cache.test.ts @@ -1,13 
+1,18 @@ -import { describe, expect, test, beforeEach, vi } from "vitest"; +import { describe, expect, test, beforeEach, afterEach } from "vitest"; import { SpanCache } from "./span-cache"; -describe("SpanCache", () => { +describe("SpanCache (disk-based)", () => { let cache: SpanCache; beforeEach(() => { cache = new SpanCache(); }); + afterEach(() => { + // Clean up temp file after each test + cache.dispose(); + }); + describe("write and read", () => { test("should store and retrieve spans by rootSpanId", () => { const rootSpanId = "root-123"; @@ -22,8 +27,8 @@ describe("SpanCache", () => { output: { response: "bar" }, }; - cache.write(rootSpanId, span1.span_id, span1); - cache.write(rootSpanId, span2.span_id, span2); + cache.writeSync(rootSpanId, span1.span_id, span1); + cache.writeSync(rootSpanId, span2.span_id, span2); const spans = cache.getByRootSpanId(rootSpanId); expect(spans).toHaveLength(2); @@ -40,12 +45,12 @@ describe("SpanCache", () => { const rootSpanId = "root-123"; const spanId = "span-1"; - cache.write(rootSpanId, spanId, { + cache.writeSync(rootSpanId, spanId, { span_id: spanId, input: { text: "hello" }, }); - cache.write(rootSpanId, spanId, { + cache.writeSync(rootSpanId, spanId, { span_id: spanId, output: { response: "world" }, }); @@ -63,12 +68,12 @@ describe("SpanCache", () => { const rootSpanId = "root-123"; const spanId = "span-1"; - cache.write(rootSpanId, spanId, { + cache.writeSync(rootSpanId, spanId, { span_id: spanId, metadata: { key1: "value1" }, }); - cache.write(rootSpanId, spanId, { + cache.writeSync(rootSpanId, spanId, { span_id: spanId, metadata: { key2: "value2" }, }); @@ -83,7 +88,7 @@ describe("SpanCache", () => { describe("has", () => { test("should return true when rootSpanId exists", () => { - cache.write("root-123", "span-1", { span_id: "span-1" }); + cache.writeSync("root-123", "span-1", { span_id: "span-1" }); expect(cache.has("root-123")).toBe(true); }); @@ -93,9 +98,9 @@ describe("SpanCache", () => { }); 
describe("clear", () => { - test("should remove spans for a specific rootSpanId", () => { - cache.write("root-1", "span-1", { span_id: "span-1" }); - cache.write("root-2", "span-2", { span_id: "span-2" }); + test("should remove spans for a specific rootSpanId from index", () => { + cache.writeSync("root-1", "span-1", { span_id: "span-1" }); + cache.writeSync("root-2", "span-2", { span_id: "span-2" }); cache.clear("root-1"); @@ -106,8 +111,8 @@ describe("SpanCache", () => { describe("clearAll", () => { test("should remove all cached spans", () => { - cache.write("root-1", "span-1", { span_id: "span-1" }); - cache.write("root-2", "span-2", { span_id: "span-2" }); + cache.writeSync("root-1", "span-1", { span_id: "span-1" }); + cache.writeSync("root-2", "span-2", { span_id: "span-2" }); cache.clearAll(); @@ -115,52 +120,34 @@ describe("SpanCache", () => { }); }); - describe("eviction", () => { - test("should evict oldest entries when maxRootSpans is exceeded", () => { - const smallCache = new SpanCache({ maxRootSpans: 2 }); + describe("size", () => { + test("should return the number of root spans tracked", () => { + expect(cache.size).toBe(0); - smallCache.write("root-1", "span-1", { span_id: "span-1" }); - smallCache.write("root-2", "span-2", { span_id: "span-2" }); - smallCache.write("root-3", "span-3", { span_id: "span-3" }); + cache.writeSync("root-1", "span-1", { span_id: "span-1" }); + expect(cache.size).toBe(1); - expect(smallCache.size).toBe(2); - expect(smallCache.has("root-1")).toBe(false); // Oldest evicted - expect(smallCache.has("root-2")).toBe(true); - expect(smallCache.has("root-3")).toBe(true); + cache.writeSync("root-1", "span-2", { span_id: "span-2" }); // Same root + expect(cache.size).toBe(1); + + cache.writeSync("root-2", "span-3", { span_id: "span-3" }); // Different root + expect(cache.size).toBe(2); }); }); - describe("TTL expiration", () => { - test("should return undefined for expired entries", () => { - vi.useFakeTimers(); - - const 
shortTtlCache = new SpanCache({ ttlMs: 1000 }); // 1 second TTL - shortTtlCache.write("root-1", "span-1", { span_id: "span-1" }); - - expect(shortTtlCache.has("root-1")).toBe(true); - - // Advance time past TTL - vi.advanceTimersByTime(2000); + describe("dispose", () => { + test("should clean up and allow reuse", () => { + cache.writeSync("root-1", "span-1", { span_id: "span-1" }); + expect(cache.size).toBe(1); - expect(shortTtlCache.has("root-1")).toBe(false); - expect(shortTtlCache.getByRootSpanId("root-1")).toBeUndefined(); + cache.dispose(); - vi.useRealTimers(); - }); - }); - - describe("size", () => { - test("should return the number of root spans", () => { expect(cache.size).toBe(0); + expect(cache.has("root-1")).toBe(false); - cache.write("root-1", "span-1", { span_id: "span-1" }); - expect(cache.size).toBe(1); - - cache.write("root-1", "span-2", { span_id: "span-2" }); // Same root + // Should be able to write again after dispose + cache.writeSync("root-2", "span-2", { span_id: "span-2" }); expect(cache.size).toBe(1); - - cache.write("root-2", "span-3", { span_id: "span-3" }); // Different root - expect(cache.size).toBe(2); }); }); }); diff --git a/js/src/span-cache.ts b/js/src/span-cache.ts index 6cf317e6f..ecec0baaa 100644 --- a/js/src/span-cache.ts +++ b/js/src/span-cache.ts @@ -1,11 +1,15 @@ /** - * SpanCache provides a local in-memory cache for span data, allowing + * SpanCache provides a disk-based cache for span data, allowing * scorers to read spans without making server round-trips when possible. * - * Spans are indexed by rootSpanId, matching the query pattern used by - * Trace.getSpans(). + * Spans are stored on disk to minimize memory usage during evaluations. + * The cache file is automatically cleaned up when dispose() is called. 
*/ +import * as fs from "node:fs"; +import * as os from "node:os"; +import * as path from "node:path"; + export interface CachedSpan { input?: unknown; output?: unknown; @@ -19,42 +23,54 @@ export interface CachedSpan { }; } -export interface SpanCacheOptions { - /** - * Maximum number of root spans to cache. When exceeded, oldest entries - * are evicted. Defaults to 1000. - */ - maxRootSpans?: number; - - /** - * Time-to-live in milliseconds. Cached spans older than this are - * considered stale. Defaults to 300000 (5 minutes). - */ - ttlMs?: number; +interface DiskSpanRecord { + rootSpanId: string; + spanId: string; + data: CachedSpan; } -interface CacheEntry { - spans: Map; - createdAt: number; -} - -const DEFAULT_MAX_ROOT_SPANS = 1000; -const DEFAULT_TTL_MS = 5 * 60 * 1000; // 5 minutes - /** - * Local cache for span data, keyed by rootSpanId. + * Disk-based cache for span data, keyed by rootSpanId. * - * This cache is used by Trace.getSpans() to avoid server round-trips - * when fetching span data that was just logged locally. + * This cache writes spans to a temporary file to minimize memory usage. + * It uses append-only writes and reads the full file when querying. */ export class SpanCache { - private cache: Map = new Map(); - private readonly maxRootSpans: number; - private readonly ttlMs: number; + private cacheFilePath: string | null = null; + private fileHandle: fs.promises.FileHandle | null = null; + private initialized = false; + private initPromise: Promise | null = null; - constructor(options?: SpanCacheOptions) { - this.maxRootSpans = options?.maxRootSpans ?? DEFAULT_MAX_ROOT_SPANS; - this.ttlMs = options?.ttlMs ?? 
DEFAULT_TTL_MS; + // Small in-memory index tracking which rootSpanIds have data + private rootSpanIndex: Set = new Set(); + + constructor() { + // Initialization is lazy - file is created on first write + } + + private async ensureInitialized(): Promise { + if (this.initialized) { + return; + } + + if (this.initPromise) { + return this.initPromise; + } + + this.initPromise = (async () => { + const tmpDir = os.tmpdir(); + const uniqueId = `${Date.now()}-${Math.random().toString(36).slice(2)}`; + this.cacheFilePath = path.join( + tmpDir, + `braintrust-span-cache-${uniqueId}.jsonl`, + ); + + // Open file for append+read + this.fileHandle = await fs.promises.open(this.cacheFilePath, "a+"); + this.initialized = true; + })(); + + return this.initPromise; } /** @@ -64,108 +80,153 @@ export class SpanCache { * @param spanId The unique ID of this span * @param data The span data to cache */ - write(rootSpanId: string, spanId: string, data: CachedSpan): void { - let entry = this.cache.get(rootSpanId); - - if (!entry) { - // Evict oldest entries if at capacity - this.evictIfNeeded(); - - entry = { - spans: new Map(), - createdAt: Date.now(), - }; - this.cache.set(rootSpanId, entry); - } + async write( + rootSpanId: string, + spanId: string, + data: CachedSpan, + ): Promise { + await this.ensureInitialized(); + + const record: DiskSpanRecord = { rootSpanId, spanId, data }; + const line = JSON.stringify(record) + "\n"; + + await this.fileHandle!.appendFile(line, "utf8"); + this.rootSpanIndex.add(rootSpanId); + } - // Merge with existing span data if present - const existing = entry.spans.get(spanId); - if (existing) { - entry.spans.set(spanId, this.mergeSpanData(existing, data)); - } else { - entry.spans.set(spanId, data); + /** + * Synchronous write - fire and forget. + * Uses sync file operations to avoid blocking the caller. 
+ */ + writeSync(rootSpanId: string, spanId: string, data: CachedSpan): void { + // Lazy init - create file synchronously if needed + if (!this.initialized) { + const tmpDir = os.tmpdir(); + const uniqueId = `${Date.now()}-${Math.random().toString(36).slice(2)}`; + this.cacheFilePath = path.join( + tmpDir, + `braintrust-span-cache-${uniqueId}.jsonl`, + ); + // Touch the file + fs.writeFileSync(this.cacheFilePath, ""); + this.initialized = true; } + + const record: DiskSpanRecord = { rootSpanId, spanId, data }; + const line = JSON.stringify(record) + "\n"; + + fs.appendFileSync(this.cacheFilePath!, line, "utf8"); + this.rootSpanIndex.add(rootSpanId); } /** * Get all cached spans for a given rootSpanId. * + * This reads the file and merges all records for the given rootSpanId. + * * @param rootSpanId The root span ID to look up - * @returns Array of cached spans, or undefined if not in cache or expired + * @returns Array of cached spans, or undefined if not in cache */ getByRootSpanId(rootSpanId: string): CachedSpan[] | undefined { - const entry = this.cache.get(rootSpanId); - - if (!entry) { + if (!this.initialized || !this.cacheFilePath) { return undefined; } - // Check TTL - if (Date.now() - entry.createdAt > this.ttlMs) { - this.cache.delete(rootSpanId); + // Quick check using in-memory index + if (!this.rootSpanIndex.has(rootSpanId)) { return undefined; } - return Array.from(entry.spans.values()); + try { + const content = fs.readFileSync(this.cacheFilePath, "utf8"); + const lines = content.trim().split("\n").filter(Boolean); + + // Accumulate spans by spanId, merging updates + const spanMap = new Map(); + + for (const line of lines) { + try { + const record = JSON.parse(line) as DiskSpanRecord; + if (record.rootSpanId !== rootSpanId) { + continue; + } + + const existing = spanMap.get(record.spanId); + if (existing) { + spanMap.set( + record.spanId, + this.mergeSpanData(existing, record.data), + ); + } else { + spanMap.set(record.spanId, record.data); + } + } 
catch { + // Skip malformed lines + } + } + + if (spanMap.size === 0) { + return undefined; + } + + return Array.from(spanMap.values()); + } catch { + return undefined; + } } /** * Check if a rootSpanId has cached data. */ has(rootSpanId: string): boolean { - const entry = this.cache.get(rootSpanId); - if (!entry) return false; - - // Check TTL - if (Date.now() - entry.createdAt > this.ttlMs) { - this.cache.delete(rootSpanId); - return false; - } - - return true; + return this.rootSpanIndex.has(rootSpanId); } /** * Clear all cached spans for a given rootSpanId. - * Useful for explicit cleanup after scoring completes. + * Note: This only removes from the index. The data remains in the file + * but will be ignored on reads. */ clear(rootSpanId: string): void { - this.cache.delete(rootSpanId); + this.rootSpanIndex.delete(rootSpanId); } /** - * Clear all cached data. + * Clear all cached data and remove the cache file. */ clearAll(): void { - this.cache.clear(); + this.rootSpanIndex.clear(); + this.dispose(); } /** - * Get the number of root spans currently cached. + * Get the number of root spans currently tracked. */ get size(): number { - return this.cache.size; + return this.rootSpanIndex.size; } - private evictIfNeeded(): void { - if (this.cache.size < this.maxRootSpans) { - return; + /** + * Clean up the cache file. Call this when the eval is complete. 
+ */ + dispose(): void { + if (this.fileHandle) { + this.fileHandle.close().catch(() => {}); + this.fileHandle = null; } - // Find and remove the oldest entry - let oldestKey: string | undefined; - let oldestTime = Infinity; - - for (const [key, entry] of this.cache) { - if (entry.createdAt < oldestTime) { - oldestTime = entry.createdAt; - oldestKey = key; + if (this.cacheFilePath) { + try { + fs.unlinkSync(this.cacheFilePath); + } catch { + // Ignore cleanup errors } + this.cacheFilePath = null; } - if (oldestKey) { - this.cache.delete(oldestKey); - } + this.initialized = false; + this.initPromise = null; + this.rootSpanIndex.clear(); } private mergeSpanData( From 77b26a992014bf606b8a5b4ba3c4a851b2c6f536 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Tue, 16 Dec 2025 10:30:24 -0800 Subject: [PATCH 11/65] flag --- js/src/logger.ts | 8 +++++++- js/src/span-cache.ts | 12 +++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/js/src/logger.ts b/js/src/logger.ts index 5aa9c7f81..bd79223eb 100644 --- a/js/src/logger.ts +++ b/js/src/logger.ts @@ -597,7 +597,7 @@ export class BraintrustState { }) : undefined; this.promptCache = new PromptCache({ memoryCache, diskCache }); - this.spanCache = new SpanCache(); + this.spanCache = new SpanCache({ disabled: loginParams.disableSpanCache }); } public resetLoginInfo() { @@ -3691,6 +3691,12 @@ export interface LoginOptions { * Calls this function if there's an error in the background flusher. */ onFlushError?: (error: unknown) => void; + /** + * If true, disables the local span cache used to optimize scorer access + * to trace data. When disabled, scorers will always fetch spans from the + * server. Defaults to false. 
+ */ + disableSpanCache?: boolean; } export type FullLoginOptions = LoginOptions & { diff --git a/js/src/span-cache.ts b/js/src/span-cache.ts index ecec0baaa..b5e44b950 100644 --- a/js/src/span-cache.ts +++ b/js/src/span-cache.ts @@ -40,11 +40,13 @@ export class SpanCache { private fileHandle: fs.promises.FileHandle | null = null; private initialized = false; private initPromise: Promise | null = null; + private readonly disabled: boolean; // Small in-memory index tracking which rootSpanIds have data private rootSpanIndex: Set = new Set(); - constructor() { + constructor(options?: { disabled?: boolean }) { + this.disabled = options?.disabled ?? false; // Initialization is lazy - file is created on first write } @@ -99,6 +101,10 @@ export class SpanCache { * Uses sync file operations to avoid blocking the caller. */ writeSync(rootSpanId: string, spanId: string, data: CachedSpan): void { + if (this.disabled) { + return; + } + // Lazy init - create file synchronously if needed if (!this.initialized) { const tmpDir = os.tmpdir(); @@ -128,6 +134,10 @@ export class SpanCache { * @returns Array of cached spans, or undefined if not in cache */ getByRootSpanId(rootSpanId: string): CachedSpan[] | undefined { + if (this.disabled) { + return undefined; + } + if (!this.initialized || !this.cacheFilePath) { return undefined; } From 59ee9b8336213a14ebd08a80b57643eb7ba23cea Mon Sep 17 00:00:00 2001 From: Alex Z Date: Tue, 16 Dec 2025 10:38:07 -0800 Subject: [PATCH 12/65] bump vers to fix test --- js/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/package.json b/js/package.json index 54766cf1d..8b0dd8dcb 100644 --- a/js/package.json +++ b/js/package.json @@ -1,6 +1,6 @@ { "name": "braintrust", - "version": "1.0.2", + "version": "1.0.3", "description": "SDK for integrating Braintrust", "repository": { "type": "git", From 4c1bc7994e02a47b69e8fe93f2588154a856d0cb Mon Sep 17 00:00:00 2001 From: Alex Z Date: Tue, 16 Dec 2025 11:03:25 -0800 Subject: 
[PATCH 13/65] major bump --- js/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/package.json b/js/package.json index 8b0dd8dcb..51f320d4c 100644 --- a/js/package.json +++ b/js/package.json @@ -1,6 +1,6 @@ { "name": "braintrust", - "version": "1.0.3", + "version": "2.0.3", "description": "SDK for integrating Braintrust", "repository": { "type": "git", From 2eeeb46b1a2923882692cbec34f6fface434b91c Mon Sep 17 00:00:00 2001 From: Alex Z Date: Tue, 16 Dec 2025 11:16:40 -0800 Subject: [PATCH 14/65] disable local cache --- js/src/functions/invoke.test.ts | 48 +++++++++++++++++++++++++++++++++ js/src/functions/invoke.ts | 3 +++ js/src/span-cache.test.ts | 41 ++++++++++++++++++++++++++++ js/src/span-cache.ts | 16 +++++++++-- 4 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 js/src/functions/invoke.test.ts diff --git a/js/src/functions/invoke.test.ts b/js/src/functions/invoke.test.ts new file mode 100644 index 000000000..452518391 --- /dev/null +++ b/js/src/functions/invoke.test.ts @@ -0,0 +1,48 @@ +import { describe, expect, test, beforeEach, afterEach, vi } from "vitest"; +import { initFunction } from "./invoke"; +import { _internalGetGlobalState, _exportsForTestingOnly } from "../logger"; + +describe("initFunction", () => { + beforeEach(() => { + _exportsForTestingOnly.setInitialTestState(); + }); + + afterEach(() => { + _exportsForTestingOnly.clearTestBackgroundLogger(); + }); + + test("should disable span cache when called", async () => { + const state = _internalGetGlobalState(); + + // Cache should not be disabled initially + expect(state.spanCache.disabled).toBe(false); + + // Call initFunction + initFunction({ + projectName: "test-project", + slug: "test-function", + }); + + // Cache should now be disabled + expect(state.spanCache.disabled).toBe(true); + }); + + test("should return a function with correct name", () => { + const fn = initFunction({ + projectName: "my-project", + slug: "my-scorer", + version: "v1", 
+ }); + + expect(fn.name).toBe("initFunction-my-project-my-scorer-v1"); + }); + + test("should use 'latest' in name when version not specified", () => { + const fn = initFunction({ + projectName: "my-project", + slug: "my-scorer", + }); + + expect(fn.name).toBe("initFunction-my-project-my-scorer-latest"); + }); +}); diff --git a/js/src/functions/invoke.ts b/js/src/functions/invoke.ts index da4c27286..1814ff0a9 100644 --- a/js/src/functions/invoke.ts +++ b/js/src/functions/invoke.ts @@ -255,6 +255,9 @@ export function initFunction({ slug: string; version?: string; }) { + // Disable span cache since remote function spans won't be in the local cache + _internalGetGlobalState()?.spanCache?.disable(); + // eslint-disable-next-line @typescript-eslint/no-explicit-any const f = async (input: any): Promise => { return await invoke({ diff --git a/js/src/span-cache.test.ts b/js/src/span-cache.test.ts index 54ae6a8c3..c1ac800b9 100644 --- a/js/src/span-cache.test.ts +++ b/js/src/span-cache.test.ts @@ -150,4 +150,45 @@ describe("SpanCache (disk-based)", () => { expect(cache.size).toBe(1); }); }); + + describe("disable", () => { + test("should prevent writes after disable() is called", () => { + cache.writeSync("root-1", "span-1", { span_id: "span-1" }); + expect(cache.size).toBe(1); + + cache.disable(); + + // Writes after disable should be no-ops + cache.writeSync("root-2", "span-2", { span_id: "span-2" }); + expect(cache.size).toBe(1); // Still 1, not 2 + }); + + test("should return undefined from getByRootSpanId after disable()", () => { + cache.writeSync("root-1", "span-1", { span_id: "span-1" }); + expect(cache.getByRootSpanId("root-1")).toBeDefined(); + + cache.disable(); + + // Reads after disable return undefined + expect(cache.getByRootSpanId("root-1")).toBeUndefined(); + }); + + test("disabled getter should reflect disabled state", () => { + expect(cache.disabled).toBe(false); + cache.disable(); + expect(cache.disabled).toBe(true); + }); + + test("should be disabled 
from constructor option", () => { + const disabledCache = new SpanCache({ disabled: true }); + expect(disabledCache.disabled).toBe(true); + + // Writes should be no-ops + disabledCache.writeSync("root-1", "span-1", { span_id: "span-1" }); + expect(disabledCache.size).toBe(0); + expect(disabledCache.getByRootSpanId("root-1")).toBeUndefined(); + + disabledCache.dispose(); + }); + }); }); diff --git a/js/src/span-cache.ts b/js/src/span-cache.ts index b5e44b950..b3064b8ec 100644 --- a/js/src/span-cache.ts +++ b/js/src/span-cache.ts @@ -40,16 +40,28 @@ export class SpanCache { private fileHandle: fs.promises.FileHandle | null = null; private initialized = false; private initPromise: Promise | null = null; - private readonly disabled: boolean; + private _disabled: boolean; // Small in-memory index tracking which rootSpanIds have data private rootSpanIndex: Set = new Set(); constructor(options?: { disabled?: boolean }) { - this.disabled = options?.disabled ?? false; + this._disabled = options?.disabled ?? false; // Initialization is lazy - file is created on first write } + /** + * Disable the cache at runtime. This is called automatically when + * initFunction is used, since remote function spans won't be in the cache. + */ + disable(): void { + this._disabled = true; + } + + get disabled(): boolean { + return this._disabled; + } + private async ensureInitialized(): Promise { if (this.initialized) { return; From 30e5217a9f1632135d88f63c6d7c2027480db73e Mon Sep 17 00:00:00 2001 From: Alex Z Date: Tue, 16 Dec 2025 15:04:38 -0800 Subject: [PATCH 15/65] otel support? 
--- integrations/otel-js/src/otel.ts | 12 ++++++++++- js/src/framework.ts | 15 +++++++++----- js/src/functions/invoke.test.ts | 34 ++++++++++++++++++++++++++++++++ js/src/logger.ts | 30 ++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+), 6 deletions(-) diff --git a/integrations/otel-js/src/otel.ts b/integrations/otel-js/src/otel.ts index 8132f6c88..25d7aaa0b 100644 --- a/integrations/otel-js/src/otel.ts +++ b/integrations/otel-js/src/otel.ts @@ -15,7 +15,11 @@ import { BatchSpanProcessor, type Span as SDKSpan, } from "@opentelemetry/sdk-trace-base"; -import { IDGenerator, type Span as BraintrustSpan } from "braintrust"; +import { + IDGenerator, + registerOtelFlush, + type Span as BraintrustSpan, +} from "braintrust"; interface ExportResult { code: number; @@ -251,6 +255,8 @@ export class BraintrustSpanProcessor implements SpanProcessor { } else { this.aiSpanProcessor = this.processor; } + // Register forceFlush callback with main SDK + registerOtelFlush(() => this.forceFlush()); return; } @@ -311,6 +317,10 @@ export class BraintrustSpanProcessor implements SpanProcessor { // Use the batch processor directly without filtering (default behavior) this.aiSpanProcessor = this.processor; } + + // Register forceFlush callback with main SDK so OTEL spans get flushed + // when scorers call trace.getSpans() and need to query BTQL + registerOtelFlush(() => this.forceFlush()); } onStart(span: SDKSpan, parentContext: Context): void { diff --git a/js/src/framework.ts b/js/src/framework.ts index 2ceca9f0c..34ba300a8 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -982,15 +982,20 @@ async function runEvaluatorInternal( const callback = async (rootSpan: Span) => { const ensureSpansFlushed = async () => { + // Flush native Braintrust spans if (experiment) { await flush({ state: experiment.loggingState }); - return; - } - if (evaluator.state) { + } else if (evaluator.state) { await flush({ state: evaluator.state }); - return; + } else { + await flush(); + } 
+ + // Also flush OTEL spans if registered + const state = _internalGetGlobalState(); + if (state) { + await state.flushOtel(); } - await flush(); }; const trace = new Trace({ diff --git a/js/src/functions/invoke.test.ts b/js/src/functions/invoke.test.ts index 452518391..f79d752f6 100644 --- a/js/src/functions/invoke.test.ts +++ b/js/src/functions/invoke.test.ts @@ -46,3 +46,37 @@ describe("initFunction", () => { expect(fn.name).toBe("initFunction-my-project-my-scorer-latest"); }); }); + +describe("registerOtelFlush", () => { + beforeEach(() => { + _exportsForTestingOnly.setInitialTestState(); + }); + + afterEach(() => { + _exportsForTestingOnly.clearTestBackgroundLogger(); + }); + + test("should register OTEL flush callback", async () => { + const { registerOtelFlush } = await import("../logger"); + const state = _internalGetGlobalState(); + + let flushed = false; + const mockFlush = async () => { + flushed = true; + }; + + registerOtelFlush(mockFlush); + + // Calling flushOtel should invoke the registered callback + await state.flushOtel(); + + expect(flushed).toBe(true); + }); + + test("flushOtel should be no-op when no callback registered", async () => { + const state = _internalGetGlobalState(); + + // Should not throw + await state.flushOtel(); + }); +}); diff --git a/js/src/logger.ts b/js/src/logger.ts index bd79223eb..40003a2fb 100644 --- a/js/src/logger.ts +++ b/js/src/logger.ts @@ -561,6 +561,7 @@ export class BraintrustState { public spanCache: SpanCache; private _idGenerator: IDGenerator | null = null; private _contextManager: ContextManager | null = null; + private _otelFlushCallback: (() => Promise) | null = null; constructor(private loginParams: LoginOptions) { this.id = `${new Date().toLocaleString()}-${stateNonce++}`; // This is for debugging. uuidv4() breaks on platforms like Cloudflare. @@ -635,6 +636,24 @@ export class BraintrustState { return this._contextManager; } + /** + * Register an OTEL flush callback. 
This is called by @braintrust/otel + * when it initializes a BraintrustSpanProcessor/Exporter. + */ + public registerOtelFlush(callback: () => Promise): void { + this._otelFlushCallback = callback; + } + + /** + * Flush OTEL spans if a callback is registered. + * Called during ensureSpansFlushed to ensure OTEL spans are visible in BTQL. + */ + public async flushOtel(): Promise { + if (this._otelFlushCallback) { + await this._otelFlushCallback(); + } + } + public copyLoginInfo(other: BraintrustState) { this.appUrl = other.appUrl; this.appPublicUrl = other.appPublicUrl; @@ -906,6 +925,17 @@ export function _internalSetInitialState() { */ export const _internalGetGlobalState = () => _globalState; +/** + * Register a callback to flush OTEL spans. This is called by @braintrust/otel + * when it initializes a BraintrustSpanProcessor/Exporter. + * + * When ensureSpansFlushed is called (e.g., before a BTQL query in scorers), + * this callback will be invoked to ensure OTEL spans are flushed to the server. + */ +export function registerOtelFlush(callback: () => Promise): void { + _globalState?.registerOtelFlush(callback); +} + export class FailedHTTPResponse extends Error { public status: number; public text: string; From f67f9e8bab5f5688cf6a6b96376f87d72c17fffa Mon Sep 17 00:00:00 2001 From: Alex Z Date: Tue, 16 Dec 2025 15:17:08 -0800 Subject: [PATCH 16/65] turn off cache when otel is used --- js/src/logger.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/js/src/logger.ts b/js/src/logger.ts index 40003a2fb..87d5b63a6 100644 --- a/js/src/logger.ts +++ b/js/src/logger.ts @@ -931,9 +931,14 @@ export const _internalGetGlobalState = () => _globalState; * * When ensureSpansFlushed is called (e.g., before a BTQL query in scorers), * this callback will be invoked to ensure OTEL spans are flushed to the server. + * + * Also disables the span cache, since OTEL spans aren't in the local cache + * and we need BTQL to see the complete span tree (both native + OTEL spans). 
*/ export function registerOtelFlush(callback: () => Promise): void { _globalState?.registerOtelFlush(callback); + // Disable span cache since OTEL spans aren't in the local cache + _globalState?.spanCache?.disable(); } export class FailedHTTPResponse extends Error { From 7807091aaa7033cc7cc579a7ebdbeaa77b966d96 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Wed, 17 Dec 2025 10:16:28 -0800 Subject: [PATCH 17/65] remove console.log --- js/src/trace.ts | 3 --- 1 file changed, 3 deletions(-) diff --git a/js/src/trace.ts b/js/src/trace.ts index f7c809841..00a503388 100644 --- a/js/src/trace.ts +++ b/js/src/trace.ts @@ -85,7 +85,6 @@ export class Trace { // Try local cache first const cachedSpans = state.spanCache.getByRootSpanId(this.rootSpanId); - cachedSpans && cachedSpans.forEach((span) => console.log(span)); if (cachedSpans && cachedSpans.length > 0) { let spans = cachedSpans.filter( (span) => span.span_attributes?.type !== "score", @@ -108,8 +107,6 @@ export class Trace { })); } - console.log("Cache miss - falling back to BTQL"); - // Cache miss - fall back to BTQL await this.ensureSpansReady(); From bc236efc717d29a38e383f4323c9b042beddc5c1 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Wed, 17 Dec 2025 10:17:06 -0800 Subject: [PATCH 18/65] sensible new version --- js/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/package.json b/js/package.json index 51f320d4c..d23a44d73 100644 --- a/js/package.json +++ b/js/package.json @@ -1,6 +1,6 @@ { "name": "braintrust", - "version": "2.0.3", + "version": "2.0.0", "description": "SDK for integrating Braintrust", "repository": { "type": "git", From efea2cb71dc499755888963d341331a03d040a7f Mon Sep 17 00:00:00 2001 From: Alex Z Date: Wed, 17 Dec 2025 11:11:45 -0800 Subject: [PATCH 19/65] fix build --- js/src/exports.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/js/src/exports.ts b/js/src/exports.ts index 814249ea3..b3a004c57 100644 --- a/js/src/exports.ts +++ b/js/src/exports.ts @@ -99,6 
+99,7 @@ export { withLogger, withParent, wrapTraced, + registerOtelFlush, } from "./logger"; export type { InvokeFunctionArgs, InvokeReturn } from "./functions/invoke"; From b5d117b485ca1feca6449136a146d6d3a18e552e Mon Sep 17 00:00:00 2001 From: Alex Z Date: Wed, 17 Dec 2025 13:29:43 -0800 Subject: [PATCH 20/65] try to fix web builds --- js/src/functions/invoke.test.ts | 2 + js/src/isomorph.ts | 10 ++++- js/src/node.ts | 7 ++++ js/src/span-cache.test.ts | 14 ++++++- js/src/span-cache.ts | 69 ++++++++++++++++++++++++--------- js/src/trace.ts | 10 ++++- 6 files changed, 90 insertions(+), 22 deletions(-) diff --git a/js/src/functions/invoke.test.ts b/js/src/functions/invoke.test.ts index f79d752f6..13ecb7e66 100644 --- a/js/src/functions/invoke.test.ts +++ b/js/src/functions/invoke.test.ts @@ -1,9 +1,11 @@ import { describe, expect, test, beforeEach, afterEach, vi } from "vitest"; import { initFunction } from "./invoke"; import { _internalGetGlobalState, _exportsForTestingOnly } from "../logger"; +import { configureNode } from "../node"; describe("initFunction", () => { beforeEach(() => { + configureNode(); _exportsForTestingOnly.setInitialTestState(); }); diff --git a/js/src/isomorph.ts b/js/src/isomorph.ts index 0095e7c5c..1e74cc65d 100644 --- a/js/src/isomorph.ts +++ b/js/src/isomorph.ts @@ -46,7 +46,7 @@ export interface Common { basename: (filepath: string) => string; writeln: (text: string) => void; - // Filesystem operations. + // Filesystem operations (async). pathJoin?: (...args: string[]) => string; pathDirname?: (path: string) => string; mkdir?: ( @@ -61,6 +61,14 @@ export interface Common { stat?: (path: string) => Promise; // type-erased statSync?: (path: string) => any; // type-erased homedir?: () => string; + tmpdir?: () => string; + + // Filesystem operations (sync) - for span cache. 
+ writeFileSync?: (filename: string, data: string) => void; + appendFileSync?: (filename: string, data: string) => void; + readFileSync?: (filename: string, encoding: string) => string; + unlinkSync?: (path: string) => void; + openFile?: (path: string, flags: string) => Promise; // fs.promises.FileHandle, type-erased // zlib (promisified and type-erased). gunzip?: (data: any) => Promise; diff --git a/js/src/node.ts b/js/src/node.ts index 3ec477263..e358694b6 100644 --- a/js/src/node.ts +++ b/js/src/node.ts @@ -34,6 +34,13 @@ export function configureNode() { iso.utimes = fs.utimes; iso.unlink = fs.unlink; iso.homedir = os.homedir; + iso.tmpdir = os.tmpdir; + iso.writeFileSync = fsSync.writeFileSync; + iso.appendFileSync = fsSync.appendFileSync; + iso.readFileSync = (filename: string, encoding: string) => + fsSync.readFileSync(filename, encoding as BufferEncoding); + iso.unlinkSync = fsSync.unlinkSync; + iso.openFile = fs.open; iso.gzip = promisify(zlib.gzip); iso.gunzip = promisify(zlib.gunzip); iso.hash = (data) => crypto.createHash("sha256").update(data).digest("hex"); diff --git a/js/src/span-cache.test.ts b/js/src/span-cache.test.ts index c1ac800b9..96ff07ed8 100644 --- a/js/src/span-cache.test.ts +++ b/js/src/span-cache.test.ts @@ -1,9 +1,21 @@ -import { describe, expect, test, beforeEach, afterEach } from "vitest"; +import { + describe, + expect, + test, + beforeEach, + afterEach, + beforeAll, +} from "vitest"; import { SpanCache } from "./span-cache"; +import { configureNode } from "./node"; describe("SpanCache (disk-based)", () => { let cache: SpanCache; + beforeAll(() => { + configureNode(); + }); + beforeEach(() => { cache = new SpanCache(); }); diff --git a/js/src/span-cache.ts b/js/src/span-cache.ts index b3064b8ec..b8c356584 100644 --- a/js/src/span-cache.ts +++ b/js/src/span-cache.ts @@ -4,11 +4,29 @@ * * Spans are stored on disk to minimize memory usage during evaluations. * The cache file is automatically cleaned up when dispose() is called. 
+ * + * In browser environments where filesystem access isn't available, + * the cache becomes a no-op (all lookups return undefined). */ -import * as fs from "node:fs"; -import * as os from "node:os"; -import * as path from "node:path"; +import iso from "./isomorph"; + +/** + * Check if the span cache can be used (requires filesystem APIs). + * This is called at runtime, not at module load time, to allow + * configureNode() to set up the isomorph functions first. + */ +function canUseSpanCache(): boolean { + return !!( + iso.pathJoin && + iso.tmpdir && + iso.writeFileSync && + iso.appendFileSync && + iso.readFileSync && + iso.unlinkSync && + iso.openFile + ); +} export interface CachedSpan { input?: unknown; @@ -34,19 +52,23 @@ interface DiskSpanRecord { * * This cache writes spans to a temporary file to minimize memory usage. * It uses append-only writes and reads the full file when querying. + * + * In browser environments, this cache is automatically disabled and + * all operations become no-ops. */ export class SpanCache { private cacheFilePath: string | null = null; - private fileHandle: fs.promises.FileHandle | null = null; + private fileHandle: any | null = null; // type-erased fs.promises.FileHandle private initialized = false; private initPromise: Promise | null = null; - private _disabled: boolean; + private _explicitlyDisabled: boolean; // Small in-memory index tracking which rootSpanIds have data private rootSpanIndex: Set = new Set(); constructor(options?: { disabled?: boolean }) { - this._disabled = options?.disabled ?? false; + // Only track explicit disable from constructor - platform check is done at runtime + this._explicitlyDisabled = options?.disabled ?? false; // Initialization is lazy - file is created on first write } @@ -55,14 +77,18 @@ export class SpanCache { * initFunction is used, since remote function spans won't be in the cache. 
*/ disable(): void { - this._disabled = true; + this._explicitlyDisabled = true; } get disabled(): boolean { - return this._disabled; + return this._explicitlyDisabled || !canUseSpanCache(); } private async ensureInitialized(): Promise { + if (this.disabled) { + return; + } + if (this.initialized) { return; } @@ -72,15 +98,15 @@ export class SpanCache { } this.initPromise = (async () => { - const tmpDir = os.tmpdir(); + const tmpDir = iso.tmpdir!(); const uniqueId = `${Date.now()}-${Math.random().toString(36).slice(2)}`; - this.cacheFilePath = path.join( + this.cacheFilePath = iso.pathJoin!( tmpDir, `braintrust-span-cache-${uniqueId}.jsonl`, ); // Open file for append+read - this.fileHandle = await fs.promises.open(this.cacheFilePath, "a+"); + this.fileHandle = await iso.openFile!(this.cacheFilePath, "a+"); this.initialized = true; })(); @@ -99,6 +125,10 @@ export class SpanCache { spanId: string, data: CachedSpan, ): Promise { + if (this.disabled) { + return; + } + await this.ensureInitialized(); const record: DiskSpanRecord = { rootSpanId, spanId, data }; @@ -119,21 +149,21 @@ export class SpanCache { // Lazy init - create file synchronously if needed if (!this.initialized) { - const tmpDir = os.tmpdir(); + const tmpDir = iso.tmpdir!(); const uniqueId = `${Date.now()}-${Math.random().toString(36).slice(2)}`; - this.cacheFilePath = path.join( + this.cacheFilePath = iso.pathJoin!( tmpDir, `braintrust-span-cache-${uniqueId}.jsonl`, ); // Touch the file - fs.writeFileSync(this.cacheFilePath, ""); + iso.writeFileSync!(this.cacheFilePath, ""); this.initialized = true; } const record: DiskSpanRecord = { rootSpanId, spanId, data }; const line = JSON.stringify(record) + "\n"; - fs.appendFileSync(this.cacheFilePath!, line, "utf8"); + iso.appendFileSync!(this.cacheFilePath!, line); this.rootSpanIndex.add(rootSpanId); } @@ -160,7 +190,7 @@ export class SpanCache { } try { - const content = fs.readFileSync(this.cacheFilePath, "utf8"); + const content = 
iso.readFileSync!(this.cacheFilePath, "utf8"); const lines = content.trim().split("\n").filter(Boolean); // Accumulate spans by spanId, merging updates @@ -201,6 +231,9 @@ export class SpanCache { * Check if a rootSpanId has cached data. */ has(rootSpanId: string): boolean { + if (this.disabled) { + return false; + } return this.rootSpanIndex.has(rootSpanId); } @@ -237,9 +270,9 @@ export class SpanCache { this.fileHandle = null; } - if (this.cacheFilePath) { + if (this.cacheFilePath && canUseSpanCache()) { try { - fs.unlinkSync(this.cacheFilePath); + iso.unlinkSync!(this.cacheFilePath); } catch { // Ignore cleanup errors } diff --git a/js/src/trace.ts b/js/src/trace.ts index 00a503388..ada3cc1fb 100644 --- a/js/src/trace.ts +++ b/js/src/trace.ts @@ -1,5 +1,5 @@ import { _internalGetGlobalState } from "./logger"; -import { createHash } from "node:crypto"; +import iso from "./isomorph"; const MAX_FETCH_RETRIES = 8; const INITIAL_RETRY_DELAY_MS = 250; @@ -25,7 +25,13 @@ const getMessageHash = ( hashCache: Map, ): string => { const messageString = JSON.stringify(message); - const hashString = createHash("md5").update(messageString).digest("hex"); + + // In browser without hash support, return unique string to force cache miss + if (!iso.hash) { + return `no-hash-${Date.now()}-${Math.random()}`; + } + + const hashString = iso.hash(messageString); // Cache the result hashCache.set(messageString, hashString); From a34741d94c6e3099bc7c56425a968c66f98354a2 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Wed, 17 Dec 2025 14:49:17 -0800 Subject: [PATCH 21/65] don't pass trace to scoring args --- js/src/framework.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/js/src/framework.ts b/js/src/framework.ts index 982b7998b..ab68956bf 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -1123,12 +1123,15 @@ async function runEvaluatorInternal( return results; }; + // Exclude trace from logged input since it contains internal state + // that shouldn't be 
serialized (spansFlushPromise, spansFlushed, etc.) + const { trace: _trace, ...scoringArgsForLogging } = scoringArgs; const results = await rootSpan.traced(runScorer, { name: scorerNames[score_idx], spanAttributes: { type: SpanTypeAttribute.SCORE, }, - event: { input: scoringArgs }, + event: { input: scoringArgsForLogging }, }); return { kind: "score", value: results } as const; } catch (e) { From b1dc35026354434aee600312ac6d4705ccfcd0b1 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Fri, 26 Dec 2025 09:48:16 -0800 Subject: [PATCH 22/65] pass state into the trace object --- js/src/framework.ts | 7 ++++++- js/src/trace.ts | 11 ++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/js/src/framework.ts b/js/src/framework.ts index ab68956bf..737f2783f 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -955,6 +955,7 @@ async function runEvaluatorInternal( }; const callback = async (rootSpan: Span) => { + const state = _internalGetGlobalState(); const ensureSpansFlushed = async () => { // Flush native Braintrust spans if (experiment) { @@ -966,18 +967,22 @@ async function runEvaluatorInternal( } // Also flush OTEL spans if registered - const state = _internalGetGlobalState(); if (state) { await state.flushOtel(); } }; + if (!state) { + throw new Error("BraintrustState not initialized"); + } + const trace = new Trace({ experimentId: experimentIdPromise ? await experimentIdPromise : undefined, rootSpanId: rootSpan.rootSpanId, ensureSpansFlushed, + state, }); let metadata: Record = { ...("metadata" in datum ? 
datum.metadata : {}), diff --git a/js/src/trace.ts b/js/src/trace.ts index ada3cc1fb..bb19a4996 100644 --- a/js/src/trace.ts +++ b/js/src/trace.ts @@ -1,4 +1,4 @@ -import { _internalGetGlobalState } from "./logger"; +import type { BraintrustState } from "./logger"; import iso from "./isomorph"; const MAX_FETCH_RETRIES = 8; @@ -14,6 +14,7 @@ export interface TraceOptions { logsId?: string; rootSpanId: string; ensureSpansFlushed?: () => Promise; + state: BraintrustState; } function isObject(value: any): value is Record { @@ -49,6 +50,7 @@ export class Trace { private readonly logsId?: string; private readonly rootSpanId: string; private readonly ensureSpansFlushed?: () => Promise; + private readonly state: BraintrustState; private spansFlushed = false; private spansFlushPromise: Promise | null = null; @@ -57,11 +59,13 @@ export class Trace { logsId, rootSpanId, ensureSpansFlushed, + state, }: TraceOptions) { this.experimentId = experimentId; this.logsId = logsId; this.rootSpanId = rootSpanId; this.ensureSpansFlushed = ensureSpansFlushed; + this.state = state; } getConfiguration() { @@ -84,10 +88,7 @@ export class Trace { return []; } - const state = _internalGetGlobalState(); - if (!state) { - return []; - } + const state = this.state; // Try local cache first const cachedSpans = state.spanCache.getByRootSpanId(this.rootSpanId); From f67aba52ea9d104fb4e5653b7096bc4c132bd37b Mon Sep 17 00:00:00 2001 From: Alex Z Date: Fri, 26 Dec 2025 14:03:49 -0800 Subject: [PATCH 23/65] get passed in state --- js/src/framework.ts | 6 ++--- js/src/trace.ts | 53 --------------------------------------------- 2 files changed, 3 insertions(+), 56 deletions(-) diff --git a/js/src/framework.ts b/js/src/framework.ts index 737f2783f..6e5356e1a 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -955,13 +955,13 @@ async function runEvaluatorInternal( }; const callback = async (rootSpan: Span) => { - const state = _internalGetGlobalState(); + const state = evaluator.state; const 
ensureSpansFlushed = async () => { // Flush native Braintrust spans if (experiment) { await flush({ state: experiment.loggingState }); - } else if (evaluator.state) { - await flush({ state: evaluator.state }); + } else if (state) { + await flush({ state }); } else { await flush(); } diff --git a/js/src/trace.ts b/js/src/trace.ts index bb19a4996..fa842a365 100644 --- a/js/src/trace.ts +++ b/js/src/trace.ts @@ -166,59 +166,6 @@ export class Trace { return []; } - /** - * Fetch the thread of messages for this trace. - * - * @experimental This method is experimental and may change in the future. - */ - async getThread() { - const spans = await this.getSpans({ spanType: ["llm"] }); - const hashCache = new Map(); - const messages: any[] = []; - const hashes = new Set(); - const addMessage = ( - rawMessage: any, - { skipDedupe = false }: { skipDedupe?: boolean } = {}, - ) => { - if (!isObject(rawMessage)) { - return; - } - const message = { ...rawMessage }; - const messageHash = getMessageHash(message, hashCache); - if (!skipDedupe && hashes.has(messageHash)) { - return; - } - messages.push(message); - hashes.add(messageHash); - }; - for (const span of spans) { - if (span.input instanceof Array) { - for (const message of span.input) { - addMessage(message); - } - } else if (isObject(span.input)) { - addMessage(span.input); - } else if (typeof span.input === "string") { - addMessage({ role: "user", content: span.input }); - } - - // Always include outputs - if (span.output instanceof Array) { - for (const message of span.output) { - addMessage(message, { skipDedupe: true }); - } - } else if (isObject(span.output)) { - addMessage(span.output, { skipDedupe: true }); - } else if (typeof span.output === "string") { - addMessage( - { role: "assistant", content: span.output }, - { skipDedupe: true }, - ); - } - } - return messages; - } - private async ensureSpansReady() { if (this.spansFlushed || !this.ensureSpansFlushed) { return; From dcc7dd725d4c688e5fd28053636226ce7f339a30 
Mon Sep 17 00:00:00 2001 From: Alex Z Date: Fri, 26 Dec 2025 14:21:04 -0800 Subject: [PATCH 24/65] make cache writes not block --- js/src/framework.ts | 2 +- js/src/span-cache.ts | 170 ++++++++++++++++++++++++++----------------- 2 files changed, 106 insertions(+), 66 deletions(-) diff --git a/js/src/framework.ts b/js/src/framework.ts index 6e5356e1a..d896a4b1f 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -705,7 +705,7 @@ export async function Eval< } finally { progressReporter.stop(); // Clean up disk-based span cache after eval completes - _internalGetGlobalState()?.spanCache?.dispose(); + evaluator.state?.spanCache?.dispose(); } } diff --git a/js/src/span-cache.ts b/js/src/span-cache.ts index b8c356584..825bc9f25 100644 --- a/js/src/span-cache.ts +++ b/js/src/span-cache.ts @@ -113,58 +113,81 @@ export class SpanCache { return this.initPromise; } + // Buffer for pending writes - flushed asynchronously + private writeBuffer: DiskSpanRecord[] = []; + private flushScheduled = false; + private flushPromise: Promise | null = null; + /** - * Write or update a span in the cache. - * - * @param rootSpanId The root span ID that groups this span - * @param spanId The unique ID of this span - * @param data The span data to cache + * Queue a span write for async flushing. + * This is non-blocking - writes are buffered in memory and flushed + * to disk on the next microtask. 
*/ - async write( - rootSpanId: string, - spanId: string, - data: CachedSpan, - ): Promise { + queueWrite(rootSpanId: string, spanId: string, data: CachedSpan): void { if (this.disabled) { return; } - await this.ensureInitialized(); - const record: DiskSpanRecord = { rootSpanId, spanId, data }; - const line = JSON.stringify(record) + "\n"; - - await this.fileHandle!.appendFile(line, "utf8"); + this.writeBuffer.push(record); this.rootSpanIndex.add(rootSpanId); + + // Schedule async flush if not already scheduled + if (!this.flushScheduled) { + this.flushScheduled = true; + this.flushPromise = this.flushWriteBuffer(); + } } /** - * Synchronous write - fire and forget. - * Uses sync file operations to avoid blocking the caller. + * Flush the write buffer to disk asynchronously. + * Called automatically after queueWrite, but can also be called explicitly. */ - writeSync(rootSpanId: string, spanId: string, data: CachedSpan): void { - if (this.disabled) { + async flushWriteBuffer(): Promise { + // Take a snapshot of records to flush, but DON'T clear the buffer yet. + // Records stay in writeBuffer until disk write succeeds so getByRootSpanId can find them. 
+ const recordsToFlush = [...this.writeBuffer]; + this.flushScheduled = false; + + if (recordsToFlush.length === 0) { return; } - // Lazy init - create file synchronously if needed - if (!this.initialized) { - const tmpDir = iso.tmpdir!(); - const uniqueId = `${Date.now()}-${Math.random().toString(36).slice(2)}`; - this.cacheFilePath = iso.pathJoin!( - tmpDir, - `braintrust-span-cache-${uniqueId}.jsonl`, - ); - // Touch the file - iso.writeFileSync!(this.cacheFilePath, ""); - this.initialized = true; + await this.ensureInitialized(); + + if (!this.fileHandle) { + return; } - const record: DiskSpanRecord = { rootSpanId, spanId, data }; - const line = JSON.stringify(record) + "\n"; + const lines = recordsToFlush.map((r) => JSON.stringify(r) + "\n").join(""); + await this.fileHandle.appendFile(lines, "utf8"); - iso.appendFileSync!(this.cacheFilePath!, line); - this.rootSpanIndex.add(rootSpanId); + // Only now remove the flushed records from the buffer. + // Filter out the records we just wrote (compare by reference). + this.writeBuffer = this.writeBuffer.filter( + (r) => !recordsToFlush.includes(r), + ); + } + + /** + * Wait for any pending writes to complete. + * Call this before reading from the cache to ensure consistency. + */ + async waitForPendingWrites(): Promise { + if (this.flushPromise) { + await this.flushPromise; + this.flushPromise = null; + } + } + + /** + * @deprecated Use queueWrite instead - writeSync blocks the event loop. + * Synchronous write - fire and forget. + * Uses sync file operations to avoid blocking the caller. 
+ */ + writeSync(rootSpanId: string, spanId: string, data: CachedSpan): void { + // Delegate to the non-blocking version + this.queueWrite(rootSpanId, spanId, data); } /** @@ -180,51 +203,63 @@ export class SpanCache { return undefined; } - if (!this.initialized || !this.cacheFilePath) { - return undefined; - } - // Quick check using in-memory index if (!this.rootSpanIndex.has(rootSpanId)) { return undefined; } - try { - const content = iso.readFileSync!(this.cacheFilePath, "utf8"); - const lines = content.trim().split("\n").filter(Boolean); - - // Accumulate spans by spanId, merging updates - const spanMap = new Map(); + // Accumulate spans by spanId, merging updates + const spanMap = new Map(); - for (const line of lines) { - try { - const record = JSON.parse(line) as DiskSpanRecord; - if (record.rootSpanId !== rootSpanId) { - continue; - } - - const existing = spanMap.get(record.spanId); - if (existing) { - spanMap.set( - record.spanId, - this.mergeSpanData(existing, record.data), - ); - } else { - spanMap.set(record.spanId, record.data); + // First, read from disk if initialized + if (this.initialized && this.cacheFilePath) { + try { + const content = iso.readFileSync!(this.cacheFilePath, "utf8"); + const lines = content.trim().split("\n").filter(Boolean); + + for (const line of lines) { + try { + const record = JSON.parse(line) as DiskSpanRecord; + if (record.rootSpanId !== rootSpanId) { + continue; + } + + const existing = spanMap.get(record.spanId); + if (existing) { + spanMap.set( + record.spanId, + this.mergeSpanData(existing, record.data), + ); + } else { + spanMap.set(record.spanId, record.data); + } + } catch { + // Skip malformed lines } - } catch { - // Skip malformed lines } + } catch { + // Continue to check buffer even if disk read fails } + } - if (spanMap.size === 0) { - return undefined; + // Also check the in-memory write buffer for unflushed data + for (const record of this.writeBuffer) { + if (record.rootSpanId !== rootSpanId) { + continue; + 
} + const existing = spanMap.get(record.spanId); + if (existing) { + spanMap.set(record.spanId, this.mergeSpanData(existing, record.data)); + } else { + spanMap.set(record.spanId, record.data); } + } - return Array.from(spanMap.values()); - } catch { + if (spanMap.size === 0) { return undefined; } + + return Array.from(spanMap.values()); } /** @@ -265,6 +300,11 @@ export class SpanCache { * Clean up the cache file. Call this when the eval is complete. */ dispose(): void { + // Clear pending writes + this.writeBuffer = []; + this.flushScheduled = false; + this.flushPromise = null; + if (this.fileHandle) { this.fileHandle.close().catch(() => {}); this.fileHandle = null; From 7cf3a5a0e3761870808850da3c16d900975cb7e5 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Fri, 26 Dec 2025 14:26:01 -0800 Subject: [PATCH 25/65] remove trace re-export --- js/src/framework.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/js/src/framework.ts b/js/src/framework.ts index d896a4b1f..604fa7ee9 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -11,7 +11,6 @@ import iso from "./isomorph"; import { GenericFunction } from "./framework-types"; import { CodeFunction, CodePrompt } from "./framework2"; import { Trace } from "./trace"; -export { Trace } from "./trace"; import { BaseMetadata, BraintrustState, From 0300f0778820b03f06de773300512eb07c62ceee Mon Sep 17 00:00:00 2001 From: Alex Z Date: Fri, 26 Dec 2025 14:34:35 -0800 Subject: [PATCH 26/65] fix test --- js/src/framework.ts | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/js/src/framework.ts b/js/src/framework.ts index 604fa7ee9..724dad1eb 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -971,18 +971,16 @@ async function runEvaluatorInternal( } }; - if (!state) { - throw new Error("BraintrustState not initialized"); - } - - const trace = new Trace({ - experimentId: experimentIdPromise - ? 
await experimentIdPromise - : undefined, - rootSpanId: rootSpan.rootSpanId, - ensureSpansFlushed, - state, - }); + const trace = state + ? new Trace({ + experimentId: experimentIdPromise + ? await experimentIdPromise + : undefined, + rootSpanId: rootSpan.rootSpanId, + ensureSpansFlushed, + state, + }) + : undefined; let metadata: Record = { ...("metadata" in datum ? datum.metadata : {}), }; From 469fa205b54082af3daed4d7a7e6239bedef1d9b Mon Sep 17 00:00:00 2001 From: Alex Z Date: Fri, 26 Dec 2025 15:19:21 -0800 Subject: [PATCH 27/65] extend object fetcher --- js/src/logger.ts | 4 +- js/src/trace.ts | 150 ++++++++++++++++++++++++----------------------- 2 files changed, 79 insertions(+), 75 deletions(-) diff --git a/js/src/logger.ts b/js/src/logger.ts index 0a406d32e..dcf820ec8 100644 --- a/js/src/logger.ts +++ b/js/src/logger.ts @@ -4833,9 +4833,9 @@ export type WithTransactionId = R & { }; export const DEFAULT_FETCH_BATCH_SIZE = 1000; -const MAX_BTQL_ITERATIONS = 10000; +export const MAX_BTQL_ITERATIONS = 10000; -class ObjectFetcher +export class ObjectFetcher implements AsyncIterable> { private _fetchedData: WithTransactionId[] | undefined = undefined; diff --git a/js/src/trace.ts b/js/src/trace.ts index fa842a365..e3135ffab 100644 --- a/js/src/trace.ts +++ b/js/src/trace.ts @@ -1,13 +1,4 @@ -import type { BraintrustState } from "./logger"; -import iso from "./isomorph"; - -const MAX_FETCH_RETRIES = 8; -const INITIAL_RETRY_DELAY_MS = 250; - -const sleep = (ms: number) => - new Promise((resolve) => { - setTimeout(resolve, ms); - }); +import { BraintrustState, ObjectFetcher, WithTransactionId } from "./logger"; export interface TraceOptions { experimentId?: string; @@ -17,27 +8,67 @@ export interface TraceOptions { state: BraintrustState; } -function isObject(value: any): value is Record { - return value !== null && typeof value === "object" && !Array.isArray(value); -} +// eslint-disable-next-line @typescript-eslint/no-explicit-any +type SpanRecord = any; -const 
getMessageHash = ( - message: any, - hashCache: Map, -): string => { - const messageString = JSON.stringify(message); +/** + * Internal fetcher for spans by root_span_id, using the ObjectFetcher pattern. + */ +class SpanFetcher extends ObjectFetcher { + constructor( + private readonly experimentId: string, + private readonly rootSpanId: string, + private readonly _state: BraintrustState, + private readonly spanTypeFilter?: string[], + ) { + // Build the filter expression for root_span_id and optionally span_attributes.type + const filterExpr = SpanFetcher.buildFilter(rootSpanId, spanTypeFilter); + + super("experiment", undefined, undefined, { + filter: filterExpr, + order_by: [{ expr: { op: "ident", name: ["_xact_id"] }, asc: true }], + }); + } - // In browser without hash support, return unique string to force cache miss - if (!iso.hash) { - return `no-hash-${Date.now()}-${Math.random()}`; + private static buildFilter( + rootSpanId: string, + spanTypeFilter?: string[], + ): Record { + // Base filter: root_span_id = 'value' + const rootSpanFilter = { + op: "eq", + left: { op: "ident", name: ["root_span_id"] }, + right: { op: "literal", value: rootSpanId }, + }; + + // If no spanType filter, just return root_span_id filter + if (!spanTypeFilter || spanTypeFilter.length === 0) { + return rootSpanFilter; + } + + // Add span_attributes.type IN [...] 
filter + const spanTypeInFilter = { + op: "in", + left: { op: "ident", name: ["span_attributes", "type"] }, + right: spanTypeFilter.map((t) => ({ op: "literal", value: t })), + }; + + // Combine with AND + return { + op: "and", + left: rootSpanFilter, + right: spanTypeInFilter, + }; } - const hashString = iso.hash(messageString); + public get id(): Promise { + return Promise.resolve(this.experimentId); + } - // Cache the result - hashCache.set(messageString, hashString); - return hashString; -}; + protected async getState(): Promise { + return this._state; + } +} /** * Carries identifying information about the evaluation so scorers can perform @@ -114,56 +145,29 @@ export class Trace { })); } - // Cache miss - fall back to BTQL + // Cache miss - fall back to BTQL via ObjectFetcher pattern await this.ensureSpansReady(); - await state.login({}); - const query = ` - from: experiment('${this.experimentId}') - | filter: root_span_id = '${this.rootSpanId}' ${spanType ? `AND span_attributes.type IN ${JSON.stringify(spanType)}` : ""} - | select: * - | sort: _xact_id asc - `; - - for (let attempt = 0; attempt < MAX_FETCH_RETRIES; attempt++) { - const response = await state.apiConn().post( - "btql", - { - query, - use_columnstore: true, - brainstore_realtime: true, - }, - { headers: { "Accept-Encoding": "gzip" } }, - ); - - const payload = await response.json(); - const rows = payload?.data ?? 
[]; - const freshness = payload?.freshness_state; - const isFresh = - freshness?.last_processed_xact_id != null && - freshness?.last_processed_xact_id === - freshness?.last_considered_xact_id; - - if ((rows.length > 0 && isFresh) || attempt === MAX_FETCH_RETRIES - 1) { - return rows - .filter((row: any) => row.span_attributes?.type !== "score") - .map((row: any) => ({ - input: row.input, - output: row.output, - metadata: row.metadata, - span_id: row.span_id, - span_parents: row.span_parents, - span_attributes: row.span_attributes, - })); - } - - const backoff = - INITIAL_RETRY_DELAY_MS * Math.pow(2, Math.min(attempt, 3)); - await sleep(backoff); - } - - return []; + const fetcher = new SpanFetcher( + this.experimentId, + this.rootSpanId, + state, + spanType, + ); + + const rows: WithTransactionId[] = await fetcher.fetchedData(); + + return rows + .filter((row) => row.span_attributes?.type !== "score") + .map((row) => ({ + input: row.input, + output: row.output, + metadata: row.metadata, + span_id: row.span_id, + span_parents: row.span_parents, + span_attributes: row.span_attributes, + })); } private async ensureSpansReady() { From 5295cd4f6b854a3eda3faae1205ea5ff812232d1 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Fri, 26 Dec 2025 15:49:34 -0800 Subject: [PATCH 28/65] refactors for objects --- js/src/framework.ts | 7 ++++--- js/src/logger.ts | 2 +- js/src/trace.ts | 38 +++++++++++++++++--------------------- 3 files changed, 22 insertions(+), 25 deletions(-) diff --git a/js/src/framework.ts b/js/src/framework.ts index 724dad1eb..d865ceba8 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -973,9 +973,10 @@ async function runEvaluatorInternal( const trace = state ? new Trace({ - experimentId: experimentIdPromise - ? await experimentIdPromise - : undefined, + objectType: "experiment", + objectId: experimentIdPromise + ? (await experimentIdPromise) ?? 
"" + : "", rootSpanId: rootSpan.rootSpanId, ensureSpansFlushed, state, diff --git a/js/src/logger.ts b/js/src/logger.ts index dcf820ec8..3adb557e4 100644 --- a/js/src/logger.ts +++ b/js/src/logger.ts @@ -4841,7 +4841,7 @@ export class ObjectFetcher private _fetchedData: WithTransactionId[] | undefined = undefined; constructor( - private objectType: "dataset" | "experiment", + private objectType: "dataset" | "experiment" | "project_logs", private pinnedVersion: string | undefined, // eslint-disable-next-line @typescript-eslint/no-explicit-any private mutateRecord?: (r: any) => WithTransactionId, diff --git a/js/src/trace.ts b/js/src/trace.ts index e3135ffab..800481855 100644 --- a/js/src/trace.ts +++ b/js/src/trace.ts @@ -1,8 +1,8 @@ import { BraintrustState, ObjectFetcher, WithTransactionId } from "./logger"; export interface TraceOptions { - experimentId?: string; - logsId?: string; + objectType: "experiment" | "project_logs"; + objectId: string; rootSpanId: string; ensureSpansFlushed?: () => Promise; state: BraintrustState; @@ -16,7 +16,8 @@ type SpanRecord = any; */ class SpanFetcher extends ObjectFetcher { constructor( - private readonly experimentId: string, + objectType: "experiment" | "project_logs", + private readonly _objectId: string, private readonly rootSpanId: string, private readonly _state: BraintrustState, private readonly spanTypeFilter?: string[], @@ -24,7 +25,7 @@ class SpanFetcher extends ObjectFetcher { // Build the filter expression for root_span_id and optionally span_attributes.type const filterExpr = SpanFetcher.buildFilter(rootSpanId, spanTypeFilter); - super("experiment", undefined, undefined, { + super(objectType, undefined, undefined, { filter: filterExpr, order_by: [{ expr: { op: "ident", name: ["_xact_id"] }, asc: true }], }); @@ -62,7 +63,7 @@ class SpanFetcher extends ObjectFetcher { } public get id(): Promise { - return Promise.resolve(this.experimentId); + return Promise.resolve(this._objectId); } protected async getState(): 
Promise { @@ -77,8 +78,8 @@ class SpanFetcher extends ObjectFetcher { */ export class Trace { // Store values privately so future helper methods can expose them safely. - private readonly experimentId?: string; - private readonly logsId?: string; + private readonly objectType: "experiment" | "project_logs"; + private readonly objectId: string; private readonly rootSpanId: string; private readonly ensureSpansFlushed?: () => Promise; private readonly state: BraintrustState; @@ -86,14 +87,14 @@ export class Trace { private spansFlushPromise: Promise | null = null; constructor({ - experimentId, - logsId, + objectType, + objectId, rootSpanId, ensureSpansFlushed, state, }: TraceOptions) { - this.experimentId = experimentId; - this.logsId = logsId; + this.objectType = objectType; + this.objectId = objectId; this.rootSpanId = rootSpanId; this.ensureSpansFlushed = ensureSpansFlushed; this.state = state; @@ -101,24 +102,18 @@ export class Trace { getConfiguration() { return { - experimentId: this.experimentId, - logsId: this.logsId, + objectType: this.objectType, + objectId: this.objectId, rootSpanId: this.rootSpanId, }; } /** - * Fetch all rows for this root span from its parent experiment. - * Returns an empty array when no experiment is associated with the context. - * + * Fetch all rows for this root span from its parent object (experiment or project logs). * First checks the local span cache for recently logged spans, then falls * back to BTQL API if not found in cache. 
*/ async getSpans({ spanType }: { spanType?: string[] } = {}): Promise { - if (!this.experimentId) { - return []; - } - const state = this.state; // Try local cache first @@ -150,7 +145,8 @@ export class Trace { await state.login({}); const fetcher = new SpanFetcher( - this.experimentId, + this.objectType, + this.objectId, this.rootSpanId, state, spanType, From b5b35117090ed9b60e33236f380728eca2181a31 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Mon, 29 Dec 2025 10:22:03 -0800 Subject: [PATCH 29/65] state argument for init function --- js/src/functions/invoke.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/js/src/functions/invoke.ts b/js/src/functions/invoke.ts index 1814ff0a9..bcc082dcf 100644 --- a/js/src/functions/invoke.ts +++ b/js/src/functions/invoke.ts @@ -250,13 +250,16 @@ export function initFunction({ projectName, slug, version, + state, }: { projectName: string; slug: string; version?: string; + state?: BraintrustState; }) { // Disable span cache since remote function spans won't be in the local cache - _internalGetGlobalState()?.spanCache?.disable(); + const s = state ?? _internalGetGlobalState(); + s?.spanCache?.disable(); // eslint-disable-next-line @typescript-eslint/no-explicit-any const f = async (input: any): Promise => { From c31413898a4ff3467456fbd5a302cf8077694bb5 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Mon, 29 Dec 2025 10:22:51 -0800 Subject: [PATCH 30/65] forgot doc --- js/src/functions/invoke.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/js/src/functions/invoke.ts b/js/src/functions/invoke.ts index bcc082dcf..3880306d9 100644 --- a/js/src/functions/invoke.ts +++ b/js/src/functions/invoke.ts @@ -244,6 +244,7 @@ export async function invoke( * @param options.projectName The project name containing the function. * @param options.slug The slug of the function to invoke. * @param options.version Optional version of the function to use. Defaults to latest. 
+ * @param options.state Optional Braintrust state to use. * @returns A function that can be used as a task or scorer in Eval(). */ export function initFunction({ From 3ff1c5d1d9d27d0a754d58c4da379a8adf99c746 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Mon, 29 Dec 2025 12:22:03 -0800 Subject: [PATCH 31/65] evaluator doesn't always have state --- js/package.json | 1 - js/src/framework.ts | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/js/package.json b/js/package.json index 9681647f6..7e963b817 100644 --- a/js/package.json +++ b/js/package.json @@ -1,7 +1,6 @@ { "name": "braintrust", "version": "2.0.0", - "version": "1.1.1", "description": "SDK for integrating Braintrust", "repository": { "type": "git", diff --git a/js/src/framework.ts b/js/src/framework.ts index d865ceba8..ac075cadc 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -954,7 +954,7 @@ async function runEvaluatorInternal( }; const callback = async (rootSpan: Span) => { - const state = evaluator.state; + const state = evaluator.state ?? _internalGetGlobalState(); const ensureSpansFlushed = async () => { // Flush native Braintrust spans if (experiment) { @@ -970,6 +970,7 @@ async function runEvaluatorInternal( await state.flushOtel(); } }; + console.log("state", state); const trace = state ? new Trace({ From b252963bc6d444b54e6e572e8734e6755e4e80e8 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Mon, 29 Dec 2025 12:24:21 -0800 Subject: [PATCH 32/65] console --- js/src/framework.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/js/src/framework.ts b/js/src/framework.ts index ac075cadc..6629411a0 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -970,7 +970,6 @@ async function runEvaluatorInternal( await state.flushOtel(); } }; - console.log("state", state); const trace = state ? 
new Trace({ From 56c5bf9b4ebb0ac9956ce1dfd2e2f6e160947cd6 Mon Sep 17 00:00:00 2001 From: Ankur Goyal Date: Tue, 30 Dec 2025 23:37:13 -0800 Subject: [PATCH 33/65] export trace --- js/src/exports.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/js/src/exports.ts b/js/src/exports.ts index da13fd94f..e3cc62d93 100644 --- a/js/src/exports.ts +++ b/js/src/exports.ts @@ -193,6 +193,8 @@ export { toolFunctionDefinitionSchema, } from "./framework2"; +export { Trace } from "./trace"; + export type { ParentExperimentIds, ParentProjectLogIds, From 38dcd0091a4813a704e7b54daa6394969a299157 Mon Sep 17 00:00:00 2001 From: Ankur Goyal Date: Thu, 1 Jan 2026 10:04:39 -0800 Subject: [PATCH 34/65] convert Trace to be an interface --- js/src/exports.ts | 2 +- js/src/framework.ts | 4 ++-- js/src/trace.ts | 40 ++++++++++++++++++++++++++++++++++++---- 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/js/src/exports.ts b/js/src/exports.ts index e3cc62d93..2bb97e391 100644 --- a/js/src/exports.ts +++ b/js/src/exports.ts @@ -193,7 +193,7 @@ export { toolFunctionDefinitionSchema, } from "./framework2"; -export { Trace } from "./trace"; +export type { Trace, SpanData } from "./trace"; export type { ParentExperimentIds, diff --git a/js/src/framework.ts b/js/src/framework.ts index 6629411a0..d1e92086f 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -10,7 +10,7 @@ import { queue } from "async"; import iso from "./isomorph"; import { GenericFunction } from "./framework-types"; import { CodeFunction, CodePrompt } from "./framework2"; -import { Trace } from "./trace"; +import { Trace, LocalTrace } from "./trace"; import { BaseMetadata, BraintrustState, @@ -972,7 +972,7 @@ async function runEvaluatorInternal( }; const trace = state - ? new Trace({ + ? new LocalTrace({ objectType: "experiment", objectId: experimentIdPromise ? (await experimentIdPromise) ?? 
"" diff --git a/js/src/trace.ts b/js/src/trace.ts index 800481855..a2b61f61f 100644 --- a/js/src/trace.ts +++ b/js/src/trace.ts @@ -72,11 +72,41 @@ class SpanFetcher extends ObjectFetcher { } /** + * Span data returned by getSpans(). + */ +export interface SpanData { + input?: unknown; + output?: unknown; + metadata?: Record; + span_id?: string; + span_parents?: string[]; + span_attributes?: { + type?: string; + name?: string; + [key: string]: unknown; + }; + [key: string]: unknown; +} + +/** + * Interface for trace objects that can be used by scorers. + * Both the SDK's LocalTrace class and the API wrapper's WrapperTrace implement this. + */ +export interface Trace { + getConfiguration(): { + objectType: string; + objectId: string; + rootSpanId: string; + }; + getSpans(options?: { spanType?: string[] }): Promise; +} + +/** + * SDK implementation of Trace that uses local span cache and falls back to BTQL. * Carries identifying information about the evaluation so scorers can perform - * richer logging or side effects. Additional behavior will be layered on top - * of this skeleton class later. + * richer logging or side effects. */ -export class Trace { +export class LocalTrace implements Trace { // Store values privately so future helper methods can expose them safely. private readonly objectType: "experiment" | "project_logs"; private readonly objectId: string; @@ -113,7 +143,9 @@ export class Trace { * First checks the local span cache for recently logged spans, then falls * back to BTQL API if not found in cache. 
*/ - async getSpans({ spanType }: { spanType?: string[] } = {}): Promise { + async getSpans({ spanType }: { spanType?: string[] } = {}): Promise< + SpanData[] + > { const state = this.state; // Try local cache first From 087e2a2ac38bcdc8f1bbc236a8d89342dfa7a829 Mon Sep 17 00:00:00 2001 From: Ankur Goyal Date: Thu, 1 Jan 2026 10:33:40 -0800 Subject: [PATCH 35/65] export more stuff --- js/src/exports.ts | 2 ++ js/src/trace.ts | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/js/src/exports.ts b/js/src/exports.ts index 2bb97e391..b96024326 100644 --- a/js/src/exports.ts +++ b/js/src/exports.ts @@ -45,6 +45,7 @@ export { ContextManager, DEFAULT_FETCH_BATCH_SIZE, Dataset, + ObjectFetcher, ERR_PERMALINK, Experiment, ExternalAttachment, @@ -194,6 +195,7 @@ export { } from "./framework2"; export type { Trace, SpanData } from "./trace"; +export { SpanFetcher } from "./trace"; export type { ParentExperimentIds, diff --git a/js/src/trace.ts b/js/src/trace.ts index a2b61f61f..45185d40a 100644 --- a/js/src/trace.ts +++ b/js/src/trace.ts @@ -12,9 +12,10 @@ export interface TraceOptions { type SpanRecord = any; /** - * Internal fetcher for spans by root_span_id, using the ObjectFetcher pattern. + * Fetcher for spans by root_span_id, using the ObjectFetcher pattern. + * Handles pagination automatically via cursor-based iteration. 
*/ -class SpanFetcher extends ObjectFetcher { +export class SpanFetcher extends ObjectFetcher { constructor( objectType: "experiment" | "project_logs", private readonly _objectId: string, From ddccd089e254d273899a2d3d21d3430b5332262f Mon Sep 17 00:00:00 2001 From: Ankur Goyal Date: Fri, 2 Jan 2026 13:17:39 -0800 Subject: [PATCH 36/65] fix --- js/src/trace.ts | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/js/src/trace.ts b/js/src/trace.ts index 45185d40a..647911c9c 100644 --- a/js/src/trace.ts +++ b/js/src/trace.ts @@ -28,7 +28,6 @@ export class SpanFetcher extends ObjectFetcher { super(objectType, undefined, undefined, { filter: filterExpr, - order_by: [{ expr: { op: "ident", name: ["_xact_id"] }, asc: true }], }); } @@ -52,14 +51,13 @@ export class SpanFetcher extends ObjectFetcher { const spanTypeInFilter = { op: "in", left: { op: "ident", name: ["span_attributes", "type"] }, - right: spanTypeFilter.map((t) => ({ op: "literal", value: t })), + right: { op: "literal", value: spanTypeFilter }, }; // Combine with AND return { op: "and", - left: rootSpanFilter, - right: spanTypeInFilter, + children: [rootSpanFilter, spanTypeInFilter], }; } From 4c789a05e1c883783a89681e541d7cc6e0b3184a Mon Sep 17 00:00:00 2001 From: Ankur Goyal Date: Fri, 2 Jan 2026 17:09:28 -0800 Subject: [PATCH 37/65] use span_attributes.purpose --- js/src/trace.ts | 50 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/js/src/trace.ts b/js/src/trace.ts index 647911c9c..9e08b427a 100644 --- a/js/src/trace.ts +++ b/js/src/trace.ts @@ -1,3 +1,4 @@ +import { Expr } from "../btql/ast"; import { BraintrustState, ObjectFetcher, WithTransactionId } from "./logger"; export interface TraceOptions { @@ -35,29 +36,44 @@ export class SpanFetcher extends ObjectFetcher { rootSpanId: string, spanTypeFilter?: string[], ): Record { - // Base filter: root_span_id = 'value' - const rootSpanFilter = { - op: "eq", - left: { op:
"ident", name: ["root_span_id"] }, - right: { op: "literal", value: rootSpanId }, - }; + const children: Expr[] = [ + // Base filter: root_span_id = 'value' + { + op: "eq", + left: { op: "ident", name: ["root_span_id"] }, + right: { op: "literal", value: rootSpanId }, + }, + // Exclude span_attributes.purpose = 'score' + { + op: "or", + children: [ + { + op: "isnull", + expr: { op: "ident", name: ["span_attributes", "purpose"] }, + }, + { + op: "ne", + left: { op: "ident", name: ["span_attributes", "purpose"] }, + right: { op: "literal", value: "scorer" }, + }, + ], + }, + ]; // If no spanType filter, just return root_span_id filter - if (!spanTypeFilter || spanTypeFilter.length === 0) { - return rootSpanFilter; + if (spanTypeFilter && spanTypeFilter.length > 0) { + // Add span_attributes.type IN [...] filter + children.push({ + op: "in", + left: { op: "ident", name: ["span_attributes", "type"] }, + right: { op: "literal", value: spanTypeFilter }, + }); } - // Add span_attributes.type IN [...] 
filter - const spanTypeInFilter = { - op: "in", - left: { op: "ident", name: ["span_attributes", "type"] }, - right: { op: "literal", value: spanTypeFilter }, - }; - // Combine with AND return { op: "and", - children: [rootSpanFilter, spanTypeInFilter], + children, }; } @@ -186,7 +202,7 @@ export class LocalTrace implements Trace { const rows: WithTransactionId[] = await fetcher.fetchedData(); return rows - .filter((row) => row.span_attributes?.type !== "score") + .filter((row) => row.span_attributes?.purpose !== "scorer") .map((row) => ({ input: row.input, output: row.output, From ca8b861d46a5d76d50d705a7b23e0b2d0a3cd28e Mon Sep 17 00:00:00 2001 From: Ankur Goyal Date: Fri, 2 Jan 2026 17:15:55 -0800 Subject: [PATCH 38/65] fix --- js/src/trace.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/src/trace.ts b/js/src/trace.ts index 9e08b427a..e6be84613 100644 --- a/js/src/trace.ts +++ b/js/src/trace.ts @@ -167,7 +167,7 @@ export class LocalTrace implements Trace { const cachedSpans = state.spanCache.getByRootSpanId(this.rootSpanId); if (cachedSpans && cachedSpans.length > 0) { let spans = cachedSpans.filter( - (span) => span.span_attributes?.type !== "score", + (span) => span.span_attributes?.purpose !== "scorer", ); // Apply spanType filter if specified From c5e43fc067ba8001a7d33d8d53242467f5d4b3f9 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Fri, 9 Jan 2026 16:21:06 -0800 Subject: [PATCH 39/65] only turn cache on for evals --- js/src/framework.ts | 8 +++- js/src/functions/invoke.test.ts | 6 +-- js/src/span-cache.test.ts | 79 ++++++++++++++++++++++++++++++++- js/src/span-cache.ts | 29 +++++++++++- 4 files changed, 114 insertions(+), 8 deletions(-) diff --git a/js/src/framework.ts b/js/src/framework.ts index 4d8eaf8ba..16de6f050 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -617,6 +617,8 @@ export async function Eval< } const resolvedReporter = options.reporter || defaultReporter; + // Start span cache for this eval (it's disabled 
by default to avoid temp files outside of evals) + (evaluator.state ?? _internalGetGlobalState())?.spanCache?.start(); try { const { data, baseExperiment: defaultBaseExperiment } = callEvaluatorData( evaluator.data, @@ -708,8 +710,10 @@ export async function Eval< } } finally { progressReporter.stop(); - // Clean up disk-based span cache after eval completes - evaluator.state?.spanCache?.dispose(); + // Clean up disk-based span cache after eval completes and stop caching + const spanCache = (evaluator.state ?? _internalGetGlobalState())?.spanCache; + spanCache?.dispose(); + spanCache?.stop(); } } diff --git a/js/src/functions/invoke.test.ts b/js/src/functions/invoke.test.ts index 13ecb7e66..f887512ca 100644 --- a/js/src/functions/invoke.test.ts +++ b/js/src/functions/invoke.test.ts @@ -16,8 +16,8 @@ describe("initFunction", () => { test("should disable span cache when called", async () => { const state = _internalGetGlobalState(); - // Cache should not be disabled initially - expect(state.spanCache.disabled).toBe(false); + // Cache should be disabled by default (it's only enabled during evals) + expect(state.spanCache.disabled).toBe(true); // Call initFunction initFunction({ @@ -25,7 +25,7 @@ describe("initFunction", () => { slug: "test-function", }); - // Cache should now be disabled + // Cache should still be disabled (initFunction also explicitly disables it) expect(state.spanCache.disabled).toBe(true); }); diff --git a/js/src/span-cache.test.ts b/js/src/span-cache.test.ts index 96ff07ed8..82270eb61 100644 --- a/js/src/span-cache.test.ts +++ b/js/src/span-cache.test.ts @@ -18,6 +18,7 @@ describe("SpanCache (disk-based)", () => { beforeEach(() => { cache = new SpanCache(); + cache.start(); // Start for testing (cache is disabled by default) }); afterEach(() => { @@ -157,7 +158,7 @@ describe("SpanCache (disk-based)", () => { expect(cache.size).toBe(0); expect(cache.has("root-1")).toBe(false); - // Should be able to write again after dispose + // Should be able to 
write again after dispose (cache is still enabled) cache.writeSync("root-2", "span-2", { span_id: "span-2" }); expect(cache.size).toBe(1); }); @@ -186,9 +187,15 @@ describe("SpanCache (disk-based)", () => { }); test("disabled getter should reflect disabled state", () => { + // Cache is enabled in beforeEach, so starts as not disabled expect(cache.disabled).toBe(false); cache.disable(); expect(cache.disabled).toBe(true); + + // Creating a new cache without enable() should be disabled by default + const newCache = new SpanCache(); + expect(newCache.disabled).toBe(true); + newCache.dispose(); }); test("should be disabled from constructor option", () => { @@ -203,4 +210,74 @@ describe("SpanCache (disk-based)", () => { disabledCache.dispose(); }); }); + + describe("start/stop lifecycle", () => { + test("stop() allows start() to work again", () => { + const freshCache = new SpanCache(); + + // Initially disabled by default + expect(freshCache.disabled).toBe(true); + + // Start for first "eval" + freshCache.start(); + expect(freshCache.disabled).toBe(false); + freshCache.writeSync("root-1", "span-1", { span_id: "span-1" }); + expect(freshCache.size).toBe(1); + + // Stop after first "eval" (like calling stop() in finally block) + freshCache.dispose(); + freshCache.stop(); + expect(freshCache.disabled).toBe(true); + + // Start for second "eval" - should work! 
+ freshCache.start(); + expect(freshCache.disabled).toBe(false); + freshCache.writeSync("root-2", "span-2", { span_id: "span-2" }); + expect(freshCache.size).toBe(1); + + freshCache.dispose(); + }); + + test("disable() prevents start() from working", () => { + const freshCache = new SpanCache(); + + // Simulate OTEL/initFunction calling disable() + freshCache.disable(); + expect(freshCache.disabled).toBe(true); + + // start() should be a no-op after disable() + freshCache.start(); + expect(freshCache.disabled).toBe(true); + + // Writes should still be no-ops + freshCache.writeSync("root-1", "span-1", { span_id: "span-1" }); + expect(freshCache.size).toBe(0); + + freshCache.dispose(); + }); + + test("disable() during active cache prevents future start()", () => { + const freshCache = new SpanCache(); + + // Start for "eval" + freshCache.start(); + expect(freshCache.disabled).toBe(false); + freshCache.writeSync("root-1", "span-1", { span_id: "span-1" }); + expect(freshCache.size).toBe(1); + + // Simulate OTEL being registered mid-eval + freshCache.disable(); + expect(freshCache.disabled).toBe(true); + + // Stop after eval + freshCache.dispose(); + freshCache.stop(); + + // Future start() should be blocked because disable() was called + freshCache.start(); + expect(freshCache.disabled).toBe(true); + + freshCache.dispose(); + }); + }); }); diff --git a/js/src/span-cache.ts b/js/src/span-cache.ts index 825bc9f25..039dc80fc 100644 --- a/js/src/span-cache.ts +++ b/js/src/span-cache.ts @@ -61,14 +61,18 @@ export class SpanCache { private fileHandle: any | null = null; // type-erased fs.promises.FileHandle private initialized = false; private initPromise: Promise | null = null; + // Tracks whether the cache was explicitly disabled (via constructor or disable()) private _explicitlyDisabled: boolean; + // Tracks whether the cache has been enabled (for evals only) + private _enabled: boolean = false; // Small in-memory index tracking which rootSpanIds have data private 
rootSpanIndex: Set = new Set(); constructor(options?: { disabled?: boolean }) { - // Only track explicit disable from constructor - platform check is done at runtime + // Track if user explicitly disabled the cache this._explicitlyDisabled = options?.disabled ?? false; + // Cache is disabled by default until enable() is called (e.g., during Eval) // Initialization is lazy - file is created on first write } @@ -80,8 +84,29 @@ export class SpanCache { this._explicitlyDisabled = true; } + /** + * Start caching spans for use during evaluations. + * This only starts caching if the cache wasn't permanently disabled. + * Called by Eval() to turn on caching for the duration of the eval. + */ + start(): void { + if (!this._explicitlyDisabled) { + this._enabled = true; + } + } + + /** + * Stop caching spans and return to the default disabled state. + * Unlike disable(), this allows start() to work again for future evals. + * Called after an eval completes to return to the default state. + */ + stop(): void { + this._enabled = false; + } + get disabled(): boolean { - return this._explicitlyDisabled || !canUseSpanCache(); + // Disabled if: explicitly disabled, not enabled, or platform doesn't support it + return this._explicitlyDisabled || !this._enabled || !canUseSpanCache(); } private async ensureInitialized(): Promise { From 863b5e625cbdbec24f1d2f2e66e93ca9bfde326a Mon Sep 17 00:00:00 2001 From: Alex Z Date: Sun, 11 Jan 2026 23:22:48 -0800 Subject: [PATCH 40/65] get rid of syncwrite --- js/src/logger.ts | 6 ++++- js/src/span-cache.test.ts | 48 +++++++++++++++++++-------------------- js/src/span-cache.ts | 10 -------- 3 files changed, 29 insertions(+), 35 deletions(-) diff --git a/js/src/logger.ts b/js/src/logger.ts index 3d733c2bc..221c3c0d4 100644 --- a/js/src/logger.ts +++ b/js/src/logger.ts @@ -5714,7 +5714,11 @@ export class SpanImpl implements Span { span_attributes: partialRecord.span_attributes as CachedSpan["span_attributes"], }; - 
this._state.spanCache.writeSync(this._rootSpanId, this._spanId, cachedSpan); + this._state.spanCache.queueWrite( + this._rootSpanId, + this._spanId, + cachedSpan, + ); const computeRecord = async () => ({ ...partialRecord, diff --git a/js/src/span-cache.test.ts b/js/src/span-cache.test.ts index 82270eb61..16ac6682e 100644 --- a/js/src/span-cache.test.ts +++ b/js/src/span-cache.test.ts @@ -40,8 +40,8 @@ describe("SpanCache (disk-based)", () => { output: { response: "bar" }, }; - cache.writeSync(rootSpanId, span1.span_id, span1); - cache.writeSync(rootSpanId, span2.span_id, span2); + cache.queueWrite(rootSpanId, span1.span_id, span1); + cache.queueWrite(rootSpanId, span2.span_id, span2); const spans = cache.getByRootSpanId(rootSpanId); expect(spans).toHaveLength(2); @@ -58,12 +58,12 @@ describe("SpanCache (disk-based)", () => { const rootSpanId = "root-123"; const spanId = "span-1"; - cache.writeSync(rootSpanId, spanId, { + cache.queueWrite(rootSpanId, spanId, { span_id: spanId, input: { text: "hello" }, }); - cache.writeSync(rootSpanId, spanId, { + cache.queueWrite(rootSpanId, spanId, { span_id: spanId, output: { response: "world" }, }); @@ -81,12 +81,12 @@ describe("SpanCache (disk-based)", () => { const rootSpanId = "root-123"; const spanId = "span-1"; - cache.writeSync(rootSpanId, spanId, { + cache.queueWrite(rootSpanId, spanId, { span_id: spanId, metadata: { key1: "value1" }, }); - cache.writeSync(rootSpanId, spanId, { + cache.queueWrite(rootSpanId, spanId, { span_id: spanId, metadata: { key2: "value2" }, }); @@ -101,7 +101,7 @@ describe("SpanCache (disk-based)", () => { describe("has", () => { test("should return true when rootSpanId exists", () => { - cache.writeSync("root-123", "span-1", { span_id: "span-1" }); + cache.queueWrite("root-123", "span-1", { span_id: "span-1" }); expect(cache.has("root-123")).toBe(true); }); @@ -112,8 +112,8 @@ describe("SpanCache (disk-based)", () => { describe("clear", () => { test("should remove spans for a specific rootSpanId 
from index", () => { - cache.writeSync("root-1", "span-1", { span_id: "span-1" }); - cache.writeSync("root-2", "span-2", { span_id: "span-2" }); + cache.queueWrite("root-1", "span-1", { span_id: "span-1" }); + cache.queueWrite("root-2", "span-2", { span_id: "span-2" }); cache.clear("root-1"); @@ -124,8 +124,8 @@ describe("SpanCache (disk-based)", () => { describe("clearAll", () => { test("should remove all cached spans", () => { - cache.writeSync("root-1", "span-1", { span_id: "span-1" }); - cache.writeSync("root-2", "span-2", { span_id: "span-2" }); + cache.queueWrite("root-1", "span-1", { span_id: "span-1" }); + cache.queueWrite("root-2", "span-2", { span_id: "span-2" }); cache.clearAll(); @@ -137,20 +137,20 @@ describe("SpanCache (disk-based)", () => { test("should return the number of root spans tracked", () => { expect(cache.size).toBe(0); - cache.writeSync("root-1", "span-1", { span_id: "span-1" }); + cache.queueWrite("root-1", "span-1", { span_id: "span-1" }); expect(cache.size).toBe(1); - cache.writeSync("root-1", "span-2", { span_id: "span-2" }); // Same root + cache.queueWrite("root-1", "span-2", { span_id: "span-2" }); // Same root expect(cache.size).toBe(1); - cache.writeSync("root-2", "span-3", { span_id: "span-3" }); // Different root + cache.queueWrite("root-2", "span-3", { span_id: "span-3" }); // Different root expect(cache.size).toBe(2); }); }); describe("dispose", () => { test("should clean up and allow reuse", () => { - cache.writeSync("root-1", "span-1", { span_id: "span-1" }); + cache.queueWrite("root-1", "span-1", { span_id: "span-1" }); expect(cache.size).toBe(1); cache.dispose(); @@ -159,25 +159,25 @@ describe("SpanCache (disk-based)", () => { expect(cache.has("root-1")).toBe(false); // Should be able to write again after dispose (cache is still enabled) - cache.writeSync("root-2", "span-2", { span_id: "span-2" }); + cache.queueWrite("root-2", "span-2", { span_id: "span-2" }); expect(cache.size).toBe(1); }); }); describe("disable", () => { 
test("should prevent writes after disable() is called", () => { - cache.writeSync("root-1", "span-1", { span_id: "span-1" }); + cache.queueWrite("root-1", "span-1", { span_id: "span-1" }); expect(cache.size).toBe(1); cache.disable(); // Writes after disable should be no-ops - cache.writeSync("root-2", "span-2", { span_id: "span-2" }); + cache.queueWrite("root-2", "span-2", { span_id: "span-2" }); expect(cache.size).toBe(1); // Still 1, not 2 }); test("should return undefined from getByRootSpanId after disable()", () => { - cache.writeSync("root-1", "span-1", { span_id: "span-1" }); + cache.queueWrite("root-1", "span-1", { span_id: "span-1" }); expect(cache.getByRootSpanId("root-1")).toBeDefined(); cache.disable(); @@ -203,7 +203,7 @@ describe("SpanCache (disk-based)", () => { expect(disabledCache.disabled).toBe(true); // Writes should be no-ops - disabledCache.writeSync("root-1", "span-1", { span_id: "span-1" }); + disabledCache.queueWrite("root-1", "span-1", { span_id: "span-1" }); expect(disabledCache.size).toBe(0); expect(disabledCache.getByRootSpanId("root-1")).toBeUndefined(); @@ -221,7 +221,7 @@ describe("SpanCache (disk-based)", () => { // Start for first "eval" freshCache.start(); expect(freshCache.disabled).toBe(false); - freshCache.writeSync("root-1", "span-1", { span_id: "span-1" }); + freshCache.queueWrite("root-1", "span-1", { span_id: "span-1" }); expect(freshCache.size).toBe(1); // Stop after first "eval" (like calling stop() in finally block) @@ -232,7 +232,7 @@ describe("SpanCache (disk-based)", () => { // Start for second "eval" - should work! 
freshCache.start(); expect(freshCache.disabled).toBe(false); - freshCache.writeSync("root-2", "span-2", { span_id: "span-2" }); + freshCache.queueWrite("root-2", "span-2", { span_id: "span-2" }); expect(freshCache.size).toBe(1); freshCache.dispose(); @@ -250,7 +250,7 @@ describe("SpanCache (disk-based)", () => { expect(freshCache.disabled).toBe(true); // Writes should still be no-ops - freshCache.writeSync("root-1", "span-1", { span_id: "span-1" }); + freshCache.queueWrite("root-1", "span-1", { span_id: "span-1" }); expect(freshCache.size).toBe(0); freshCache.dispose(); @@ -262,7 +262,7 @@ describe("SpanCache (disk-based)", () => { // Start for "eval" freshCache.start(); expect(freshCache.disabled).toBe(false); - freshCache.writeSync("root-1", "span-1", { span_id: "span-1" }); + freshCache.queueWrite("root-1", "span-1", { span_id: "span-1" }); expect(freshCache.size).toBe(1); // Simulate OTEL being registered mid-eval diff --git a/js/src/span-cache.ts b/js/src/span-cache.ts index 039dc80fc..a643ece66 100644 --- a/js/src/span-cache.ts +++ b/js/src/span-cache.ts @@ -205,16 +205,6 @@ export class SpanCache { } } - /** - * @deprecated Use queueWrite instead - writeSync blocks the event loop. - * Synchronous write - fire and forget. - * Uses sync file operations to avoid blocking the caller. - */ - writeSync(rootSpanId: string, spanId: string, data: CachedSpan): void { - // Delegate to the non-blocking version - this.queueWrite(rootSpanId, spanId, data); - } - /** * Get all cached spans for a given rootSpanId. 
* From 7992b7414e9843acc6ddaa469cb520167a627b66 Mon Sep 17 00:00:00 2001 From: Ankur Goyal Date: Mon, 12 Jan 2026 21:21:04 -0800 Subject: [PATCH 41/65] factor out cached span fetcher --- js/src/exports.ts | 2 +- js/src/trace.test.ts | 226 +++++++++++++++++++++++++++++++++++++++++++ js/src/trace.ts | 166 +++++++++++++++++++++++++------ 3 files changed, 362 insertions(+), 32 deletions(-) create mode 100644 js/src/trace.test.ts diff --git a/js/src/exports.ts b/js/src/exports.ts index b96024326..389d0fab2 100644 --- a/js/src/exports.ts +++ b/js/src/exports.ts @@ -195,7 +195,7 @@ export { } from "./framework2"; export type { Trace, SpanData } from "./trace"; -export { SpanFetcher } from "./trace"; +export { SpanFetcher, CachedSpanFetcher } from "./trace"; export type { ParentExperimentIds, diff --git a/js/src/trace.test.ts b/js/src/trace.test.ts new file mode 100644 index 000000000..60bc8b9b0 --- /dev/null +++ b/js/src/trace.test.ts @@ -0,0 +1,226 @@ +import { describe, expect, test, vi, beforeEach } from "vitest"; +import { CachedSpanFetcher, SpanData, SpanFetchFn } from "./trace"; + +describe("CachedSpanFetcher", () => { + // Helper to create mock spans + const makeSpan = ( + spanId: string, + type: string, + extra: Partial = {}, + ): SpanData => ({ + span_id: spanId, + input: { text: `input-${spanId}` }, + output: { text: `output-${spanId}` }, + span_attributes: { type }, + ...extra, + }); + + describe("basic fetching", () => { + test("should fetch all spans when no filter specified", async () => { + const mockSpans = [ + makeSpan("span-1", "llm"), + makeSpan("span-2", "function"), + makeSpan("span-3", "llm"), + ]; + + const fetchFn = vi.fn().mockResolvedValue(mockSpans); + const fetcher = new CachedSpanFetcher(fetchFn); + + const result = await fetcher.getSpans(); + + expect(fetchFn).toHaveBeenCalledTimes(1); + expect(fetchFn).toHaveBeenCalledWith(undefined); + expect(result).toHaveLength(3); + // Order may differ since spans are grouped by type in cache + 
expect(result.map((s) => s.span_id).sort()).toEqual([ + "span-1", + "span-2", + "span-3", + ]); + }); + + test("should fetch specific span types when filter specified", async () => { + const llmSpans = [makeSpan("span-1", "llm"), makeSpan("span-2", "llm")]; + + const fetchFn = vi.fn().mockResolvedValue(llmSpans); + const fetcher = new CachedSpanFetcher(fetchFn); + + const result = await fetcher.getSpans({ spanType: ["llm"] }); + + expect(fetchFn).toHaveBeenCalledTimes(1); + expect(fetchFn).toHaveBeenCalledWith(["llm"]); + expect(result).toHaveLength(2); + }); + }); + + describe("caching behavior", () => { + test("should return cached spans without re-fetching after fetching all", async () => { + const mockSpans = [ + makeSpan("span-1", "llm"), + makeSpan("span-2", "function"), + ]; + + const fetchFn = vi.fn().mockResolvedValue(mockSpans); + const fetcher = new CachedSpanFetcher(fetchFn); + + // First call - fetches + await fetcher.getSpans(); + expect(fetchFn).toHaveBeenCalledTimes(1); + + // Second call - should use cache + const result = await fetcher.getSpans(); + expect(fetchFn).toHaveBeenCalledTimes(1); // Still 1 + expect(result).toHaveLength(2); + }); + + test("should return cached spans for previously fetched types", async () => { + const llmSpans = [makeSpan("span-1", "llm"), makeSpan("span-2", "llm")]; + + const fetchFn = vi.fn().mockResolvedValue(llmSpans); + const fetcher = new CachedSpanFetcher(fetchFn); + + // First call - fetches llm spans + await fetcher.getSpans({ spanType: ["llm"] }); + expect(fetchFn).toHaveBeenCalledTimes(1); + + // Second call for same type - should use cache + const result = await fetcher.getSpans({ spanType: ["llm"] }); + expect(fetchFn).toHaveBeenCalledTimes(1); // Still 1 + expect(result).toHaveLength(2); + }); + + test("should only fetch missing span types", async () => { + const llmSpans = [makeSpan("span-1", "llm")]; + const functionSpans = [makeSpan("span-2", "function")]; + + const fetchFn = vi + .fn() + 
.mockResolvedValueOnce(llmSpans) + .mockResolvedValueOnce(functionSpans); + + const fetcher = new CachedSpanFetcher(fetchFn); + + // First call - fetches llm spans + await fetcher.getSpans({ spanType: ["llm"] }); + expect(fetchFn).toHaveBeenCalledTimes(1); + expect(fetchFn).toHaveBeenLastCalledWith(["llm"]); + + // Second call for both types - should only fetch function + const result = await fetcher.getSpans({ spanType: ["llm", "function"] }); + expect(fetchFn).toHaveBeenCalledTimes(2); + expect(fetchFn).toHaveBeenLastCalledWith(["function"]); + expect(result).toHaveLength(2); + }); + + test("should not re-fetch after fetching all spans", async () => { + const allSpans = [ + makeSpan("span-1", "llm"), + makeSpan("span-2", "function"), + makeSpan("span-3", "tool"), + ]; + + const fetchFn = vi.fn().mockResolvedValue(allSpans); + const fetcher = new CachedSpanFetcher(fetchFn); + + // Fetch all spans + await fetcher.getSpans(); + expect(fetchFn).toHaveBeenCalledTimes(1); + + // Subsequent filtered calls should use cache + const llmResult = await fetcher.getSpans({ spanType: ["llm"] }); + expect(fetchFn).toHaveBeenCalledTimes(1); // Still 1 + expect(llmResult).toHaveLength(1); + expect(llmResult[0].span_id).toBe("span-1"); + + const functionResult = await fetcher.getSpans({ spanType: ["function"] }); + expect(fetchFn).toHaveBeenCalledTimes(1); // Still 1 + expect(functionResult).toHaveLength(1); + expect(functionResult[0].span_id).toBe("span-2"); + }); + }); + + describe("filtering from cache", () => { + test("should filter by multiple span types from cache", async () => { + const allSpans = [ + makeSpan("span-1", "llm"), + makeSpan("span-2", "function"), + makeSpan("span-3", "tool"), + makeSpan("span-4", "llm"), + ]; + + const fetchFn = vi.fn().mockResolvedValue(allSpans); + const fetcher = new CachedSpanFetcher(fetchFn); + + // Fetch all first + await fetcher.getSpans(); + + // Filter for llm and tool + const result = await fetcher.getSpans({ spanType: ["llm", 
"tool"] }); + expect(result).toHaveLength(3); + expect(result.map((s) => s.span_id).sort()).toEqual([ + "span-1", + "span-3", + "span-4", + ]); + }); + + test("should return empty array for non-existent span type", async () => { + const allSpans = [makeSpan("span-1", "llm")]; + + const fetchFn = vi.fn().mockResolvedValue(allSpans); + const fetcher = new CachedSpanFetcher(fetchFn); + + // Fetch all first + await fetcher.getSpans(); + + // Query for non-existent type + const result = await fetcher.getSpans({ spanType: ["nonexistent"] }); + expect(result).toHaveLength(0); + }); + + test("should handle spans with no type (empty string type)", async () => { + const spans = [ + makeSpan("span-1", "llm"), + { span_id: "span-2", input: {}, span_attributes: {} }, // No type + { span_id: "span-3", input: {} }, // No span_attributes + ]; + + const fetchFn = vi.fn().mockResolvedValue(spans); + const fetcher = new CachedSpanFetcher(fetchFn); + + // Fetch all + const result = await fetcher.getSpans(); + expect(result).toHaveLength(3); + + // Spans without type go into "" bucket + const noTypeResult = await fetcher.getSpans({ spanType: [""] }); + expect(noTypeResult).toHaveLength(2); + }); + }); + + describe("edge cases", () => { + test("should handle empty results", async () => { + const fetchFn = vi.fn().mockResolvedValue([]); + const fetcher = new CachedSpanFetcher(fetchFn); + + const result = await fetcher.getSpans(); + expect(result).toHaveLength(0); + expect(fetchFn).toHaveBeenCalledTimes(1); + + // Should still mark as fetched + await fetcher.getSpans({ spanType: ["llm"] }); + expect(fetchFn).toHaveBeenCalledTimes(1); // Cache hit + }); + + test("should handle empty spanType array same as undefined", async () => { + const mockSpans = [makeSpan("span-1", "llm")]; + const fetchFn = vi.fn().mockResolvedValue(mockSpans); + const fetcher = new CachedSpanFetcher(fetchFn); + + const result = await fetcher.getSpans({ spanType: [] }); + + 
expect(fetchFn).toHaveBeenCalledWith(undefined); + expect(result).toHaveLength(1); + }); + }); +}); diff --git a/js/src/trace.ts b/js/src/trace.ts index e6be84613..49c1794f2 100644 --- a/js/src/trace.ts +++ b/js/src/trace.ts @@ -103,6 +103,125 @@ export interface SpanData { [key: string]: unknown; } +/** Function signature for fetching spans by type */ +export type SpanFetchFn = ( + spanType: string[] | undefined, +) => Promise; + +/** + * Cached span fetcher that handles fetching and caching spans by type. + * + * Caching strategy: + * - Cache spans by span type (Map) + * - Track if all spans have been fetched (allFetched flag) + * - When filtering by spanType, only fetch types not already in cache + */ +export class CachedSpanFetcher { + private spanCache = new Map(); + private allFetched = false; + private fetchFn: SpanFetchFn; + + constructor( + objectType: "experiment" | "project_logs", + objectId: string, + rootSpanId: string, + getState: () => Promise, + ); + constructor(fetchFn: SpanFetchFn); + constructor( + objectTypeOrFetchFn: "experiment" | "project_logs" | SpanFetchFn, + objectId?: string, + rootSpanId?: string, + getState?: () => Promise, + ) { + if (typeof objectTypeOrFetchFn === "function") { + // Direct fetch function injection (for testing) + this.fetchFn = objectTypeOrFetchFn; + } else { + // Standard constructor with SpanFetcher + const objectType = objectTypeOrFetchFn; + this.fetchFn = async (spanType) => { + const state = await getState!(); + const fetcher = new SpanFetcher( + objectType, + objectId!, + rootSpanId!, + state, + spanType, + ); + const rows: WithTransactionId[] = + await fetcher.fetchedData(); + return rows + .filter((row) => row.span_attributes?.purpose !== "scorer") + .map((row) => ({ + input: row.input, + output: row.output, + metadata: row.metadata, + span_id: row.span_id, + span_parents: row.span_parents, + span_attributes: row.span_attributes, + id: row.id, + _xact_id: row._xact_id, + _pagination_key: row._pagination_key, + 
root_span_id: row.root_span_id, + })); + }; + } + } + + async getSpans({ spanType }: { spanType?: string[] } = {}): Promise< + SpanData[] + > { + // If we've fetched all spans, just filter from cache + if (this.allFetched) { + return this.getFromCache(spanType); + } + + // If no filter requested, fetch everything + if (!spanType || spanType.length === 0) { + await this.fetchSpans(undefined); + this.allFetched = true; + return this.getFromCache(undefined); + } + + // Find which spanTypes we don't have in cache yet + const missingTypes = spanType.filter((t) => !this.spanCache.has(t)); + + // If all requested types are cached, return from cache + if (missingTypes.length === 0) { + return this.getFromCache(spanType); + } + + // Fetch only the missing types + await this.fetchSpans(missingTypes); + return this.getFromCache(spanType); + } + + private async fetchSpans(spanType: string[] | undefined): Promise { + const spans = await this.fetchFn(spanType); + + for (const span of spans) { + const type = span.span_attributes?.type ?? ""; + const existing = this.spanCache.get(type) ?? []; + existing.push(span); + this.spanCache.set(type, existing); + } + } + + private getFromCache(spanType: string[] | undefined): SpanData[] { + if (!spanType || spanType.length === 0) { + return Array.from(this.spanCache.values()).flat(); + } + + const result: SpanData[] = []; + for (const type of spanType) { + const spans = this.spanCache.get(type); + if (spans) result.push(...spans); + } + return result; + } +} + /** * Interface for trace objects that can be used by scorers. * Both the SDK's LocalTrace class and the API wrapper's WrapperTrace implement this. @@ -122,7 +241,6 @@ export interface Trace { * richer logging or side effects. */ export class LocalTrace implements Trace { - // Store values privately so future helper methods can expose them safely. 
private readonly objectType: "experiment" | "project_logs"; private readonly objectId: string; private readonly rootSpanId: string; @@ -130,6 +248,7 @@ export class LocalTrace implements Trace { private readonly state: BraintrustState; private spansFlushed = false; private spansFlushPromise: Promise | null = null; + private cachedFetcher: CachedSpanFetcher; constructor({ objectType, @@ -143,6 +262,16 @@ export class LocalTrace implements Trace { this.rootSpanId = rootSpanId; this.ensureSpansFlushed = ensureSpansFlushed; this.state = state; + this.cachedFetcher = new CachedSpanFetcher( + objectType, + objectId, + rootSpanId, + async () => { + await this.ensureSpansReady(); + await state.login({}); + return state; + }, + ); } getConfiguration() { @@ -156,21 +285,18 @@ export class LocalTrace implements Trace { /** * Fetch all rows for this root span from its parent object (experiment or project logs). * First checks the local span cache for recently logged spans, then falls - * back to BTQL API if not found in cache. + * back to CachedSpanFetcher which handles BTQL fetching and caching. */ async getSpans({ spanType }: { spanType?: string[] } = {}): Promise< SpanData[] > { - const state = this.state; - - // Try local cache first - const cachedSpans = state.spanCache.getByRootSpanId(this.rootSpanId); + // Try local span cache first (for recently logged spans not yet flushed) + const cachedSpans = this.state.spanCache.getByRootSpanId(this.rootSpanId); if (cachedSpans && cachedSpans.length > 0) { let spans = cachedSpans.filter( (span) => span.span_attributes?.purpose !== "scorer", ); - // Apply spanType filter if specified if (spanType && spanType.length > 0) { spans = spans.filter((span) => spanType.includes(span.span_attributes?.type ?? 
""), @@ -187,30 +313,8 @@ export class LocalTrace implements Trace { })); } - // Cache miss - fall back to BTQL via ObjectFetcher pattern - await this.ensureSpansReady(); - await state.login({}); - - const fetcher = new SpanFetcher( - this.objectType, - this.objectId, - this.rootSpanId, - state, - spanType, - ); - - const rows: WithTransactionId[] = await fetcher.fetchedData(); - - return rows - .filter((row) => row.span_attributes?.purpose !== "scorer") - .map((row) => ({ - input: row.input, - output: row.output, - metadata: row.metadata, - span_id: row.span_id, - span_parents: row.span_parents, - span_attributes: row.span_attributes, - })); + // Fall back to CachedSpanFetcher for BTQL fetching with caching + return this.cachedFetcher.getSpans({ spanType }); } private async ensureSpansReady() { From e7219107bf250421a0375db94eeed28bb31d5b80 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Tue, 13 Jan 2026 12:06:09 -0800 Subject: [PATCH 42/65] spancache cleanup --- js/src/span-cache.ts | 60 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/js/src/span-cache.ts b/js/src/span-cache.ts index a643ece66..1bb88615b 100644 --- a/js/src/span-cache.ts +++ b/js/src/span-cache.ts @@ -11,6 +11,10 @@ import iso from "./isomorph"; +// Global registry of active span caches for process exit cleanup +const activeCaches = new Set(); +let exitHandlersRegistered = false; + /** * Check if the span cache can be used (requires filesystem APIs). 
* This is called at runtime, not at module load time, to allow @@ -129,15 +133,68 @@ export class SpanCache { tmpDir, `braintrust-span-cache-${uniqueId}.jsonl`, ); + console.log("WRITING TO FILE:"); + console.log(this.cacheFilePath); // Open file for append+read this.fileHandle = await iso.openFile!(this.cacheFilePath, "a+"); this.initialized = true; + + // Register cleanup handler on first initialization + this.registerExitHandler(); })(); return this.initPromise; } + /** + * Register a handler to clean up the temp file on process exit. + * Uses a global registry to avoid registering multiple handlers. + */ + private registerExitHandler(): void { + // Add this cache to the global registry + activeCaches.add(this); + + // Only register process handlers once globally + if ( + typeof process !== "undefined" && + process.on && + !exitHandlersRegistered + ) { + exitHandlersRegistered = true; + + const cleanupAllCaches = () => { + // Clean up all active caches + for (const cache of activeCaches) { + // Close file handle if open + if (cache.fileHandle) { + try { + cache.fileHandle.close().catch(() => {}); + cache.fileHandle = null; + } catch { + // Ignore errors during exit cleanup + } + } + + // Delete the temp file + if (cache.cacheFilePath && canUseSpanCache()) { + try { + iso.unlinkSync!(cache.cacheFilePath); + } catch { + // Ignore cleanup errors - file might not exist or already deleted + } + } + } + }; + + // Register for multiple exit scenarios + process.on("exit", cleanupAllCaches); + process.on("SIGINT", cleanupAllCaches); + process.on("SIGTERM", cleanupAllCaches); + process.on("beforeExit", cleanupAllCaches); + } + } + // Buffer for pending writes - flushed asynchronously private writeBuffer: DiskSpanRecord[] = []; private flushScheduled = false; @@ -315,6 +372,9 @@ export class SpanCache { * Clean up the cache file. Call this when the eval is complete. 
*/ dispose(): void { + // Remove from global registry + activeCaches.delete(this); + // Clear pending writes this.writeBuffer = []; this.flushScheduled = false; From 98ae95717986d3ab9a00c8c4e83bf87a24410dec Mon Sep 17 00:00:00 2001 From: Alex Z Date: Tue, 13 Jan 2026 13:24:23 -0800 Subject: [PATCH 43/65] init cache in inner eval method --- js/src/framework.ts | 929 ++++++++++++++++++++++---------------------- 1 file changed, 471 insertions(+), 458 deletions(-) diff --git a/js/src/framework.ts b/js/src/framework.ts index 16de6f050..12bf88928 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -617,8 +617,6 @@ export async function Eval< } const resolvedReporter = options.reporter || defaultReporter; - // Start span cache for this eval (it's disabled by default to avoid temp files outside of evals) - (evaluator.state ?? _internalGetGlobalState())?.spanCache?.start(); try { const { data, baseExperiment: defaultBaseExperiment } = callEvaluatorData( evaluator.data, @@ -710,10 +708,6 @@ export async function Eval< } } finally { progressReporter.stop(); - // Clean up disk-based span cache after eval completes and stop caching - const spanCache = (evaluator.state ?? _internalGetGlobalState())?.spanCache; - spanCache?.dispose(); - spanCache?.stop(); } } @@ -845,511 +839,530 @@ async function runEvaluatorInternal( collectResults: boolean, // eslint-disable-next-line @typescript-eslint/no-explicit-any ): Promise> { - if (typeof evaluator.data === "string") { - throw new Error("Unimplemented: string data paths"); - } - let dataResult = - typeof evaluator.data === "function" ? evaluator.data() : evaluator.data; - - parameters = validateParameters(parameters ?? {}, evaluator.parameters ?? 
{}); - - if ("_type" in dataResult) { - if (dataResult._type !== "BaseExperiment") { - // For some reason, the typesystem won't let me check if dataResult._type === "BaseExperiment" - throw new Error("Invalid _type"); - } - if (!experiment) { - throw new Error( - "Cannot use BaseExperiment() without connecting to Braintrust (you most likely set --no-send-logs)", - ); - } - let name = dataResult.name; - if (isEmpty(name)) { - const baseExperiment = await experiment.fetchBaseExperiment(); - if (!baseExperiment) { - throw new Error("BaseExperiment() failed to fetch base experiment"); - } - name = baseExperiment.name; + // Start span cache for this eval (it's disabled by default to avoid temp files outside of evals) + (evaluator.state ?? _internalGetGlobalState())?.spanCache?.start(); + try { + if (typeof evaluator.data === "string") { + throw new Error("Unimplemented: string data paths"); } + let dataResult = + typeof evaluator.data === "function" ? evaluator.data() : evaluator.data; - dataResult = initExperiment(evaluator.state, { - ...(evaluator.projectId - ? { projectId: evaluator.projectId } - : { project: evaluator.projectName }), - experiment: name, - open: true, - }).asDataset(); - } - - const resolvedDataResult = - dataResult instanceof Promise ? await dataResult : dataResult; + parameters = validateParameters( + parameters ?? {}, + evaluator.parameters ?? 
{}, + ); - const dataIterable: AsyncIterable> = (() => { - if (isAsyncIterable>(resolvedDataResult)) { - return resolvedDataResult; - } - if ( - Array.isArray(resolvedDataResult) || - isIterable>(resolvedDataResult) - ) { - const iterable = resolvedDataResult as Iterable>; - return (async function* () { - for (const datum of iterable) { - yield datum; + if ("_type" in dataResult) { + if (dataResult._type !== "BaseExperiment") { + // For some reason, the typesystem won't let me check if dataResult._type === "BaseExperiment" + throw new Error("Invalid _type"); + } + if (!experiment) { + throw new Error( + "Cannot use BaseExperiment() without connecting to Braintrust (you most likely set --no-send-logs)", + ); + } + let name = dataResult.name; + if (isEmpty(name)) { + const baseExperiment = await experiment.fetchBaseExperiment(); + if (!baseExperiment) { + throw new Error("BaseExperiment() failed to fetch base experiment"); } - })(); + name = baseExperiment.name; + } + + dataResult = initExperiment(evaluator.state, { + ...(evaluator.projectId + ? { projectId: evaluator.projectId } + : { project: evaluator.projectName }), + experiment: name, + open: true, + }).asDataset(); } - throw new Error( - "Evaluator data must be an array, iterable, or async iterable", - ); - })(); - progressReporter.start(evaluator.evalName, 0); + const resolvedDataResult = + dataResult instanceof Promise ? await dataResult : dataResult; - const experimentIdPromise: Promise | undefined = - experiment - ? (async () => { - try { - return await experiment.id; - } catch { - return undefined; - } - })() - : undefined; - - const collectedResults: EvalResult[] = []; - const localScoreAccumulator: ScoreAccumulator | null = experiment ? 
null : {}; - let cancelled = false; - let scheduledTrials = 0; - const q = queue( - async ({ - datum, - trialIndex, - }: { - // eslint-disable-next-line @typescript-eslint/no-explicit-any - datum: EvalCase; - trialIndex: number; - }) => { - if (cancelled) { - return; + const dataIterable: AsyncIterable> = (() => { + if (isAsyncIterable>(resolvedDataResult)) { + return resolvedDataResult; } - const eventDataset: Dataset | undefined = experiment - ? experiment.dataset - : Dataset.isDataset(evaluator.data) - ? evaluator.data - : undefined; - - const baseEvent: StartSpanArgs = { - name: "eval", - spanAttributes: { - type: SpanTypeAttribute.EVAL, - }, - event: { - input: datum.input, - expected: "expected" in datum ? datum.expected : undefined, - tags: datum.tags, - origin: - eventDataset && datum.id && datum._xact_id - ? { - object_type: "dataset", - object_id: await eventDataset.id, - id: datum.id, - created: datum.created, - _xact_id: datum._xact_id, - } - : undefined, - ...(datum.upsert_id ? { id: datum.upsert_id } : {}), - }, - }; - - const callback = async (rootSpan: Span) => { - const state = evaluator.state ?? _internalGetGlobalState(); - const ensureSpansFlushed = async () => { - // Flush native Braintrust spans - if (experiment) { - await flush({ state: experiment.loggingState }); - } else if (state) { - await flush({ state }); - } else { - await flush(); + if ( + Array.isArray(resolvedDataResult) || + isIterable>(resolvedDataResult) + ) { + const iterable = resolvedDataResult as Iterable< + EvalCase + >; + return (async function* () { + for (const datum of iterable) { + yield datum; } + })(); + } + throw new Error( + "Evaluator data must be an array, iterable, or async iterable", + ); + })(); - // Also flush OTEL spans if registered - if (state) { - await state.flushOtel(); - } - }; + progressReporter.start(evaluator.evalName, 0); - const trace = state - ? new LocalTrace({ - objectType: "experiment", - objectId: experimentIdPromise - ? 
(await experimentIdPromise) ?? "" - : "", - rootSpanId: rootSpan.rootSpanId, - ensureSpansFlushed, - state, - }) - : undefined; - let metadata: Record = { - ...("metadata" in datum ? datum.metadata : {}), + const experimentIdPromise: Promise | undefined = + experiment + ? (async () => { + try { + return await experiment.id; + } catch { + return undefined; + } + })() + : undefined; + + const collectedResults: EvalResult[] = []; + const localScoreAccumulator: ScoreAccumulator | null = experiment + ? null + : {}; + let cancelled = false; + let scheduledTrials = 0; + const q = queue( + async ({ + datum, + trialIndex, + }: { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + datum: EvalCase; + trialIndex: number; + }) => { + if (cancelled) { + return; + } + const eventDataset: Dataset | undefined = experiment + ? experiment.dataset + : Dataset.isDataset(evaluator.data) + ? evaluator.data + : undefined; + + const baseEvent: StartSpanArgs = { + name: "eval", + spanAttributes: { + type: SpanTypeAttribute.EVAL, + }, + event: { + input: datum.input, + expected: "expected" in datum ? datum.expected : undefined, + tags: datum.tags, + origin: + eventDataset && datum.id && datum._xact_id + ? { + object_type: "dataset", + object_id: await eventDataset.id, + id: datum.id, + created: datum.created, + _xact_id: datum._xact_id, + } + : undefined, + ...(datum.upsert_id ? { id: datum.upsert_id } : {}), + }, }; - const expected = "expected" in datum ? datum.expected : undefined; - let output: unknown = undefined; - let error: unknown | undefined = undefined; - let tags: string[] = [...(datum.tags ?? 
[])]; - const scores: Record = {}; - const scorerNames = evaluator.scores.map(scorerName); - let unhandledScores: string[] | null = scorerNames; - try { - const meta = (o: Record) => - (metadata = { ...metadata, ...o }); - - await rootSpan.traced( - async (span: Span) => { - const hooksForTask: EvalHooks< - unknown, - Record, - EvalParameters - > = { - meta, - metadata, - expected, - span, - parameters: parameters ?? {}, - reportProgress: (event: TaskProgressEvent) => { - stream?.({ - ...event, - id: rootSpan.id, - origin: baseEvent.event?.origin, - name: evaluator.evalName, - object_type: "task", - }); - }, - trialIndex, - tags, - }; - - const outputResult = evaluator.task(datum.input, hooksForTask); - if (outputResult instanceof Promise) { - output = await outputResult; - } else { - output = outputResult; - } - tags = hooksForTask.tags ?? []; + const callback = async (rootSpan: Span) => { + const state = evaluator.state ?? _internalGetGlobalState(); + const ensureSpansFlushed = async () => { + // Flush native Braintrust spans + if (experiment) { + await flush({ state: experiment.loggingState }); + } else if (state) { + await flush({ state }); + } else { + await flush(); + } - span.log({ output }); - }, - { - name: "task", - spanAttributes: { type: SpanTypeAttribute.TASK }, - event: { input: datum.input }, - }, - ); - if (tags.length) { - rootSpan.log({ output, metadata, expected, tags }); - } else { - rootSpan.log({ output, metadata, expected }); - } + // Also flush OTEL spans if registered + if (state) { + await state.flushOtel(); + } + }; - const scoringArgs = { - input: datum.input, - expected: "expected" in datum ? datum.expected : undefined, - metadata, - output, - trace, + const trace = state + ? new LocalTrace({ + objectType: "experiment", + objectId: experimentIdPromise + ? (await experimentIdPromise) ?? "" + : "", + rootSpanId: rootSpan.rootSpanId, + ensureSpansFlushed, + state, + }) + : undefined; + let metadata: Record = { + ...("metadata" in datum ? 
datum.metadata : {}), }; - const scoreResults = await Promise.all( - evaluator.scores.map(async (score, score_idx) => { - try { - const runScorer = async (span: Span) => { - const scoreResult = score(scoringArgs); - const scoreValue = - scoreResult instanceof Promise - ? await scoreResult - : scoreResult; - - if (scoreValue === null) { - return null; - } + const expected = "expected" in datum ? datum.expected : undefined; + let output: unknown = undefined; + let error: unknown | undefined = undefined; + let tags: string[] = [...(datum.tags ?? [])]; + const scores: Record = {}; + const scorerNames = evaluator.scores.map(scorerName); + let unhandledScores: string[] | null = scorerNames; + try { + const meta = (o: Record) => + (metadata = { ...metadata, ...o }); + + await rootSpan.traced( + async (span: Span) => { + const hooksForTask: EvalHooks< + unknown, + Record, + EvalParameters + > = { + meta, + metadata, + expected, + span, + parameters: parameters ?? {}, + reportProgress: (event: TaskProgressEvent) => { + stream?.({ + ...event, + id: rootSpan.id, + origin: baseEvent.event?.origin, + name: evaluator.evalName, + object_type: "task", + }); + }, + trialIndex, + tags, + }; + + const outputResult = evaluator.task(datum.input, hooksForTask); + if (outputResult instanceof Promise) { + output = await outputResult; + } else { + output = outputResult; + } + + tags = hooksForTask.tags ?? []; + + span.log({ output }); + }, + { + name: "task", + spanAttributes: { type: SpanTypeAttribute.TASK }, + event: { input: datum.input }, + }, + ); + if (tags.length) { + rootSpan.log({ output, metadata, expected, tags }); + } else { + rootSpan.log({ output, metadata, expected }); + } + + const scoringArgs = { + input: datum.input, + expected: "expected" in datum ? 
datum.expected : undefined, + metadata, + output, + trace, + }; + const scoreResults = await Promise.all( + evaluator.scores.map(async (score, score_idx) => { + try { + const runScorer = async (span: Span) => { + const scoreResult = score(scoringArgs); + const scoreValue = + scoreResult instanceof Promise + ? await scoreResult + : scoreResult; + + if (scoreValue === null) { + return null; + } - if (Array.isArray(scoreValue)) { - for (const s of scoreValue) { - if (!(typeof s === "object" && !isEmpty(s))) { - throw new Error( - `When returning an array of scores, each score must be a non-empty object. Got: ${JSON.stringify( - s, - )}`, - ); + if (Array.isArray(scoreValue)) { + for (const s of scoreValue) { + if (!(typeof s === "object" && !isEmpty(s))) { + throw new Error( + `When returning an array of scores, each score must be a non-empty object. Got: ${JSON.stringify( + s, + )}`, + ); + } } } - } - const results = Array.isArray(scoreValue) - ? scoreValue - : typeof scoreValue === "object" && !isEmpty(scoreValue) - ? [scoreValue] - : [ - { - name: scorerNames[score_idx], - score: scoreValue, - }, - ]; - - const getOtherFields = (s: Score) => { - const { metadata: _metadata, name: _name, ...rest } = s; - return rest; + const results = Array.isArray(scoreValue) + ? scoreValue + : typeof scoreValue === "object" && !isEmpty(scoreValue) + ? [scoreValue] + : [ + { + name: scorerNames[score_idx], + score: scoreValue, + }, + ]; + + const getOtherFields = (s: Score) => { + const { metadata: _metadata, name: _name, ...rest } = s; + return rest; + }; + + const resultMetadata = + results.length === 1 + ? results[0].metadata + : results.reduce( + (prev, s) => + mergeDicts(prev, { + [s.name]: s.metadata, + }), + {}, + ); + + const resultOutput = + results.length === 1 + ? 
getOtherFields(results[0]) + : results.reduce( + (prev, s) => + mergeDicts(prev, { [s.name]: getOtherFields(s) }), + {}, + ); + + const scores = results.reduce( + (prev, s) => mergeDicts(prev, { [s.name]: s.score }), + {}, + ); + + span.log({ + output: resultOutput, + metadata: resultMetadata, + scores: scores, + }); + return results; }; - const resultMetadata = - results.length === 1 - ? results[0].metadata - : results.reduce( - (prev, s) => - mergeDicts(prev, { - [s.name]: s.metadata, - }), - {}, - ); - - const resultOutput = - results.length === 1 - ? getOtherFields(results[0]) - : results.reduce( - (prev, s) => - mergeDicts(prev, { [s.name]: getOtherFields(s) }), - {}, - ); - - const scores = results.reduce( - (prev, s) => mergeDicts(prev, { [s.name]: s.score }), - {}, - ); - - span.log({ - output: resultOutput, - metadata: resultMetadata, - scores: scores, + // Exclude trace from logged input since it contains internal state + // that shouldn't be serialized (spansFlushPromise, spansFlushed, etc.) + const { trace: _trace, ...scoringArgsForLogging } = + scoringArgs; + const results = await rootSpan.traced(runScorer, { + name: scorerNames[score_idx], + spanAttributes: { + type: SpanTypeAttribute.SCORE, + purpose: "scorer", + }, + propagatedEvent: makeScorerPropagatedEvent( + await rootSpan.export(), + ), + event: { input: scoringArgsForLogging }, }); - return results; - }; - - // Exclude trace from logged input since it contains internal state - // that shouldn't be serialized (spansFlushPromise, spansFlushed, etc.) 
- const { trace: _trace, ...scoringArgsForLogging } = scoringArgs; - const results = await rootSpan.traced(runScorer, { - name: scorerNames[score_idx], - spanAttributes: { - type: SpanTypeAttribute.SCORE, - purpose: "scorer", - }, - propagatedEvent: makeScorerPropagatedEvent( - await rootSpan.export(), - ), - event: { input: scoringArgsForLogging }, + return { kind: "score", value: results } as const; + } catch (e) { + return { kind: "error", value: e } as const; + } + }), + ); + // Resolve each promise on its own so that we can separate the passing + // from the failing ones. + const failingScorersAndResults: { name: string; error: unknown }[] = + []; + scoreResults.forEach((results, i) => { + const name = scorerNames[i]; + if (results.kind === "score") { + (results.value || []).forEach((result) => { + scores[result.name] = result.score; }); - return { kind: "score", value: results } as const; - } catch (e) { - return { kind: "error", value: e } as const; + } else { + failingScorersAndResults.push({ name, error: results.value }); } - }), - ); - // Resolve each promise on its own so that we can separate the passing - // from the failing ones. - const failingScorersAndResults: { name: string; error: unknown }[] = - []; - scoreResults.forEach((results, i) => { - const name = scorerNames[i]; - if (results.kind === "score") { - (results.value || []).forEach((result) => { - scores[result.name] = result.score; + }); + + unhandledScores = null; + if (failingScorersAndResults.length) { + const scorerErrors = Object.fromEntries( + failingScorersAndResults.map(({ name, error }) => [ + name, + error instanceof Error ? 
error.stack : `${error}`, + ]), + ); + metadata["scorer_errors"] = scorerErrors; + rootSpan.log({ + metadata: { scorer_errors: scorerErrors }, }); - } else { - failingScorersAndResults.push({ name, error: results.value }); + const names = Object.keys(scorerErrors).join(", "); + const errors = failingScorersAndResults.map((item) => item.error); + unhandledScores = Object.keys(scorerErrors); + console.warn( + `Found exceptions for the following scorers: ${names}`, + errors, + ); } - }); - - unhandledScores = null; - if (failingScorersAndResults.length) { - const scorerErrors = Object.fromEntries( - failingScorersAndResults.map(({ name, error }) => [ - name, - error instanceof Error ? error.stack : `${error}`, - ]), - ); - metadata["scorer_errors"] = scorerErrors; - rootSpan.log({ - metadata: { scorer_errors: scorerErrors }, - }); - const names = Object.keys(scorerErrors).join(", "); - const errors = failingScorersAndResults.map((item) => item.error); - unhandledScores = Object.keys(scorerErrors); - console.warn( - `Found exceptions for the following scorers: ${names}`, - errors, - ); + } catch (e) { + logSpanError(rootSpan, e); + error = e; + } finally { + progressReporter.increment(evaluator.evalName); } - } catch (e) { - logSpanError(rootSpan, e); - error = e; - } finally { - progressReporter.increment(evaluator.evalName); - } - const mergedScores = { - ...(evaluator.errorScoreHandler && unhandledScores - ? evaluator.errorScoreHandler({ - rootSpan, - data: datum, - unhandledScores, - }) - : undefined), - ...scores, - } as Record; + const mergedScores = { + ...(evaluator.errorScoreHandler && unhandledScores + ? 
evaluator.errorScoreHandler({ + rootSpan, + data: datum, + unhandledScores, + }) + : undefined), + ...scores, + } as Record; + + if (localScoreAccumulator) { + accumulateScores(localScoreAccumulator, mergedScores); + } - if (localScoreAccumulator) { - accumulateScores(localScoreAccumulator, mergedScores); - } + if (collectResults) { + collectedResults.push({ + input: datum.input, + ...("expected" in datum ? { expected: datum.expected } : {}), + output, + tags: tags.length ? tags : undefined, + metadata, + scores: mergedScores, + error, + origin: baseEvent.event?.origin, + }); + } + }; - if (collectResults) { - collectedResults.push({ - input: datum.input, - ...("expected" in datum ? { expected: datum.expected } : {}), - output, - tags: tags.length ? tags : undefined, - metadata, - scores: mergedScores, - error, - origin: baseEvent.event?.origin, + if (!experiment) { + // This will almost always be a no-op span, but it means that if the Eval + // is run in the context of a different type of span, it will be logged. + return await traced(callback, { + ...baseEvent, + state: evaluator.state, }); + } else { + const result = await experiment.traced(callback, baseEvent); + // Flush logs after each task to provide backpressure and prevent memory accumulation + // when maxConcurrency is set. This ensures logs are sent before the next task starts, + // preventing unbounded memory growth with large log payloads. + if (evaluator.maxConcurrency !== undefined) { + await experiment.flush(); + } + return result; } - }; - - if (!experiment) { - // This will almost always be a no-op span, but it means that if the Eval - // is run in the context of a different type of span, it will be logged. - return await traced(callback, { - ...baseEvent, - state: evaluator.state, - }); - } else { - const result = await experiment.traced(callback, baseEvent); - // Flush logs after each task to provide backpressure and prevent memory accumulation - // when maxConcurrency is set. 
This ensures logs are sent before the next task starts, - // preventing unbounded memory growth with large log payloads. - if (evaluator.maxConcurrency !== undefined) { - await experiment.flush(); - } - return result; - } - }, - Math.max(evaluator.maxConcurrency ?? Number.MAX_SAFE_INTEGER, 1), - ); + }, + Math.max(evaluator.maxConcurrency ?? Number.MAX_SAFE_INTEGER, 1), + ); - const enqueuePromise = (async () => { - for await (const datum of dataIterable) { - if (cancelled) { - break; - } - if (!filters.every((f) => evaluateFilter(datum, f))) { - continue; - } - const trialCount = evaluator.trialCount ?? 1; - for (let trialIndex = 0; trialIndex < trialCount; trialIndex++) { + const enqueuePromise = (async () => { + for await (const datum of dataIterable) { if (cancelled) { break; } - scheduledTrials++; - progressReporter.setTotal?.(evaluator.evalName, scheduledTrials); - q.push({ datum, trialIndex }); - } - } - })(); - - const cancel = async () => { - await new Promise((_, reject) => { - // If already cancelled, reject immediately - if (cancelled) { - reject(new InternalAbortError("Evaluator already cancelled")); - return; + if (!filters.every((f) => evaluateFilter(datum, f))) { + continue; + } + const trialCount = evaluator.trialCount ?? 
1; + for (let trialIndex = 0; trialIndex < trialCount; trialIndex++) { + if (cancelled) { + break; + } + scheduledTrials++; + progressReporter.setTotal?.(evaluator.evalName, scheduledTrials); + q.push({ datum, trialIndex }); + } } + })(); - let timeoutId: ReturnType | undefined; - let abortHandler: (() => void) | undefined; - - const rejectOnce = (error: InternalAbortError) => { + const cancel = async () => { + await new Promise((_, reject) => { + // If already cancelled, reject immediately if (cancelled) { + reject(new InternalAbortError("Evaluator already cancelled")); return; } - cancelled = true; - if (timeoutId) { - clearTimeout(timeoutId); - timeoutId = undefined; + + let timeoutId: ReturnType | undefined; + let abortHandler: (() => void) | undefined; + + const rejectOnce = (error: InternalAbortError) => { + if (cancelled) { + return; + } + cancelled = true; + if (timeoutId) { + clearTimeout(timeoutId); + timeoutId = undefined; + } + if (abortHandler && evaluator.signal) { + evaluator.signal.removeEventListener("abort", abortHandler); + } + reject(error); + }; + + if (evaluator.timeout) { + timeoutId = setTimeout(() => { + rejectOnce(new InternalAbortError("Evaluator timed out")); + }, evaluator.timeout); } - if (abortHandler && evaluator.signal) { - evaluator.signal.removeEventListener("abort", abortHandler); + if (evaluator.signal) { + abortHandler = () => { + rejectOnce(new InternalAbortError("Evaluator aborted")); + }; + evaluator.signal.addEventListener("abort", abortHandler); } - reject(error); - }; + }); + }; - if (evaluator.timeout) { - timeoutId = setTimeout(() => { - rejectOnce(new InternalAbortError("Evaluator timed out")); - }, evaluator.timeout); - } - if (evaluator.signal) { - abortHandler = () => { - rejectOnce(new InternalAbortError("Evaluator aborted")); - }; - evaluator.signal.addEventListener("abort", abortHandler); + const waitForQueue = (async () => { + await enqueuePromise; + if (q.idle()) { + return; } - }); - }; - - const waitForQueue 
= (async () => { - await enqueuePromise; - if (q.idle()) { - return; - } - await q.drain(); - })(); + await q.drain(); + })(); - // wait for tasks to be completed or the evaluator to be cancelled - // if the evaluator is cancelled, the remaining tasks that have not been started will be killed - try { - await Promise.race([waitForQueue, cancel()]); - } catch (e) { - // Always kill the queue to prevent hanging tasks and memory leaks - q.kill(); + // wait for tasks to be completed or the evaluator to be cancelled + // if the evaluator is cancelled, the remaining tasks that have not been started will be killed + try { + await Promise.race([waitForQueue, cancel()]); + } catch (e) { + // Always kill the queue to prevent hanging tasks and memory leaks + q.kill(); + + if (e instanceof InternalAbortError) { + // Log cancellation for debugging + if (iso.getEnv("BRAINTRUST_VERBOSE")) { + console.warn("Evaluator cancelled:", (e as Error).message); + } + } - if (e instanceof InternalAbortError) { - // Log cancellation for debugging - if (iso.getEnv("BRAINTRUST_VERBOSE")) { - console.warn("Evaluator cancelled:", (e as Error).message); + throw e; + } finally { + // Ensure results are cleared if not collecting to free memory + if (!collectResults) { + collectedResults.length = 0; } } - throw e; + const summary = experiment + ? await experiment.summarize({ + summarizeScores: evaluator.summarizeScores, + }) + : buildLocalSummary( + evaluator, + collectResults ? collectedResults : [], + localScoreAccumulator ?? undefined, + ); + + return new EvalResultWithSummary( + summary, + collectResults ? collectedResults : [], + ); } finally { - // Ensure results are cleared if not collecting to free memory - if (!collectResults) { - collectedResults.length = 0; - } + // Clean up disk-based span cache after eval completes and stop caching + const spanCache = (evaluator.state ?? 
_internalGetGlobalState())?.spanCache; + spanCache?.dispose(); + spanCache?.stop(); } - - const summary = experiment - ? await experiment.summarize({ summarizeScores: evaluator.summarizeScores }) - : buildLocalSummary( - evaluator, - collectResults ? collectedResults : [], - localScoreAccumulator ?? undefined, - ); - - return new EvalResultWithSummary( - summary, - collectResults ? collectedResults : [], - ); } export const error = (text: string) => `Error: ${text}`; From db9ad2a397fa3f72446a093864a7c8533f2fba43 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Tue, 13 Jan 2026 14:39:05 -0800 Subject: [PATCH 44/65] handle parallell evals --- js/src/span-cache.test.ts | 123 +++++++++++++++++++++++++++++++++++++- js/src/span-cache.ts | 17 +++++- 2 files changed, 137 insertions(+), 3 deletions(-) diff --git a/js/src/span-cache.test.ts b/js/src/span-cache.test.ts index 16ac6682e..e3e010cfa 100644 --- a/js/src/span-cache.test.ts +++ b/js/src/span-cache.test.ts @@ -153,12 +153,15 @@ describe("SpanCache (disk-based)", () => { cache.queueWrite("root-1", "span-1", { span_id: "span-1" }); expect(cache.size).toBe(1); + // Stop first to decrement refcount, then dispose + cache.stop(); cache.dispose(); expect(cache.size).toBe(0); expect(cache.has("root-1")).toBe(false); - // Should be able to write again after dispose (cache is still enabled) + // Should be able to write again after dispose (if we start again) + cache.start(); cache.queueWrite("root-2", "span-2", { span_id: "span-2" }); expect(cache.size).toBe(1); }); @@ -225,8 +228,8 @@ describe("SpanCache (disk-based)", () => { expect(freshCache.size).toBe(1); // Stop after first "eval" (like calling stop() in finally block) - freshCache.dispose(); freshCache.stop(); + freshCache.dispose(); expect(freshCache.disabled).toBe(true); // Start for second "eval" - should work! 
@@ -235,6 +238,7 @@ describe("SpanCache (disk-based)", () => { freshCache.queueWrite("root-2", "span-2", { span_id: "span-2" }); expect(freshCache.size).toBe(1); + freshCache.stop(); freshCache.dispose(); }); @@ -280,4 +284,119 @@ describe("SpanCache (disk-based)", () => { freshCache.dispose(); }); }); + + describe("parallel eval support with reference counting", () => { + test("should not dispose cache while evals are still running", () => { + const sharedCache = new SpanCache(); + + // Simulate two evals starting + sharedCache.start(); // Eval 1 + expect(sharedCache.disabled).toBe(false); + expect(sharedCache["_activeEvalCount"]).toBe(1); + + sharedCache.start(); // Eval 2 + expect(sharedCache.disabled).toBe(false); + expect(sharedCache["_activeEvalCount"]).toBe(2); + + // Write data from both evals + sharedCache.queueWrite("root-1", "span-1", { span_id: "span-1" }); + sharedCache.queueWrite("root-2", "span-2", { span_id: "span-2" }); + expect(sharedCache.size).toBe(2); + + // Eval 1 finishes first + sharedCache.dispose(); // Should NOT dispose (refcount = 2) + sharedCache.stop(); // Decrements to 1 + + // Cache should still be enabled and data intact + expect(sharedCache.disabled).toBe(false); + expect(sharedCache["_activeEvalCount"]).toBe(1); + expect(sharedCache.size).toBe(2); + expect(sharedCache.getByRootSpanId("root-1")).toBeDefined(); + expect(sharedCache.getByRootSpanId("root-2")).toBeDefined(); + + // Eval 2 finishes + sharedCache.dispose(); // Should NOT dispose yet (refcount = 1) + sharedCache.stop(); // Decrements to 0, disables cache + + // Now cache should be disabled but data still exists + expect(sharedCache.disabled).toBe(true); + expect(sharedCache["_activeEvalCount"]).toBe(0); + + // Final dispose should now work + sharedCache.dispose(); // NOW it disposes (refcount = 0) + expect(sharedCache.size).toBe(0); + }); + + test("should not increment refcount when explicitly disabled", () => { + const disabledCache = new SpanCache({ disabled: true }); 
+ + disabledCache.start(); + expect(disabledCache["_activeEvalCount"]).toBe(0); + expect(disabledCache.disabled).toBe(true); + + disabledCache.start(); + expect(disabledCache["_activeEvalCount"]).toBe(0); + expect(disabledCache.disabled).toBe(true); + + disabledCache.dispose(); + }); + + test("should handle refcount underflow gracefully", () => { + const cache = new SpanCache(); + + // Call stop without start + cache.stop(); + expect(cache["_activeEvalCount"]).toBe(0); + + cache.stop(); + expect(cache["_activeEvalCount"]).toBe(0); // Should not go negative + + // Should still work normally after + cache.start(); + expect(cache["_activeEvalCount"]).toBe(1); + + cache.dispose(); + }); + + test("should simulate realistic parallel eval scenario", async () => { + const sharedCache = new SpanCache(); + + // Simulate Eval 1 starting + sharedCache.start(); + sharedCache.queueWrite("eval1-root", "span-1", { + span_id: "span-1", + input: "eval1-input", + }); + + // Simulate Eval 2 starting (before Eval 1 finishes) + sharedCache.start(); + sharedCache.queueWrite("eval2-root", "span-2", { + span_id: "span-2", + input: "eval2-input", + }); + + // Both evals should see their data + expect(sharedCache.getByRootSpanId("eval1-root")).toBeDefined(); + expect(sharedCache.getByRootSpanId("eval2-root")).toBeDefined(); + + // Eval 1 finishes + sharedCache.dispose(); + sharedCache.stop(); + + // Eval 2 should still have access + expect(sharedCache.disabled).toBe(false); + expect(sharedCache.getByRootSpanId("eval2-root")).toBeDefined(); + expect(sharedCache.getByRootSpanId("eval1-root")).toBeDefined(); + + // Eval 2 finishes + sharedCache.dispose(); + sharedCache.stop(); + + // Now cache is disabled + expect(sharedCache.disabled).toBe(true); + + // Final cleanup + sharedCache.dispose(); + }); + }); }); diff --git a/js/src/span-cache.ts b/js/src/span-cache.ts index 1bb88615b..b4d183c34 100644 --- a/js/src/span-cache.ts +++ b/js/src/span-cache.ts @@ -69,6 +69,8 @@ export class SpanCache { 
private _explicitlyDisabled: boolean; // Tracks whether the cache has been enabled (for evals only) private _enabled: boolean = false; + // Reference count of active evals using this cache + private _activeEvalCount: number = 0; // Small in-memory index tracking which rootSpanIds have data private rootSpanIndex: Set = new Set(); @@ -92,10 +94,12 @@ export class SpanCache { * Start caching spans for use during evaluations. * This only starts caching if the cache wasn't permanently disabled. * Called by Eval() to turn on caching for the duration of the eval. + * Uses reference counting to support parallel evals. */ start(): void { if (!this._explicitlyDisabled) { this._enabled = true; + this._activeEvalCount++; } } @@ -103,9 +107,14 @@ export class SpanCache { * Stop caching spans and return to the default disabled state. * Unlike disable(), this allows start() to work again for future evals. * Called after an eval completes to return to the default state. + * Uses reference counting - only disables when all evals are complete. */ stop(): void { - this._enabled = false; + this._activeEvalCount--; + if (this._activeEvalCount <= 0) { + this._activeEvalCount = 0; + this._enabled = false; + } } get disabled(): boolean { @@ -370,8 +379,14 @@ export class SpanCache { /** * Clean up the cache file. Call this when the eval is complete. + * Only performs cleanup when all active evals have completed (refcount = 0). 
*/ dispose(): void { + // Only dispose if no active evals are using this cache + if (this._activeEvalCount > 0) { + return; + } + // Remove from global registry activeCaches.delete(this); From 763dab6b417fb3c5cc37b1793a931e57ad85b839 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Tue, 13 Jan 2026 14:44:51 -0800 Subject: [PATCH 45/65] belt and suspenders --- js/src/logger.ts | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/js/src/logger.ts b/js/src/logger.ts index 221c3c0d4..51454a6e6 100644 --- a/js/src/logger.ts +++ b/js/src/logger.ts @@ -5705,20 +5705,23 @@ export class SpanImpl implements Span { } // Write to local span cache for scorer access - const cachedSpan: CachedSpan = { - input: partialRecord.input, - output: partialRecord.output, - metadata: partialRecord.metadata as Record | undefined, - span_id: this._spanId, - span_parents: this._spanParents, - span_attributes: - partialRecord.span_attributes as CachedSpan["span_attributes"], - }; - this._state.spanCache.queueWrite( - this._rootSpanId, - this._spanId, - cachedSpan, - ); + // Only cache experiment spans - regular logs don't need caching + if (this.parentObjectType === SpanObjectTypeV3.EXPERIMENT) { + const cachedSpan: CachedSpan = { + input: partialRecord.input, + output: partialRecord.output, + metadata: partialRecord.metadata as Record | undefined, + span_id: this._spanId, + span_parents: this._spanParents, + span_attributes: + partialRecord.span_attributes as CachedSpan["span_attributes"], + }; + this._state.spanCache.queueWrite( + this._rootSpanId, + this._spanId, + cachedSpan, + ); + } const computeRecord = async () => ({ ...partialRecord, From 3fca8aae231b893f404017f6350a128e210fc484 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Tue, 13 Jan 2026 15:29:44 -0800 Subject: [PATCH 46/65] remove logs --- js/src/span-cache.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/js/src/span-cache.ts b/js/src/span-cache.ts index b4d183c34..a624e20cf 100644 --- 
a/js/src/span-cache.ts +++ b/js/src/span-cache.ts @@ -142,8 +142,6 @@ export class SpanCache { tmpDir, `braintrust-span-cache-${uniqueId}.jsonl`, ); - console.log("WRITING TO FILE:"); - console.log(this.cacheFilePath); // Open file for append+read this.fileHandle = await iso.openFile!(this.cacheFilePath, "a+"); From 6324d8044fc585c55b66846a4cae520267fb4f80 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Tue, 13 Jan 2026 15:53:35 -0800 Subject: [PATCH 47/65] use mergeDict --- js/src/span-cache.ts | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/js/src/span-cache.ts b/js/src/span-cache.ts index a624e20cf..09d3cf1ca 100644 --- a/js/src/span-cache.ts +++ b/js/src/span-cache.ts @@ -10,6 +10,7 @@ */ import iso from "./isomorph"; +import { mergeDicts } from "../util/object_util"; // Global registry of active span caches for process exit cleanup const activeCaches = new Set(); @@ -307,7 +308,10 @@ export class SpanCache { if (existing) { spanMap.set( record.spanId, - this.mergeSpanData(existing, record.data), + mergeDicts( + { ...existing } as Record, + record.data as unknown as Record, + ) as unknown as CachedSpan, ); } else { spanMap.set(record.spanId, record.data); @@ -328,7 +332,13 @@ export class SpanCache { } const existing = spanMap.get(record.spanId); if (existing) { - spanMap.set(record.spanId, this.mergeSpanData(existing, record.data)); + spanMap.set( + record.spanId, + mergeDicts( + { ...existing } as Record, + record.data as unknown as Record, + ) as unknown as CachedSpan, + ); } else { spanMap.set(record.spanId, record.data); } @@ -411,26 +421,4 @@ export class SpanCache { this.initPromise = null; this.rootSpanIndex.clear(); } - - private mergeSpanData( - existing: CachedSpan, - incoming: CachedSpan, - ): CachedSpan { - // Merge strategy: incoming values override existing ONLY if defined. - // Undefined values in incoming should not overwrite existing values. 
- return { - span_id: incoming.span_id, - span_parents: incoming.span_parents ?? existing.span_parents, - input: incoming.input !== undefined ? incoming.input : existing.input, - output: incoming.output !== undefined ? incoming.output : existing.output, - metadata: - existing.metadata || incoming.metadata - ? { ...existing.metadata, ...incoming.metadata } - : undefined, - span_attributes: - existing.span_attributes || incoming.span_attributes - ? { ...existing.span_attributes, ...incoming.span_attributes } - : undefined, - }; - } } From 53e63806ecd26702dbb87ef6f56f1267346fe086 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Tue, 13 Jan 2026 15:56:49 -0800 Subject: [PATCH 48/65] cleanup --- js/src/span-cache.ts | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/js/src/span-cache.ts b/js/src/span-cache.ts index 09d3cf1ca..baa2f170f 100644 --- a/js/src/span-cache.ts +++ b/js/src/span-cache.ts @@ -306,12 +306,9 @@ export class SpanCache { const existing = spanMap.get(record.spanId); if (existing) { - spanMap.set( - record.spanId, - mergeDicts( - { ...existing } as Record, - record.data as unknown as Record, - ) as unknown as CachedSpan, + mergeDicts( + existing as unknown as Record, + record.data as unknown as Record, ); } else { spanMap.set(record.spanId, record.data); @@ -332,12 +329,9 @@ export class SpanCache { } const existing = spanMap.get(record.spanId); if (existing) { - spanMap.set( - record.spanId, - mergeDicts( - { ...existing } as Record, - record.data as unknown as Record, - ) as unknown as CachedSpan, + mergeDicts( + existing as unknown as Record, + record.data as unknown as Record, ); } else { spanMap.set(record.spanId, record.data); From ebf9524a916e41b83309f00c8daac20f858454d3 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Tue, 13 Jan 2026 16:07:07 -0800 Subject: [PATCH 49/65] don't crash --- js/src/span-cache.ts | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/js/src/span-cache.ts 
b/js/src/span-cache.ts index baa2f170f..67ef64ba2 100644 --- a/js/src/span-cache.ts +++ b/js/src/span-cache.ts @@ -137,15 +137,20 @@ export class SpanCache { } this.initPromise = (async () => { - const tmpDir = iso.tmpdir!(); + if (!iso.tmpdir || !iso.pathJoin || !iso.openFile) { + // Filesystem not available - silently skip initialization + return; + } + + const tmpDir = iso.tmpdir(); const uniqueId = `${Date.now()}-${Math.random().toString(36).slice(2)}`; - this.cacheFilePath = iso.pathJoin!( + this.cacheFilePath = iso.pathJoin( tmpDir, `braintrust-span-cache-${uniqueId}.jsonl`, ); // Open file for append+read - this.fileHandle = await iso.openFile!(this.cacheFilePath, "a+"); + this.fileHandle = await iso.openFile(this.cacheFilePath, "a+"); this.initialized = true; // Register cleanup handler on first initialization @@ -185,9 +190,9 @@ export class SpanCache { } // Delete the temp file - if (cache.cacheFilePath && canUseSpanCache()) { + if (cache.cacheFilePath && canUseSpanCache() && iso.unlinkSync) { try { - iso.unlinkSync!(cache.cacheFilePath); + iso.unlinkSync(cache.cacheFilePath); } catch { // Ignore cleanup errors - file might not exist or already deleted } @@ -292,9 +297,9 @@ export class SpanCache { const spanMap = new Map(); // First, read from disk if initialized - if (this.initialized && this.cacheFilePath) { + if (this.initialized && this.cacheFilePath && iso.readFileSync) { try { - const content = iso.readFileSync!(this.cacheFilePath, "utf8"); + const content = iso.readFileSync(this.cacheFilePath, "utf8"); const lines = content.trim().split("\n").filter(Boolean); for (const line of lines) { @@ -402,9 +407,9 @@ export class SpanCache { this.fileHandle = null; } - if (this.cacheFilePath && canUseSpanCache()) { + if (this.cacheFilePath && canUseSpanCache() && iso.unlinkSync) { try { - iso.unlinkSync!(this.cacheFilePath); + iso.unlinkSync(this.cacheFilePath); } catch { // Ignore cleanup errors } From 91dcaa1d00d328820de779442389daffc927b972 Mon Sep 17 
00:00:00 2001 From: Alex Z Date: Tue, 13 Jan 2026 17:37:14 -0800 Subject: [PATCH 50/65] PYTHON --- py/examples/evals/eval_example.py | 60 ++++- py/src/braintrust/framework.py | 75 ++++++ py/src/braintrust/logger.py | 19 ++ py/src/braintrust/span_cache.py | 328 +++++++++++++++++++++++ py/src/braintrust/test_span_cache.py | 344 +++++++++++++++++++++++++ py/src/braintrust/test_trace.py | 267 +++++++++++++++++++ py/src/braintrust/trace.py | 372 +++++++++++++++++++++++++++ 7 files changed, 1461 insertions(+), 4 deletions(-) create mode 100644 py/src/braintrust/span_cache.py create mode 100644 py/src/braintrust/test_span_cache.py create mode 100644 py/src/braintrust/test_trace.py create mode 100644 py/src/braintrust/trace.py diff --git a/py/examples/evals/eval_example.py b/py/examples/evals/eval_example.py index 9e7651747..1d605a080 100644 --- a/py/examples/evals/eval_example.py +++ b/py/examples/evals/eval_example.py @@ -1,12 +1,64 @@ +import json + from braintrust import Eval NUM_EXAMPLES = 10 -def exact_match_scorer(input, output, expected): - if expected is None: - return 0.0 - return 1.0 if output == expected else 0.0 +async def exact_match_scorer(input, output, expected, trace=None): + """Async scorer that prints trace spans.""" + score = 0.0 + if expected is not None: + score = 1.0 if output == expected else 0.0 + + if trace: + print("\n" + "="*80) + print(f"🔍 TRACE INFO for input: {input}") + print("="*80) + + # Print trace configuration + config = trace.get_configuration() + print(f"\n📋 Configuration:") + print(f" Object Type: {config.get('objectType')}") + print(f" Object ID: {config.get('objectId')}") + print(f" Root Span: {config.get('rootSpanId')}") + + # Fetch and print spans + try: + spans = await trace.get_spans() + print(f"\n✨ Found {len(spans)} spans:") + print("-"*80) + + for i, span in enumerate(spans, 1): + print(f"\n Span {i}:") + print(f" ID: {span.span_id}") + span_type = span.span_attributes.get('type', 'N/A') if span.span_attributes else 'N/A' + 
span_name = span.span_attributes.get('name', 'N/A') if span.span_attributes else 'N/A' + print(f" Type: {span_type}") + print(f" Name: {span_name}") + + if span.input: + input_str = json.dumps(span.input) + if len(input_str) > 100: + input_str = input_str[:100] + "..." + print(f" Input: {input_str}") + if span.output: + output_str = json.dumps(span.output) + if len(output_str) > 100: + output_str = output_str[:100] + "..." + print(f" Output: {output_str}") + if span.metadata: + print(f" Metadata: {list(span.metadata.keys())}") + + print("\n" + "="*80 + "\n") + except Exception as e: + print(f"\n⚠️ Error fetching spans: {e}") + import traceback + traceback.print_exc() + else: + print(f"⚠️ No trace available for input: {input}") + + return score def data_fn(): diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py index a53ec62e7..f0335406c 100644 --- a/py/src/braintrust/framework.py +++ b/py/src/braintrust/framework.py @@ -1280,6 +1280,29 @@ async def _run_evaluator_internal( filters: list[Filter], stream: Callable[[SSEProgressEvent], None] | None = None, state: BraintrustState | None = None, +): + # Start span cache for this eval (it's disabled by default to avoid temp files outside of evals) + if state is None: + from braintrust.logger import _internal_get_global_state + + state = _internal_get_global_state() + + state.span_cache.start() + try: + return await _run_evaluator_internal_impl(experiment, evaluator, position, filters, stream, state) + finally: + # Clean up disk-based span cache after eval completes and stop caching + state.span_cache.dispose() + state.span_cache.stop() + + +async def _run_evaluator_internal_impl( + experiment, + evaluator: Evaluator, + position: int | None, + filters: list[Filter], + stream: Callable[[SSEProgressEvent], None] | None = None, + state: BraintrustState | None = None, ): event_loop = asyncio.get_event_loop() @@ -1415,6 +1438,57 @@ def report_progress(event: TaskProgressEvent): tags = hooks.tags if 
hooks.tags else None root_span.log(output=output, metadata=metadata, tags=tags) + # Create trace object for scorers + from braintrust.trace import LocalTrace + + async def ensure_spans_flushed(): + # Flush native Braintrust spans + if experiment: + from braintrust.logger import flush as flush_logger + + await asyncio.get_event_loop().run_in_executor( + None, lambda: flush_logger(state=experiment._state) + ) + elif state: + from braintrust.logger import flush as flush_logger + + await asyncio.get_event_loop().run_in_executor(None, lambda: flush_logger(state=state)) + else: + from braintrust.logger import flush as flush_logger + + await asyncio.get_event_loop().run_in_executor(None, flush_logger) + + experiment_id = None + if experiment: + try: + experiment_id = experiment.id + except: + experiment_id = None + + trace = None + if state or experiment: + # Get the state to use + trace_state = state + if not trace_state and experiment: + trace_state = experiment._state + if not trace_state: + # Fall back to global state + from braintrust.logger import _internal_get_global_state + + trace_state = _internal_get_global_state() + + # Access root_span_id from the concrete SpanImpl instance + # The Span interface doesn't expose this but SpanImpl has it + root_span_id_value = getattr(root_span, "root_span_id", root_span.id) + + trace = LocalTrace( + object_type="experiment", + object_id=experiment_id or "", + root_span_id=root_span_id_value, + ensure_spans_flushed=ensure_spans_flushed, + state=trace_state, + ) + score_promises = [ asyncio.create_task( await_or_run_scorer( @@ -1426,6 +1500,7 @@ def report_progress(event: TaskProgressEvent): "expected": datum.expected, "metadata": metadata, "output": output, + "trace": trace, }, ) ) diff --git a/py/src/braintrust/logger.py b/py/src/braintrust/logger.py index 31fcfaee2..cd2c7df89 100644 --- a/py/src/braintrust/logger.py +++ b/py/src/braintrust/logger.py @@ -396,6 +396,10 @@ def default_get_api_conn(): ), ) + from 
braintrust.span_cache import SpanCache + + self.span_cache = SpanCache() + def reset_login_info(self): self.app_url: str | None = None self.app_public_url: str | None = None @@ -3855,6 +3859,21 @@ def log_internal(self, event: dict[str, Any] | None = None, internal_data: dict[ if serializable_partial_record.get("metrics", {}).get("end") is not None: self._logged_end_time = serializable_partial_record["metrics"]["end"] + # Write to local span cache for scorer access + # Only cache experiment spans - regular logs don't need caching + if self.parent_object_type == SpanObjectTypeV3.EXPERIMENT: + from braintrust.span_cache import CachedSpan + + cached_span = CachedSpan( + span_id=self.span_id, + input=serializable_partial_record.get("input"), + output=serializable_partial_record.get("output"), + metadata=serializable_partial_record.get("metadata"), + span_parents=self.span_parents, + span_attributes=serializable_partial_record.get("span_attributes"), + ) + self.state.span_cache.queue_write(self.root_span_id, self.span_id, cached_span) + def compute_record() -> dict[str, Any]: exporter = _get_exporter() return dict( diff --git a/py/src/braintrust/span_cache.py b/py/src/braintrust/span_cache.py new file mode 100644 index 000000000..9f5338e04 --- /dev/null +++ b/py/src/braintrust/span_cache.py @@ -0,0 +1,328 @@ +""" +SpanCache provides a disk-based cache for span data, allowing +scorers to read spans without making server round-trips when possible. + +Spans are stored on disk to minimize memory usage during evaluations. +The cache file is automatically cleaned up when dispose() is called. 
+""" + +import atexit +import json +import os +import tempfile +import uuid +from typing import Any, Optional + +from braintrust.util import merge_dicts + +# Global registry of active span caches for process exit cleanup +_active_caches: set["SpanCache"] = set() +_exit_handlers_registered = False + + +class CachedSpan: + """Cached span data structure.""" + + def __init__( + self, + span_id: str, + input: Optional[Any] = None, + output: Optional[Any] = None, + metadata: Optional[dict[str, Any]] = None, + span_parents: Optional[list[str]] = None, + span_attributes: Optional[dict[str, Any]] = None, + ): + self.span_id = span_id + self.input = input + self.output = output + self.metadata = metadata + self.span_parents = span_parents + self.span_attributes = span_attributes + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary for serialization.""" + result = {"span_id": self.span_id} + if self.input is not None: + result["input"] = self.input + if self.output is not None: + result["output"] = self.output + if self.metadata is not None: + result["metadata"] = self.metadata + if self.span_parents is not None: + result["span_parents"] = self.span_parents + if self.span_attributes is not None: + result["span_attributes"] = self.span_attributes + return result + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "CachedSpan": + """Create from dictionary.""" + return cls( + span_id=data["span_id"], + input=data.get("input"), + output=data.get("output"), + metadata=data.get("metadata"), + span_parents=data.get("span_parents"), + span_attributes=data.get("span_attributes"), + ) + + +class DiskSpanRecord: + """Record structure for disk storage.""" + + def __init__(self, root_span_id: str, span_id: str, data: CachedSpan): + self.root_span_id = root_span_id + self.span_id = span_id + self.data = data + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary for JSON serialization.""" + return { + "rootSpanId": self.root_span_id, + "spanId": 
self.span_id, + "data": self.data.to_dict(), + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "DiskSpanRecord": + """Create from dictionary.""" + return cls( + root_span_id=data["rootSpanId"], + span_id=data["spanId"], + data=CachedSpan.from_dict(data["data"]), + ) + + +class SpanCache: + """ + Disk-based cache for span data, keyed by rootSpanId. + + This cache writes spans to a temporary file to minimize memory usage. + It uses append-only writes and reads the full file when querying. + """ + + def __init__(self, disabled: bool = False): + self._cache_file_path: Optional[str] = None + self._initialized = False + # Tracks whether the cache was explicitly disabled (via constructor or disable()) + self._explicitly_disabled = disabled + # Tracks whether the cache has been enabled (for evals only) + self._enabled = False + # Reference count of active evals using this cache + self._active_eval_count = 0 + # Small in-memory index tracking which rootSpanIds have data + self._root_span_index: set[str] = set() + # Buffer for pending writes + self._write_buffer: list[DiskSpanRecord] = [] + + def disable(self) -> None: + """ + Disable the cache at runtime. This is called automatically when + OTEL is registered, since OTEL spans won't be in the cache. + """ + self._explicitly_disabled = True + + def start(self) -> None: + """ + Start caching spans for use during evaluations. + This only starts caching if the cache wasn't permanently disabled. + Called by Eval() to turn on caching for the duration of the eval. + Uses reference counting to support parallel evals. + """ + if not self._explicitly_disabled: + self._enabled = True + self._active_eval_count += 1 + + def stop(self) -> None: + """ + Stop caching spans and return to the default disabled state. + Unlike disable(), this allows start() to work again for future evals. + Called after an eval completes to return to the default state. + Uses reference counting - only disables when all evals are complete. 
+ """ + self._active_eval_count -= 1 + if self._active_eval_count <= 0: + self._active_eval_count = 0 + self._enabled = False + + @property + def disabled(self) -> bool: + """Check if cache is disabled.""" + return self._explicitly_disabled or not self._enabled + + def _ensure_initialized(self) -> None: + """Initialize the cache file if not already done.""" + if self.disabled or self._initialized: + return + + # Create temporary file + unique_id = f"{int(os.times().elapsed * 1000000)}-{uuid.uuid4().hex[:8]}" + self._cache_file_path = os.path.join(tempfile.gettempdir(), f"braintrust-span-cache-{unique_id}.jsonl") + + # Create the file + with open(self._cache_file_path, "w") as f: + pass + + self._initialized = True + self._register_exit_handler() + + def _register_exit_handler(self) -> None: + """Register a handler to clean up the temp file on process exit.""" + global _exit_handlers_registered + _active_caches.add(self) + + if not _exit_handlers_registered: + _exit_handlers_registered = True + + def cleanup_all_caches(): + """Clean up all active caches.""" + for cache in _active_caches: + if cache._cache_file_path and os.path.exists(cache._cache_file_path): + try: + os.unlink(cache._cache_file_path) + except: + pass + + atexit.register(cleanup_all_caches) + + def queue_write(self, root_span_id: str, span_id: str, data: CachedSpan) -> None: + """ + Write a span to the cache. + In Python, we write synchronously (no async queue like in TS). 
+ """ + if self.disabled: + return + + self._ensure_initialized() + + record = DiskSpanRecord(root_span_id, span_id, data) + self._write_buffer.append(record) + self._root_span_index.add(root_span_id) + + # Write to disk immediately (simplified compared to TS async version) + self._flush_write_buffer() + + def _flush_write_buffer(self) -> None: + """Flush the write buffer to disk.""" + if not self._write_buffer or not self._cache_file_path: + return + + try: + with open(self._cache_file_path, "a") as f: + for record in self._write_buffer: + f.write(json.dumps(record.to_dict()) + "\n") + self._write_buffer.clear() + except Exception: + # Silently fail - cache is best-effort + pass + + def get_by_root_span_id(self, root_span_id: str) -> Optional[list[CachedSpan]]: + """ + Get all cached spans for a given rootSpanId. + + This reads the file and merges all records for the given rootSpanId. + + Args: + root_span_id: The root span ID to look up + + Returns: + List of cached spans, or None if not in cache + """ + if self.disabled: + return None + + # Quick check using in-memory index + if root_span_id not in self._root_span_index: + return None + + # Accumulate spans by spanId, merging updates + span_map: dict[str, dict[str, Any]] = {} + + # Read from disk if initialized + if self._initialized and self._cache_file_path and os.path.exists(self._cache_file_path): + try: + with open(self._cache_file_path, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + record_dict = json.loads(line) + record = DiskSpanRecord.from_dict(record_dict) + if record.root_span_id != root_span_id: + continue + + if record.span_id in span_map: + merge_dicts(span_map[record.span_id], record.data.to_dict()) + else: + span_map[record.span_id] = record.data.to_dict() + except: + # Skip malformed lines + pass + except: + # Continue to check buffer even if disk read fails + pass + + # Also check the in-memory write buffer for unflushed data + for record in 
self._write_buffer: + if record.root_span_id != root_span_id: + continue + if record.span_id in span_map: + merge_dicts(span_map[record.span_id], record.data.to_dict()) + else: + span_map[record.span_id] = record.data.to_dict() + + if not span_map: + return None + + return [CachedSpan.from_dict(data) for data in span_map.values()] + + def has(self, root_span_id: str) -> bool: + """Check if a rootSpanId has cached data.""" + if self.disabled: + return False + return root_span_id in self._root_span_index + + def clear(self, root_span_id: str) -> None: + """ + Clear all cached spans for a given rootSpanId. + Note: This only removes from the index. The data remains in the file + but will be ignored on reads. + """ + self._root_span_index.discard(root_span_id) + + def clear_all(self) -> None: + """Clear all cached data and remove the cache file.""" + self._root_span_index.clear() + self.dispose() + + @property + def size(self) -> int: + """Get the number of root spans currently tracked.""" + return len(self._root_span_index) + + def dispose(self) -> None: + """ + Clean up the cache file. Call this when the eval is complete. + Only performs cleanup when all active evals have completed (refcount = 0). 
+ """ + # Only dispose if no active evals are using this cache + if self._active_eval_count > 0: + return + + # Remove from global registry + _active_caches.discard(self) + + # Clear pending writes + self._write_buffer.clear() + + if self._cache_file_path and os.path.exists(self._cache_file_path): + try: + os.unlink(self._cache_file_path) + except: + # Ignore cleanup errors + pass + self._cache_file_path = None + + self._initialized = False + self._root_span_index.clear() diff --git a/py/src/braintrust/test_span_cache.py b/py/src/braintrust/test_span_cache.py new file mode 100644 index 000000000..fc0b6c7ef --- /dev/null +++ b/py/src/braintrust/test_span_cache.py @@ -0,0 +1,344 @@ +"""Tests for SpanCache (disk-based cache).""" + + +from braintrust.span_cache import CachedSpan, SpanCache + + +def test_span_cache_write_and_read(): + """Test storing and retrieving spans by rootSpanId.""" + cache = SpanCache() + cache.start() # Start for testing (cache is disabled by default) + + root_span_id = "root-123" + span1 = CachedSpan( + span_id="span-1", + input={"text": "hello"}, + output={"response": "world"}, + ) + span2 = CachedSpan( + span_id="span-2", + input={"text": "foo"}, + output={"response": "bar"}, + ) + + cache.queue_write(root_span_id, span1.span_id, span1) + cache.queue_write(root_span_id, span2.span_id, span2) + + spans = cache.get_by_root_span_id(root_span_id) + assert spans is not None + assert len(spans) == 2 + + span_ids = {s.span_id for s in spans} + assert "span-1" in span_ids + assert "span-2" in span_ids + + cache.stop() + cache.dispose() + + +def test_span_cache_return_none_for_unknown(): + """Test that unknown rootSpanId returns None.""" + cache = SpanCache() + cache.start() + + spans = cache.get_by_root_span_id("nonexistent") + assert spans is None + + cache.stop() + cache.dispose() + + +def test_span_cache_merge_on_duplicate_writes(): + """Test that subsequent writes to same spanId merge data.""" + cache = SpanCache() + cache.start() + + 
root_span_id = "root-123" + span_id = "span-1" + + cache.queue_write( + root_span_id, + span_id, + CachedSpan(span_id=span_id, input={"text": "hello"}), + ) + + cache.queue_write( + root_span_id, + span_id, + CachedSpan(span_id=span_id, output={"response": "world"}), + ) + + spans = cache.get_by_root_span_id(root_span_id) + assert spans is not None + assert len(spans) == 1 + assert spans[0].span_id == span_id + assert spans[0].input == {"text": "hello"} + assert spans[0].output == {"response": "world"} + + cache.stop() + cache.dispose() + + +def test_span_cache_merge_metadata(): + """Test that metadata objects are merged.""" + cache = SpanCache() + cache.start() + + root_span_id = "root-123" + span_id = "span-1" + + cache.queue_write( + root_span_id, + span_id, + CachedSpan(span_id=span_id, metadata={"key1": "value1"}), + ) + + cache.queue_write( + root_span_id, + span_id, + CachedSpan(span_id=span_id, metadata={"key2": "value2"}), + ) + + spans = cache.get_by_root_span_id(root_span_id) + assert spans is not None + assert spans[0].metadata == {"key1": "value1", "key2": "value2"} + + cache.stop() + cache.dispose() + + +def test_span_cache_has(): + """Test the has() method.""" + cache = SpanCache() + cache.start() + + cache.queue_write("root-123", "span-1", CachedSpan(span_id="span-1")) + assert cache.has("root-123") is True + assert cache.has("nonexistent") is False + + cache.stop() + cache.dispose() + + +def test_span_cache_clear(): + """Test clearing spans for a specific rootSpanId.""" + cache = SpanCache() + cache.start() + + cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1")) + cache.queue_write("root-2", "span-2", CachedSpan(span_id="span-2")) + + cache.clear("root-1") + + assert cache.has("root-1") is False + assert cache.has("root-2") is True + + cache.stop() + cache.dispose() + + +def test_span_cache_clear_all(): + """Test clearing all cached spans.""" + cache = SpanCache() + cache.start() + + cache.queue_write("root-1", "span-1", 
CachedSpan(span_id="span-1")) + cache.queue_write("root-2", "span-2", CachedSpan(span_id="span-2")) + + cache.clear_all() + + assert cache.size == 0 + + cache.stop() + cache.dispose() + + +def test_span_cache_size(): + """Test the size property.""" + cache = SpanCache() + cache.start() + + assert cache.size == 0 + + cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1")) + assert cache.size == 1 + + cache.queue_write("root-1", "span-2", CachedSpan(span_id="span-2")) # Same root + assert cache.size == 1 + + cache.queue_write("root-2", "span-3", CachedSpan(span_id="span-3")) # Different root + assert cache.size == 2 + + cache.stop() + cache.dispose() + + +def test_span_cache_dispose(): + """Test that dispose cleans up and allows reuse.""" + cache = SpanCache() + cache.start() + + cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1")) + assert cache.size == 1 + + # Stop first to decrement refcount, then dispose + cache.stop() + cache.dispose() + + assert cache.size == 0 + assert cache.has("root-1") is False + + # Should be able to write again after dispose (if we start again) + cache.start() + cache.queue_write("root-2", "span-2", CachedSpan(span_id="span-2")) + assert cache.size == 1 + + cache.stop() + cache.dispose() + + +def test_span_cache_disable(): + """Test that disable() prevents writes.""" + cache = SpanCache() + cache.start() + + cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1")) + assert cache.size == 1 + + cache.disable() + + # Writes after disable should be no-ops + cache.queue_write("root-2", "span-2", CachedSpan(span_id="span-2")) + assert cache.size == 1 # Still 1, not 2 + + cache.stop() + cache.dispose() + + +def test_span_cache_disabled_getter(): + """Test the disabled property.""" + # Cache is disabled by default until start() is called + cache = SpanCache() + assert cache.disabled is True + + cache.start() + assert cache.disabled is False + + cache.disable() + assert cache.disabled is True + + 
cache.dispose() + + +def test_span_cache_disabled_from_constructor(): + """Test that cache can be disabled via constructor.""" + cache = SpanCache(disabled=True) + assert cache.disabled is True + + # Writes should be no-ops + cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1")) + assert cache.size == 0 + assert cache.get_by_root_span_id("root-1") is None + + cache.dispose() + + +def test_span_cache_start_stop_lifecycle(): + """Test that stop() allows start() to work again.""" + cache = SpanCache() + + # Initially disabled by default + assert cache.disabled is True + + # Start for first "eval" + cache.start() + assert cache.disabled is False + cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1")) + assert cache.size == 1 + + # Stop after first "eval" + cache.stop() + cache.dispose() + assert cache.disabled is True + + # Start for second "eval" - should work! + cache.start() + assert cache.disabled is False + cache.queue_write("root-2", "span-2", CachedSpan(span_id="span-2")) + assert cache.size == 1 + + cache.stop() + cache.dispose() + + +def test_span_cache_disable_prevents_start(): + """Test that disable() prevents start() from working.""" + cache = SpanCache() + + # Simulate disable being called + cache.disable() + assert cache.disabled is True + + # start() should be a no-op after disable() + cache.start() + assert cache.disabled is True + + # Writes should still be no-ops + cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1")) + assert cache.size == 0 + + cache.dispose() + + +def test_span_cache_parallel_eval_refcount(): + """Test reference counting for parallel evals.""" + cache = SpanCache() + + # Simulate two evals starting + cache.start() # Eval 1 + assert cache.disabled is False + + cache.start() # Eval 2 + assert cache.disabled is False + + # Write data from both evals + cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1")) + cache.queue_write("root-2", "span-2", CachedSpan(span_id="span-2")) + 
assert cache.size == 2 + + # Eval 1 finishes first + cache.dispose() # Should NOT dispose (refcount = 2) + cache.stop() # Decrements to 1 + + # Cache should still be enabled and data intact + assert cache.disabled is False + assert cache.size == 2 + assert cache.get_by_root_span_id("root-1") is not None + assert cache.get_by_root_span_id("root-2") is not None + + # Eval 2 finishes + cache.dispose() # Should NOT dispose yet (refcount = 1) + cache.stop() # Decrements to 0, disables cache + + # Now cache should be disabled + assert cache.disabled is True + + # Final dispose should now work + cache.dispose() # NOW it disposes (refcount = 0) + assert cache.size == 0 + + +def test_span_cache_refcount_underflow(): + """Test that refcount handles underflow gracefully.""" + cache = SpanCache() + + # Call stop without start + cache.stop() + + # Should work normally after + cache.start() + cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1")) + assert cache.size == 1 + + cache.stop() + cache.dispose() diff --git a/py/src/braintrust/test_trace.py b/py/src/braintrust/test_trace.py new file mode 100644 index 000000000..e2de657d6 --- /dev/null +++ b/py/src/braintrust/test_trace.py @@ -0,0 +1,267 @@ +"""Tests for Trace functionality.""" + +import pytest +from braintrust.trace import CachedSpanFetcher, SpanData + + +# Helper to create mock spans +def make_span(span_id: str, span_type: str, **extra) -> SpanData: + return SpanData( + span_id=span_id, + input={"text": f"input-{span_id}"}, + output={"text": f"output-{span_id}"}, + span_attributes={"type": span_type}, + **extra, + ) + + +class TestCachedSpanFetcher: + """Test CachedSpanFetcher caching behavior.""" + + @pytest.mark.asyncio + async def test_fetch_all_spans_without_filter(self): + """Test fetching all spans when no filter specified.""" + mock_spans = [ + make_span("span-1", "llm"), + make_span("span-2", "function"), + make_span("span-3", "llm"), + ] + + call_count = 0 + + async def fetch_fn(span_type): + 
nonlocal call_count + call_count += 1 + return mock_spans + + fetcher = CachedSpanFetcher(fetch_fn=fetch_fn) + result = await fetcher.get_spans() + + assert call_count == 1 + assert len(result) == 3 + assert {s.span_id for s in result} == {"span-1", "span-2", "span-3"} + + @pytest.mark.asyncio + async def test_fetch_specific_span_types(self): + """Test fetching specific span types when filter specified.""" + llm_spans = [make_span("span-1", "llm"), make_span("span-2", "llm")] + + call_count = 0 + + async def fetch_fn(span_type): + nonlocal call_count + call_count += 1 + assert span_type == ["llm"] + return llm_spans + + fetcher = CachedSpanFetcher(fetch_fn=fetch_fn) + result = await fetcher.get_spans(span_type=["llm"]) + + assert call_count == 1 + assert len(result) == 2 + + @pytest.mark.asyncio + async def test_return_cached_spans_after_fetching_all(self): + """Test that cached spans are returned without re-fetching after fetching all.""" + mock_spans = [ + make_span("span-1", "llm"), + make_span("span-2", "function"), + ] + + call_count = 0 + + async def fetch_fn(span_type): + nonlocal call_count + call_count += 1 + return mock_spans + + fetcher = CachedSpanFetcher(fetch_fn=fetch_fn) + + # First call - fetches + await fetcher.get_spans() + assert call_count == 1 + + # Second call - should use cache + result = await fetcher.get_spans() + assert call_count == 1 # Still 1 + assert len(result) == 2 + + @pytest.mark.asyncio + async def test_return_cached_spans_for_previously_fetched_types(self): + """Test that previously fetched types are returned from cache.""" + llm_spans = [make_span("span-1", "llm"), make_span("span-2", "llm")] + + call_count = 0 + + async def fetch_fn(span_type): + nonlocal call_count + call_count += 1 + return llm_spans + + fetcher = CachedSpanFetcher(fetch_fn=fetch_fn) + + # First call - fetches llm spans + await fetcher.get_spans(span_type=["llm"]) + assert call_count == 1 + + # Second call for same type - should use cache + result = await 
fetcher.get_spans(span_type=["llm"]) + assert call_count == 1 # Still 1 + assert len(result) == 2 + + @pytest.mark.asyncio + async def test_only_fetch_missing_span_types(self): + """Test that only missing span types are fetched.""" + llm_spans = [make_span("span-1", "llm")] + function_spans = [make_span("span-2", "function")] + + call_count = 0 + + async def fetch_fn(span_type): + nonlocal call_count + call_count += 1 + if span_type == ["llm"]: + return llm_spans + elif span_type == ["function"]: + return function_spans + return [] + + fetcher = CachedSpanFetcher(fetch_fn=fetch_fn) + + # First call - fetches llm spans + await fetcher.get_spans(span_type=["llm"]) + assert call_count == 1 + + # Second call for both types - should only fetch function + result = await fetcher.get_spans(span_type=["llm", "function"]) + assert call_count == 2 + assert len(result) == 2 + + @pytest.mark.asyncio + async def test_no_refetch_after_fetching_all_spans(self): + """Test that no re-fetching occurs after fetching all spans.""" + all_spans = [ + make_span("span-1", "llm"), + make_span("span-2", "function"), + make_span("span-3", "tool"), + ] + + call_count = 0 + + async def fetch_fn(span_type): + nonlocal call_count + call_count += 1 + return all_spans + + fetcher = CachedSpanFetcher(fetch_fn=fetch_fn) + + # Fetch all spans + await fetcher.get_spans() + assert call_count == 1 + + # Subsequent filtered calls should use cache + llm_result = await fetcher.get_spans(span_type=["llm"]) + assert call_count == 1 # Still 1 + assert len(llm_result) == 1 + assert llm_result[0].span_id == "span-1" + + function_result = await fetcher.get_spans(span_type=["function"]) + assert call_count == 1 # Still 1 + assert len(function_result) == 1 + assert function_result[0].span_id == "span-2" + + @pytest.mark.asyncio + async def test_filter_by_multiple_span_types_from_cache(self): + """Test filtering by multiple span types from cache.""" + all_spans = [ + make_span("span-1", "llm"), + make_span("span-2", 
"function"), + make_span("span-3", "tool"), + make_span("span-4", "llm"), + ] + + async def fetch_fn(span_type): + return all_spans + + fetcher = CachedSpanFetcher(fetch_fn=fetch_fn) + + # Fetch all first + await fetcher.get_spans() + + # Filter for llm and tool + result = await fetcher.get_spans(span_type=["llm", "tool"]) + assert len(result) == 3 + assert {s.span_id for s in result} == {"span-1", "span-3", "span-4"} + + @pytest.mark.asyncio + async def test_return_empty_for_nonexistent_span_type(self): + """Test that empty array is returned for non-existent span type.""" + all_spans = [make_span("span-1", "llm")] + + async def fetch_fn(span_type): + return all_spans + + fetcher = CachedSpanFetcher(fetch_fn=fetch_fn) + + # Fetch all first + await fetcher.get_spans() + + # Query for non-existent type + result = await fetcher.get_spans(span_type=["nonexistent"]) + assert len(result) == 0 + + @pytest.mark.asyncio + async def test_handle_spans_with_no_type(self): + """Test handling spans without type (empty string type).""" + spans = [ + make_span("span-1", "llm"), + SpanData(span_id="span-2", input={}, span_attributes={}), # No type + SpanData(span_id="span-3", input={}), # No span_attributes + ] + + async def fetch_fn(span_type): + return spans + + fetcher = CachedSpanFetcher(fetch_fn=fetch_fn) + + # Fetch all + result = await fetcher.get_spans() + assert len(result) == 3 + + # Spans without type go into "" bucket + no_type_result = await fetcher.get_spans(span_type=[""]) + assert len(no_type_result) == 2 + + @pytest.mark.asyncio + async def test_handle_empty_results(self): + """Test handling empty results.""" + + async def fetch_fn(span_type): + return [] + + fetcher = CachedSpanFetcher(fetch_fn=fetch_fn) + + result = await fetcher.get_spans() + assert len(result) == 0 + + # Should still mark as fetched + await fetcher.get_spans(span_type=["llm"]) + # No additional assertions, just making sure it doesn't crash + + @pytest.mark.asyncio + async def 
test_handle_empty_span_type_array(self): + """Test that empty spanType array is handled same as undefined.""" + mock_spans = [make_span("span-1", "llm")] + + call_args = [] + + async def fetch_fn(span_type): + call_args.append(span_type) + return mock_spans + + fetcher = CachedSpanFetcher(fetch_fn=fetch_fn) + + result = await fetcher.get_spans(span_type=[]) + + assert call_args[0] is None or call_args[0] == [] + assert len(result) == 1 diff --git a/py/src/braintrust/trace.py b/py/src/braintrust/trace.py new file mode 100644 index 000000000..f798c9f35 --- /dev/null +++ b/py/src/braintrust/trace.py @@ -0,0 +1,372 @@ +""" +Trace objects for accessing spans in evaluations. + +This module provides the LocalTrace class which allows scorers to access +spans from the current evaluation task without making server round-trips. +""" + +import asyncio +from typing import Any, Awaitable, Callable, Optional, Protocol + +from braintrust.logger import BraintrustState, ObjectFetcher + + +class SpanData: + """Span data returned by get_spans().""" + + def __init__( + self, + input: Optional[Any] = None, + output: Optional[Any] = None, + metadata: Optional[dict[str, Any]] = None, + span_id: Optional[str] = None, + span_parents: Optional[list[str]] = None, + span_attributes: Optional[dict[str, Any]] = None, + **kwargs: Any, + ): + self.input = input + self.output = output + self.metadata = metadata + self.span_id = span_id + self.span_parents = span_parents + self.span_attributes = span_attributes + # Store any additional fields + for key, value in kwargs.items(): + setattr(self, key, value) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "SpanData": + """Create SpanData from a dictionary.""" + return cls(**data) + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary.""" + result = {} + for key, value in self.__dict__.items(): + if value is not None: + result[key] = value + return result + + +class SpanFetcher(ObjectFetcher[dict[str, Any]]): + """ + Fetcher 
for spans by root_span_id, using the ObjectFetcher pattern. + Handles pagination automatically via cursor-based iteration. + """ + + def __init__( + self, + object_type: str, + object_id: str, + root_span_id: str, + state: BraintrustState, + span_type_filter: Optional[list[str]] = None, + ): + # Build the filter expression for root_span_id and optionally span_attributes.type + filter_expr = self._build_filter(root_span_id, span_type_filter) + + super().__init__( + object_type=object_type, + _internal_btql={"filter": filter_expr}, + ) + self._object_id = object_id + self._state = state + + @staticmethod + def _build_filter(root_span_id: str, span_type_filter: Optional[list[str]] = None) -> dict[str, Any]: + """Build BTQL filter expression.""" + children = [ + # Base filter: root_span_id = 'value' + { + "op": "eq", + "left": {"op": "ident", "name": ["root_span_id"]}, + "right": {"op": "literal", "value": root_span_id}, + }, + # Exclude span_attributes.purpose = 'scorer' + { + "op": "or", + "children": [ + { + "op": "isnull", + "expr": {"op": "ident", "name": ["span_attributes", "purpose"]}, + }, + { + "op": "ne", + "left": {"op": "ident", "name": ["span_attributes", "purpose"]}, + "right": {"op": "literal", "value": "scorer"}, + }, + ], + }, + ] + + # If span type filter specified, add it + if span_type_filter and len(span_type_filter) > 0: + children.append( + { + "op": "in", + "left": {"op": "ident", "name": ["span_attributes", "type"]}, + "right": {"op": "literal", "value": span_type_filter}, + } + ) + + return {"op": "and", "children": children} + + @property + def id(self) -> str: + return self._object_id + + def _get_state(self) -> BraintrustState: + return self._state + + +SpanFetchFn = Callable[[Optional[list[str]]], Awaitable[list[SpanData]]] + + +class CachedSpanFetcher: + """ + Cached span fetcher that handles fetching and caching spans by type. 
+ + Caching strategy: + - Cache spans by span type (dict[spanType, list[SpanData]]) + - Track if all spans have been fetched (all_fetched flag) + - When filtering by spanType, only fetch types not already in cache + """ + + def __init__( + self, + object_type: Optional[str] = None, + object_id: Optional[str] = None, + root_span_id: Optional[str] = None, + get_state: Optional[Callable[[], Awaitable[BraintrustState]]] = None, + fetch_fn: Optional[SpanFetchFn] = None, + ): + self._span_cache: dict[str, list[SpanData]] = {} + self._all_fetched = False + + if fetch_fn is not None: + # Direct fetch function injection (for testing) + self._fetch_fn = fetch_fn + else: + # Standard constructor with SpanFetcher + if object_type is None or object_id is None or root_span_id is None or get_state is None: + raise ValueError("Must provide either fetch_fn or all of object_type, object_id, root_span_id, get_state") + + async def _fetch_fn(span_type: Optional[list[str]]) -> list[SpanData]: + state = await get_state() + fetcher = SpanFetcher( + object_type=object_type, + object_id=object_id, + root_span_id=root_span_id, + state=state, + span_type_filter=span_type, + ) + rows = list(fetcher.fetch()) + # Filter out scorer spans + filtered = [ + row + for row in rows + if not ( + isinstance(row.get("span_attributes"), dict) + and row.get("span_attributes", {}).get("purpose") == "scorer" + ) + ] + return [ + SpanData( + input=row.get("input"), + output=row.get("output"), + metadata=row.get("metadata"), + span_id=row.get("span_id"), + span_parents=row.get("span_parents"), + span_attributes=row.get("span_attributes"), + id=row.get("id"), + _xact_id=row.get("_xact_id"), + _pagination_key=row.get("_pagination_key"), + root_span_id=row.get("root_span_id"), + ) + for row in filtered + ] + + self._fetch_fn = _fetch_fn + + async def get_spans(self, span_type: Optional[list[str]] = None) -> list[SpanData]: + """ + Get spans, using cache when possible. 
+ + Args: + span_type: Optional list of span types to filter by + + Returns: + List of matching spans + """ + # If we've fetched all spans, just filter from cache + if self._all_fetched: + return self._get_from_cache(span_type) + + # If no filter requested, fetch everything + if not span_type or len(span_type) == 0: + await self._fetch_spans(None) + self._all_fetched = True + return self._get_from_cache(None) + + # Find which spanTypes we don't have in cache yet + missing_types = [t for t in span_type if t not in self._span_cache] + + # If all requested types are cached, return from cache + if not missing_types: + return self._get_from_cache(span_type) + + # Fetch only the missing types + await self._fetch_spans(missing_types) + return self._get_from_cache(span_type) + + async def _fetch_spans(self, span_type: Optional[list[str]]) -> None: + """Fetch spans from the server.""" + spans = await self._fetch_fn(span_type) + + for span in spans: + span_attrs = span.span_attributes or {} + span_type_str = span_attrs.get("type", "") + if span_type_str not in self._span_cache: + self._span_cache[span_type_str] = [] + self._span_cache[span_type_str].append(span) + + def _get_from_cache(self, span_type: Optional[list[str]]) -> list[SpanData]: + """Get spans from cache, optionally filtering by type.""" + if not span_type or len(span_type) == 0: + # Return all spans + result = [] + for spans in self._span_cache.values(): + result.extend(spans) + return result + + # Return only requested types + result = [] + for type_str in span_type: + if type_str in self._span_cache: + result.extend(self._span_cache[type_str]) + return result + + +class Trace(Protocol): + """ + Interface for trace objects that can be used by scorers. + Both the SDK's LocalTrace class and the API wrapper's WrapperTrace implement this. + """ + + def get_configuration(self) -> dict[str, str]: + """Get the trace configuration (objectType, objectId, rootSpanId).""" + ... 
+ + async def get_spans(self, span_type: Optional[list[str]] = None) -> list[SpanData]: + """ + Fetch all spans for this root span. + + Args: + span_type: Optional list of span types to filter by + + Returns: + List of matching spans + """ + ... + + +class LocalTrace: + """ + SDK implementation of Trace that uses local span cache and falls back to BTQL. + Carries identifying information about the evaluation so scorers can perform + richer logging or side effects. + """ + + def __init__( + self, + object_type: str, + object_id: str, + root_span_id: str, + ensure_spans_flushed: Optional[Callable[[], Awaitable[None]]], + state: BraintrustState, + ): + self._object_type = object_type + self._object_id = object_id + self._root_span_id = root_span_id + self._ensure_spans_flushed = ensure_spans_flushed + self._state = state + self._spans_flushed = False + self._spans_flush_promise: Optional[asyncio.Task[None]] = None + + async def get_state() -> BraintrustState: + await self._ensure_spans_ready() + # Ensure state is logged in + await asyncio.get_event_loop().run_in_executor(None, lambda: state.login()) + return state + + self._cached_fetcher = CachedSpanFetcher( + object_type=object_type, + object_id=object_id, + root_span_id=root_span_id, + get_state=get_state, + ) + + def get_configuration(self) -> dict[str, str]: + """Get the trace configuration.""" + return { + "objectType": self._object_type, + "objectId": self._object_id, + "rootSpanId": self._root_span_id, + } + + async def get_spans(self, span_type: Optional[list[str]] = None) -> list[SpanData]: + """ + Fetch all rows for this root span from its parent object (experiment or project logs). + First checks the local span cache for recently logged spans, then falls + back to CachedSpanFetcher which handles BTQL fetching and caching. 
+ + Args: + span_type: Optional list of span types to filter by + + Returns: + List of matching spans + """ + # Try local span cache first (for recently logged spans not yet flushed) + cached_spans = self._state.span_cache.get_by_root_span_id(self._root_span_id) + if cached_spans and len(cached_spans) > 0: + # Filter by purpose + spans = [span for span in cached_spans if not (span.span_attributes or {}).get("purpose") == "scorer"] + + # Filter by span type if requested + if span_type and len(span_type) > 0: + spans = [span for span in spans if (span.span_attributes or {}).get("type", "") in span_type] + + # Convert to SpanData + return [ + SpanData( + input=span.input, + output=span.output, + metadata=span.metadata, + span_id=span.span_id, + span_parents=span.span_parents, + span_attributes=span.span_attributes, + ) + for span in spans + ] + + # Fall back to CachedSpanFetcher for BTQL fetching with caching + return await self._cached_fetcher.get_spans(span_type) + + async def _ensure_spans_ready(self) -> None: + """Ensure spans are flushed before fetching.""" + if self._spans_flushed or not self._ensure_spans_flushed: + return + + if self._spans_flush_promise is None: + + async def flush_and_mark(): + try: + await self._ensure_spans_flushed() + self._spans_flushed = True + except Exception as err: + self._spans_flush_promise = None + raise err + + self._spans_flush_promise = asyncio.create_task(flush_and_mark()) + + await self._spans_flush_promise From bc1f47397575476422ab23cc39fb32f7d0242d38 Mon Sep 17 00:00:00 2001 From: Caitlin Pinn Date: Tue, 13 Jan 2026 16:53:20 -0800 Subject: [PATCH 51/65] fix bundler test --- js/src/cli/index.ts | 2 +- js/src/cli/util/external-packages-plugin.ts | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/js/src/cli/index.ts b/js/src/cli/index.ts index 4c04e312c..8080e5a30 100755 --- a/js/src/cli/index.ts +++ b/js/src/cli/index.ts @@ -338,7 +338,7 @@ async function initFile({ plugins, externalPackages, }), - external: 
[], + external: ["fsevents", "chokidar"], write: true, plugins: [], minify: true, diff --git a/js/src/cli/util/external-packages-plugin.ts b/js/src/cli/util/external-packages-plugin.ts index 5d540bb33..115a98bff 100644 --- a/js/src/cli/util/external-packages-plugin.ts +++ b/js/src/cli/util/external-packages-plugin.ts @@ -18,6 +18,8 @@ export function createMarkKnownPackagesExternalPlugin( "config", "lightningcss", "@mapbox/node-pre-gyp", + "fsevents", + "chokidar", ...additionalPackages, ]; const escapedPackages = knownPackages.map((pkg) => { From d2226f722fbb6c49e4dd9bcb46d8c42bb4143d9b Mon Sep 17 00:00:00 2001 From: Ankur Goyal Date: Tue, 13 Jan 2026 20:40:20 -0800 Subject: [PATCH 52/65] ag fixes --- py/src/braintrust/framework.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py index f0335406c..f266c8eb6 100644 --- a/py/src/braintrust/framework.py +++ b/py/src/braintrust/framework.py @@ -1313,11 +1313,13 @@ async def await_or_run_scorer(root_span, scorer, name, **kwargs): {**parent_propagated}, {"span_attributes": {"purpose": "scorer"}}, ) + # Strip trace from logged input - it's internal plumbing that shouldn't appear in spans + logged_input = {k: v for k, v in kwargs.items() if k != "trace"} with root_span.start_span( name=name, span_attributes={"type": SpanTypeAttribute.SCORE, "purpose": "scorer"}, propagated_event=merged_propagated, - input=dict(**kwargs), + input=logged_input, ) as span: score = scorer if hasattr(scorer, "eval_async"): From d00c30c5368030c87a192789ccb73b523acb37d1 Mon Sep 17 00:00:00 2001 From: Ankur Goyal Date: Wed, 14 Jan 2026 00:01:08 -0800 Subject: [PATCH 53/65] fix invoke --- py/src/braintrust/functions/invoke.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/py/src/braintrust/functions/invoke.py b/py/src/braintrust/functions/invoke.py index 3aecc3a73..b3b3faf4e 100644 --- a/py/src/braintrust/functions/invoke.py +++ 
b/py/src/braintrust/functions/invoke.py @@ -249,8 +249,20 @@ def f(*args: Any, **kwargs: Any) -> Any: # Task. return invoke(project_name=project_name, slug=slug, version=version, input=args[0]) else: - # Scorer. - return invoke(project_name=project_name, slug=slug, version=version, input=kwargs) + # Scorer - convert trace to trace_ref for remote invocation + scorer_input = {} + for k, v in kwargs.items(): + if k == "trace" and v is not None: + # Convert LocalTrace to trace_ref dict for remote invocation + config = v.get_configuration() + scorer_input["_trace_ref"] = { + "object_type": config.get("objectType"), + "object_id": config.get("objectId"), + "root_span_id": config.get("rootSpanId"), + } + else: + scorer_input[k] = v + return invoke(project_name=project_name, slug=slug, version=version, input=scorer_input) f.__name__ = f"init_function-{project_name}-{slug}-{version or 'latest'}" return f From a53b21a71047212fb24d4a418dd6123721b58a1e Mon Sep 17 00:00:00 2001 From: Ankur Goyal Date: Wed, 14 Jan 2026 22:11:15 -0800 Subject: [PATCH 54/65] snapshot --- js/src/functions/invoke.ts | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/js/src/functions/invoke.ts b/js/src/functions/invoke.ts index d56754d68..715b8c356 100644 --- a/js/src/functions/invoke.ts +++ b/js/src/functions/invoke.ts @@ -281,11 +281,33 @@ export function initFunction({ // eslint-disable-next-line @typescript-eslint/no-explicit-any const f = async (input: any): Promise => { + // When used as a scorer, input contains { input, output, expected, metadata, trace } + // Strip trace (not serializable) and convert to trace_ref for the remote function + let invokeInput = input; + if (input && typeof input === "object" && "trace" in input) { + const { trace, ...rest } = input; + // Duck-type check for Trace interface (has getConfiguration method) + if (trace && typeof trace.getConfiguration === "function") { + const configuration = trace.getConfiguration(); + 
invokeInput = { + ...rest, + trace_ref: { + object_type: configuration.objectType, + object_id: configuration.objectId, + root_span_id: configuration.rootSpanId, + }, + }; + } else { + // trace exists but isn't a Trace object, just strip it + invokeInput = rest; + } + } + return await invoke({ projectName, slug, version, - input, + input: invokeInput, }); }; From d0c375c0e8a1eea0ef2bde85112621458e1f7032 Mon Sep 17 00:00:00 2001 From: Ankur Goyal Date: Wed, 14 Jan 2026 23:10:08 -0800 Subject: [PATCH 55/65] Revert "snapshot" This reverts commit a53b21a71047212fb24d4a418dd6123721b58a1e. --- js/src/functions/invoke.ts | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/js/src/functions/invoke.ts b/js/src/functions/invoke.ts index 715b8c356..d56754d68 100644 --- a/js/src/functions/invoke.ts +++ b/js/src/functions/invoke.ts @@ -281,33 +281,11 @@ export function initFunction({ // eslint-disable-next-line @typescript-eslint/no-explicit-any const f = async (input: any): Promise => { - // When used as a scorer, input contains { input, output, expected, metadata, trace } - // Strip trace (not serializable) and convert to trace_ref for the remote function - let invokeInput = input; - if (input && typeof input === "object" && "trace" in input) { - const { trace, ...rest } = input; - // Duck-type check for Trace interface (has getConfiguration method) - if (trace && typeof trace.getConfiguration === "function") { - const configuration = trace.getConfiguration(); - invokeInput = { - ...rest, - trace_ref: { - object_type: configuration.objectType, - object_id: configuration.objectId, - root_span_id: configuration.rootSpanId, - }, - }; - } else { - // trace exists but isn't a Trace object, just strip it - invokeInput = rest; - } - } - return await invoke({ projectName, slug, version, - input: invokeInput, + input, }); }; From 9eb327ee74e2d8df751a0a07ed3467f49df50753 Mon Sep 17 00:00:00 2001 From: Ankur Goyal Date: Wed, 14 Jan 2026 23:10:19 
-0800 Subject: [PATCH 56/65] Revert "fix invoke" This reverts commit d00c30c5368030c87a192789ccb73b523acb37d1. --- py/src/braintrust/functions/invoke.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/py/src/braintrust/functions/invoke.py b/py/src/braintrust/functions/invoke.py index b3b3faf4e..3aecc3a73 100644 --- a/py/src/braintrust/functions/invoke.py +++ b/py/src/braintrust/functions/invoke.py @@ -249,20 +249,8 @@ def f(*args: Any, **kwargs: Any) -> Any: # Task. return invoke(project_name=project_name, slug=slug, version=version, input=args[0]) else: - # Scorer - convert trace to trace_ref for remote invocation - scorer_input = {} - for k, v in kwargs.items(): - if k == "trace" and v is not None: - # Convert LocalTrace to trace_ref dict for remote invocation - config = v.get_configuration() - scorer_input["_trace_ref"] = { - "object_type": config.get("objectType"), - "object_id": config.get("objectId"), - "root_span_id": config.get("rootSpanId"), - } - else: - scorer_input[k] = v - return invoke(project_name=project_name, slug=slug, version=version, input=scorer_input) + # Scorer. 
+ return invoke(project_name=project_name, slug=slug, version=version, input=kwargs) f.__name__ = f"init_function-{project_name}-{slug}-{version or 'latest'}" return f From c27c5388837806a9bda7619f2cb6b2693a5a3653 Mon Sep 17 00:00:00 2001 From: Ankur Goyal Date: Thu, 15 Jan 2026 00:01:13 -0800 Subject: [PATCH 57/65] fix json serializability --- py/src/braintrust/trace.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/py/src/braintrust/trace.py b/py/src/braintrust/trace.py index f798c9f35..aab078ee7 100644 --- a/py/src/braintrust/trace.py +++ b/py/src/braintrust/trace.py @@ -254,7 +254,7 @@ class Trace(Protocol): """ def get_configuration(self) -> dict[str, str]: - """Get the trace configuration (objectType, objectId, rootSpanId).""" + """Get the trace configuration (object_type, object_id, root_span_id).""" ... async def get_spans(self, span_type: Optional[list[str]] = None) -> list[SpanData]: @@ -270,11 +270,15 @@ async def get_spans(self, span_type: Optional[list[str]] = None) -> list[SpanDat ... -class LocalTrace: +class LocalTrace(dict): """ SDK implementation of Trace that uses local span cache and falls back to BTQL. Carries identifying information about the evaluation so scorers can perform richer logging or side effects. + + Inherits from dict so that it serializes to {"trace_ref": {...}} when passed + to json.dumps(). This allows LocalTrace to be transparently serialized when + passed through invoke() or other JSON-serializing code paths. 
""" def __init__( @@ -285,6 +289,15 @@ def __init__( ensure_spans_flushed: Optional[Callable[[], Awaitable[None]]], state: BraintrustState, ): + # Initialize dict with trace_ref for JSON serialization + super().__init__({ + "trace_ref": { + "object_type": object_type, + "object_id": object_id, + "root_span_id": root_span_id, + } + }) + self._object_type = object_type self._object_id = object_id self._root_span_id = root_span_id @@ -309,9 +322,9 @@ async def get_state() -> BraintrustState: def get_configuration(self) -> dict[str, str]: """Get the trace configuration.""" return { - "objectType": self._object_type, - "objectId": self._object_id, - "rootSpanId": self._root_span_id, + "object_type": self._object_type, + "object_id": self._object_id, + "root_span_id": self._root_span_id, } async def get_spans(self, span_type: Optional[list[str]] = None) -> list[SpanData]: From c53ff5079d5fd65a88c50d2608a829b9282eef5c Mon Sep 17 00:00:00 2001 From: Alex Z Date: Thu, 15 Jan 2026 10:54:36 -0800 Subject: [PATCH 58/65] rename config fields --- js/src/trace.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/js/src/trace.ts b/js/src/trace.ts index 49c1794f2..64d329837 100644 --- a/js/src/trace.ts +++ b/js/src/trace.ts @@ -228,9 +228,9 @@ export class CachedSpanFetcher { */ export interface Trace { getConfiguration(): { - objectType: string; - objectId: string; - rootSpanId: string; + object_type: string; + object_id: string; + root_span_id: string; }; getSpans(options?: { spanType?: string[] }): Promise; } @@ -276,9 +276,9 @@ export class LocalTrace implements Trace { getConfiguration() { return { - objectType: this.objectType, - objectId: this.objectId, - rootSpanId: this.rootSpanId, + object_type: this.objectType, + object_id: this.objectId, + root_span_id: this.rootSpanId, }; } From 131457322c311da83dd161889c4bbb9647bd2086 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Thu, 15 Jan 2026 19:50:47 -0800 Subject: [PATCH 59/65] wip otel stuff --- 
py/src/braintrust/functions/invoke.py | 4 +- py/src/braintrust/functions/test_invoke.py | 61 +++++++++++++++ py/src/braintrust/logger.py | 35 +++++++++ py/src/braintrust/test_logger.py | 89 ++++++++++++++++++++++ 4 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 py/src/braintrust/functions/test_invoke.py diff --git a/py/src/braintrust/functions/invoke.py b/py/src/braintrust/functions/invoke.py index 3aecc3a73..5c566c3f9 100644 --- a/py/src/braintrust/functions/invoke.py +++ b/py/src/braintrust/functions/invoke.py @@ -3,7 +3,7 @@ from sseclient import SSEClient from .._generated_types import FunctionTypeEnum -from ..logger import Exportable, get_span_parent_object, login, proxy_conn +from ..logger import Exportable, _internal_get_global_state, get_span_parent_object, login, proxy_conn from ..util import response_raise_for_status from .constants import INVOKE_API_VERSION from .stream import BraintrustInvokeError, BraintrustStream @@ -243,6 +243,8 @@ def init_function(project_name: str, slug: str, version: str | None = None): :param version: Optional version of the function to use. Defaults to latest. :return: A function that can be used as a task or scorer. 
""" + # Disable span cache since remote function spans won't be in the local cache + _internal_get_global_state().span_cache.disable() def f(*args: Any, **kwargs: Any) -> Any: if len(args) > 0: diff --git a/py/src/braintrust/functions/test_invoke.py b/py/src/braintrust/functions/test_invoke.py new file mode 100644 index 000000000..c38e2e105 --- /dev/null +++ b/py/src/braintrust/functions/test_invoke.py @@ -0,0 +1,61 @@ +"""Tests for the invoke module, particularly init_function.""" + + +from braintrust.functions.invoke import init_function +from braintrust.logger import _internal_get_global_state, _internal_reset_global_state + + +class TestInitFunction: + """Tests for init_function.""" + + def setup_method(self): + """Reset state before each test.""" + _internal_reset_global_state() + + def teardown_method(self): + """Clean up after each test.""" + _internal_reset_global_state() + + def test_init_function_disables_span_cache(self): + """Test that init_function disables the span cache.""" + state = _internal_get_global_state() + + # Cache should be disabled by default (it's only enabled during evals) + assert state.span_cache.disabled is True + + # Enable the cache (simulating what happens during eval) + state.span_cache.start() + assert state.span_cache.disabled is False + + # Call init_function + f = init_function("test-project", "test-function") + + # Cache should now be disabled (init_function explicitly disables it) + assert state.span_cache.disabled is True + assert f.__name__ == "init_function-test-project-test-function-latest" + + def test_init_function_with_version(self): + """Test that init_function creates a function with the correct name including version.""" + f = init_function("my-project", "my-scorer", version="v1") + assert f.__name__ == "init_function-my-project-my-scorer-v1" + + def test_init_function_without_version_uses_latest(self): + """Test that init_function uses 'latest' in name when version not specified.""" + f = 
init_function("my-project", "my-scorer") + assert f.__name__ == "init_function-my-project-my-scorer-latest" + + def test_init_function_permanently_disables_cache(self): + """Test that init_function permanently disables the cache (can't be re-enabled).""" + state = _internal_get_global_state() + + # Enable the cache + state.span_cache.start() + assert state.span_cache.disabled is False + + # Call init_function + init_function("test-project", "test-function") + assert state.span_cache.disabled is True + + # Try to start again - should still be disabled because of explicit disable + state.span_cache.start() + assert state.span_cache.disabled is True diff --git a/py/src/braintrust/logger.py b/py/src/braintrust/logger.py index ffa99ade7..7bbf357c7 100644 --- a/py/src/braintrust/logger.py +++ b/py/src/braintrust/logger.py @@ -396,6 +396,7 @@ def default_get_api_conn(): from braintrust.span_cache import SpanCache self.span_cache = SpanCache() + self._otel_flush_callback: Any | None = None def reset_login_info(self): self.app_url: str | None = None @@ -453,6 +454,21 @@ def context_manager(self): return self._context_manager + def register_otel_flush(self, callback: Any) -> None: + """ + Register an OTEL flush callback. This is called by the OTEL integration + when it initializes a span processor/exporter. + """ + self._otel_flush_callback = callback + + async def flush_otel(self) -> None: + """ + Flush OTEL spans if a callback is registered. + Called during ensure_spans_flushed to ensure OTEL spans are visible in BTQL. + """ + if self._otel_flush_callback: + await self._otel_flush_callback() + def copy_state(self, other: "BraintrustState"): """Copy login information from another BraintrustState instance.""" self.__dict__.update({ @@ -1762,6 +1778,25 @@ def login( _state.login(app_url=app_url, api_key=api_key, org_name=org_name, force_login=force_login) +def register_otel_flush(callback: Any) -> None: + """ + Register a callback to flush OTEL spans. 
This is called by the OTEL integration + when it initializes a span processor/exporter. + + When ensure_spans_flushed is called (e.g., before a BTQL query in scorers), + this callback will be invoked to ensure OTEL spans are flushed to the server. + + Also disables the span cache, since OTEL spans aren't in the local cache + and we need BTQL to see the complete span tree (both native + OTEL spans). + + :param callback: The async callback function to flush OTEL spans. + """ + global _state + _state.register_otel_flush(callback) + # Disable span cache since OTEL spans aren't in the local cache + _state.span_cache.disable() + + def login_to_state( app_url: str | None = None, api_key: str | None = None, diff --git a/py/src/braintrust/test_logger.py b/py/src/braintrust/test_logger.py index 25fd4baa2..cdb222067 100644 --- a/py/src/braintrust/test_logger.py +++ b/py/src/braintrust/test_logger.py @@ -2434,6 +2434,95 @@ def test_logger_export_respects_otel_compat_enabled(): assert version == 4, f"Expected V4 encoding (version=4), got version={version}" +def test_register_otel_flush_callback(): + """Test that register_otel_flush registers a callback correctly.""" + import asyncio + + from braintrust import register_otel_flush + from braintrust.logger import _internal_get_global_state + from braintrust.test_helpers import init_test_logger + + init_test_logger(__name__) + state = _internal_get_global_state() + + # Track if callback was invoked + callback_invoked = False + + async def mock_flush(): + nonlocal callback_invoked + callback_invoked = True + + # Register the callback + register_otel_flush(mock_flush) + + # Calling flush_otel should invoke the registered callback + asyncio.run(state.flush_otel()) + + assert callback_invoked is True + + +def test_register_otel_flush_disables_span_cache(): + """Test that register_otel_flush disables the span cache.""" + from braintrust import register_otel_flush + from braintrust.logger import _internal_get_global_state + from 
braintrust.test_helpers import init_test_logger + + init_test_logger(__name__) + state = _internal_get_global_state() + + # Enable the cache (simulating what happens during eval) + state.span_cache.start() + assert state.span_cache.disabled is False + + async def mock_flush(): + pass + + # Register OTEL flush + register_otel_flush(mock_flush) + + # Cache should now be disabled + assert state.span_cache.disabled is True + + +def test_flush_otel_noop_when_no_callback(): + """Test that flush_otel is a no-op when no callback is registered.""" + import asyncio + + from braintrust.logger import _internal_get_global_state + from braintrust.test_helpers import init_test_logger + + init_test_logger(__name__) + state = _internal_get_global_state() + + # Should not throw even with no callback registered + asyncio.run(state.flush_otel()) + + +def test_register_otel_flush_permanently_disables_cache(): + """Test that register_otel_flush permanently disables the cache.""" + from braintrust import register_otel_flush + from braintrust.logger import _internal_get_global_state + from braintrust.test_helpers import init_test_logger + + init_test_logger(__name__) + state = _internal_get_global_state() + + # Enable the cache + state.span_cache.start() + assert state.span_cache.disabled is False + + async def mock_flush(): + pass + + # Register OTEL flush + register_otel_flush(mock_flush) + assert state.span_cache.disabled is True + + # Try to start again - should still be disabled because of explicit disable + state.span_cache.start() + assert state.span_cache.disabled is True + + class TestJSONAttachment(TestCase): def test_create_attachment_from_json_data(self): """Test creating an attachment from JSON data.""" From 67abadc3e23d3d59547894e3cc47b41f7208918d Mon Sep 17 00:00:00 2001 From: Ankur Goyal Date: Thu, 15 Jan 2026 22:27:05 -0800 Subject: [PATCH 60/65] add a toJSON method --- js/src/trace.ts | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/js/src/trace.ts 
b/js/src/trace.ts index 64d329837..965416e5b 100644 --- a/js/src/trace.ts +++ b/js/src/trace.ts @@ -282,6 +282,20 @@ export class LocalTrace implements Trace { }; } + /** + * Custom JSON serialization - returns trace_ref format so LocalTrace + * can be safely passed through JSON.stringify() (e.g., in invoke()). + */ + toJSON() { + return { + trace_ref: { + object_type: this.objectType, + object_id: this.objectId, + root_span_id: this.rootSpanId, + }, + }; + } + /** * Fetch all rows for this root span from its parent object (experiment or project logs). * First checks the local span cache for recently logged spans, then falls From 5c18fe46b087b4c7ffb30930f8b1f55da4f13d35 Mon Sep 17 00:00:00 2001 From: Ankur Goyal Date: Thu, 15 Jan 2026 18:28:56 -0800 Subject: [PATCH 61/65] init-dataset-with-id (#1276) Make it easier to initialize an experiment's dataset reference without calling `initDataset` --- js/src/logger.test.ts | 19 +++++++++++++++++ js/src/logger.ts | 35 ++++++++++++++++++++++++++++---- py/src/braintrust/logger.py | 27 ++++++++++++++++++++---- py/src/braintrust/test_logger.py | 27 ++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 8 deletions(-) diff --git a/js/src/logger.test.ts b/js/src/logger.test.ts index 4882c478a..7123985ed 100644 --- a/js/src/logger.test.ts +++ b/js/src/logger.test.ts @@ -147,6 +147,25 @@ test("init validation", () => { ); }); +test("init accepts dataset with id only", () => { + // Test that the type system accepts {id: string} + const datasetIdOnly = { id: "dataset-id-123" }; + + // This should compile without type errors + // We're testing the type system, not the runtime behavior + expect(datasetIdOnly.id).toBe("dataset-id-123"); + expect("version" in datasetIdOnly).toBe(false); +}); + +test("init accepts dataset with id and version", () => { + // Test that the type system accepts {id: string, version?: string} + const datasetWithVersion = { id: "dataset-id-123", version: "v2" }; + + // This should compile without type 
errors + expect(datasetWithVersion.id).toBe("dataset-id-123"); + expect(datasetWithVersion.version).toBe("v2"); +}); + describe("prompt.build structured output templating", () => { test("applies nunjucks templating inside schema", () => { const prompt = new Prompt( diff --git a/js/src/logger.ts b/js/src/logger.ts index 51454a6e6..c1e2c62e7 100644 --- a/js/src/logger.ts +++ b/js/src/logger.ts @@ -2953,10 +2953,18 @@ type InitOpenOption = { open?: IsOpen; }; +/** + * Reference to a dataset by ID and optional version. + */ +export interface DatasetRef { + id: string; + version?: string; +} + export type InitOptions = FullLoginOptions & { experiment?: string; description?: string; - dataset?: AnyDataset; + dataset?: AnyDataset | DatasetRef; update?: boolean; baseExperiment?: string; isPublic?: boolean; @@ -3167,8 +3175,21 @@ export function init( } if (dataset !== undefined) { - args["dataset_id"] = await dataset.id; - args["dataset_version"] = await dataset.version(); + if ( + "id" in dataset && + typeof dataset.id === "string" && + !("__braintrust_dataset_marker" in dataset) + ) { + // Simple {id: ..., version?: ...} object + args["dataset_id"] = dataset.id; + if ("version" in dataset && dataset.version !== undefined) { + args["dataset_version"] = dataset.version; + } + } else { + // Full Dataset object + args["dataset_id"] = await (dataset as AnyDataset).id; + args["dataset_version"] = await (dataset as AnyDataset).version(); + } } if (isPublic !== undefined) { @@ -3217,7 +3238,13 @@ export function init( }, ); - const ret = new Experiment(state, lazyMetadata, dataset); + const ret = new Experiment( + state, + lazyMetadata, + dataset !== undefined && "version" in dataset + ? (dataset as AnyDataset) + : undefined, + ); if (options.setCurrent ?? 
true) { state.currentExperiment = ret; } diff --git a/py/src/braintrust/logger.py b/py/src/braintrust/logger.py index 7bbf357c7..ab908f19d 100644 --- a/py/src/braintrust/logger.py +++ b/py/src/braintrust/logger.py @@ -98,6 +98,14 @@ Metadata = dict[str, Any] DATA_API_VERSION = 2 + +class DatasetRef(TypedDict, total=False): + """Reference to a dataset by ID and optional version.""" + + id: str + version: str + + T = TypeVar("T") TMapping = TypeVar("TMapping", bound=Mapping[str, Any]) TMutableMapping = TypeVar("TMutableMapping", bound=MutableMapping[str, Any]) @@ -1314,7 +1322,7 @@ def init( project: str | None = None, experiment: str | None = None, description: str | None = None, - dataset: Optional["Dataset"] = None, + dataset: Optional["Dataset"] | DatasetRef = None, open: bool = False, base_experiment: str | None = None, is_public: bool = False, @@ -1431,8 +1439,15 @@ def compute_metadata(): args["ancestor_commits"] = list(get_past_n_ancestors()) if dataset is not None: - args["dataset_id"] = dataset.id - args["dataset_version"] = dataset.version + if isinstance(dataset, dict): + # Simple {"id": ..., "version": ...} dict + args["dataset_id"] = dataset["id"] + if "version" in dataset: + args["dataset_version"] = dataset["version"] + else: + # Full Dataset object + args["dataset_id"] = dataset.id + args["dataset_version"] = dataset.version if is_public is not None: args["public"] = is_public @@ -1463,7 +1478,11 @@ def compute_metadata(): # For experiments, disable queue size limit enforcement (unlimited queue) state.enforce_queue_size_limit(False) - ret = Experiment(lazy_metadata=LazyValue(compute_metadata, use_mutex=True), dataset=dataset, state=state) + ret = Experiment( + lazy_metadata=LazyValue(compute_metadata, use_mutex=True), + dataset=dataset if isinstance(dataset, Dataset) else None, + state=state, + ) if set_current: state.current_experiment = ret return ret diff --git a/py/src/braintrust/test_logger.py b/py/src/braintrust/test_logger.py index 
cdb222067..afac5c199 100644 --- a/py/src/braintrust/test_logger.py +++ b/py/src/braintrust/test_logger.py @@ -59,6 +59,33 @@ def test_init_validation(self): assert str(cm.exception) == "Cannot open an experiment without specifying its name" + def test_init_with_dataset_id_only(self): + """Test that init accepts dataset={'id': '...'} parameter""" + # Test the logic that extracts dataset_id from the dict + from braintrust.logger import Dataset + + # Test 1: dict with only id + dataset_dict = {"id": "dataset-id-123"} + assert isinstance(dataset_dict, dict) + assert not isinstance(dataset_dict, Dataset) + assert dataset_dict["id"] == "dataset-id-123" + + # Test 2: full Dataset object has different behavior + # (We can't easily instantiate a Dataset here, but we can verify + # that the isinstance check distinguishes them) + + def test_init_with_dataset_id_and_version(self): + """Test that init accepts dataset={'id': '...', 'version': '...'} parameter""" + # Test the logic that extracts both dataset_id and dataset_version from the dict + from braintrust.logger import Dataset + + # Test: dict with id and version + dataset_dict = {"id": "dataset-id-123", "version": "v2"} + assert isinstance(dataset_dict, dict) + assert not isinstance(dataset_dict, Dataset) + assert dataset_dict["id"] == "dataset-id-123" + assert dataset_dict["version"] == "v2" + class TestLogger(TestCase): def test_extract_attachments_no_op(self): From a7b13f8f42a50b2800612b53e4537ce3644da10a Mon Sep 17 00:00:00 2001 From: Alex Z Date: Tue, 20 Jan 2026 16:31:49 -0800 Subject: [PATCH 62/65] fix python --- integrations/openai-agents-js/src/index.ts | 3 +- .../src/openai-agents-integration.test.ts | 43 ++++++++++++++----- py/src/braintrust/framework.py | 10 ++--- 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/integrations/openai-agents-js/src/index.ts b/integrations/openai-agents-js/src/index.ts index 95a18edd5..4a6a565d3 100644 --- a/integrations/openai-agents-js/src/index.ts +++ 
b/integrations/openai-agents-js/src/index.ts @@ -382,7 +382,8 @@ export class OpenAIAgentsTraceProcessor { if (!data.metrics.completion_tokens && usage.completionTokens) data.metrics.completion_tokens = usage.completionTokens; if (usage.input_tokens_details?.cached_tokens != null) - data.metrics.prompt_cached_tokens = usage.input_tokens_details.cached_tokens; + data.metrics.prompt_cached_tokens = + usage.input_tokens_details.cached_tokens; } return data; diff --git a/integrations/openai-agents-js/src/openai-agents-integration.test.ts b/integrations/openai-agents-js/src/openai-agents-integration.test.ts index 618da167b..75c954942 100644 --- a/integrations/openai-agents-js/src/openai-agents-integration.test.ts +++ b/integrations/openai-agents-js/src/openai-agents-integration.test.ts @@ -908,7 +908,7 @@ describe( output_tokens: 50, total_tokens: 150, input_tokens_details: { - cached_tokens: 80, // check for this later + cached_tokens: 80, // check for this later }, }, }, @@ -934,9 +934,17 @@ describe( const metrics = (responseSpanLog as any).metrics; assert.ok(metrics, "Response span should have metrics"); assert.equal(metrics.prompt_tokens, 100, "Should have prompt_tokens"); - assert.equal(metrics.completion_tokens, 50, "Should have completion_tokens"); + assert.equal( + metrics.completion_tokens, + 50, + "Should have completion_tokens", + ); assert.equal(metrics.tokens, 150, "Should have total tokens"); - assert.equal(metrics.prompt_cached_tokens, 80, "Should extract cached_tokens to prompt_cached_tokens"); + assert.equal( + metrics.prompt_cached_tokens, + 80, + "Should extract cached_tokens to prompt_cached_tokens", + ); }); test("Response span handles zero cached tokens correctly", async () => { @@ -965,7 +973,7 @@ describe( input_tokens: 100, output_tokens: 50, input_tokens_details: { - cached_tokens: 0, // Zero is a valid value + cached_tokens: 0, // Zero is a valid value }, }, }, @@ -977,7 +985,9 @@ describe( await processor.onSpanEnd(responseSpan); const spans 
= await backgroundLogger.drain(); - const responseSpanLog = spans.find((s: any) => s.span_attributes?.type === "llm"); + const responseSpanLog = spans.find( + (s: any) => s.span_attributes?.type === "llm", + ); const metrics = (responseSpanLog as any).metrics; // Zero should be logged, not skipped @@ -1024,11 +1034,16 @@ describe( await processor.onSpanEnd(responseSpan); const spans = await backgroundLogger.drain(); - const responseSpanLog = spans.find((s: any) => s.span_attributes?.type === "llm"); + const responseSpanLog = spans.find( + (s: any) => s.span_attributes?.type === "llm", + ); const metrics = (responseSpanLog as any).metrics; // Should not have prompt_cached_tokens if not present in usage - assert.isUndefined(metrics.prompt_cached_tokens, "Should not add prompt_cached_tokens if not in usage"); + assert.isUndefined( + metrics.prompt_cached_tokens, + "Should not add prompt_cached_tokens if not in usage", + ); }); test("Generation span extracts cached tokens from usage", async () => { @@ -1060,7 +1075,7 @@ describe( output_tokens: 75, total_tokens: 275, input_tokens_details: { - cached_tokens: 150, // Test Generation span extraction + cached_tokens: 150, // Test Generation span extraction }, }, }, @@ -1080,8 +1095,16 @@ describe( const metrics = (generationSpanLog as any).metrics; assert.ok(metrics, "Generation span should have metrics"); assert.equal(metrics.prompt_tokens, 200, "Should have prompt_tokens"); - assert.equal(metrics.completion_tokens, 75, "Should have completion_tokens"); - assert.equal(metrics.prompt_cached_tokens, 150, "Should extract cached_tokens from Generation span"); + assert.equal( + metrics.completion_tokens, + 75, + "Should have completion_tokens", + ); + assert.equal( + metrics.prompt_cached_tokens, + 150, + "Should extract cached_tokens from Generation span", + ); }); }, ); diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py index f266c8eb6..b61f1826b 100644 --- a/py/src/braintrust/framework.py +++ 
b/py/src/braintrust/framework.py @@ -1446,15 +1446,11 @@ def report_progress(event: TaskProgressEvent): async def ensure_spans_flushed(): # Flush native Braintrust spans if experiment: - from braintrust.logger import flush as flush_logger - await asyncio.get_event_loop().run_in_executor( - None, lambda: flush_logger(state=experiment._state) + None, lambda: experiment.state.flush() ) elif state: - from braintrust.logger import flush as flush_logger - - await asyncio.get_event_loop().run_in_executor(None, lambda: flush_logger(state=state)) + await asyncio.get_event_loop().run_in_executor(None, lambda: state.flush()) else: from braintrust.logger import flush as flush_logger @@ -1472,7 +1468,7 @@ async def ensure_spans_flushed(): # Get the state to use trace_state = state if not trace_state and experiment: - trace_state = experiment._state + trace_state = experiment.state if not trace_state: # Fall back to global state from braintrust.logger import _internal_get_global_state From 8bb623d56526a3057fa779c174a752fc278c1600 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Wed, 21 Jan 2026 10:01:47 -0800 Subject: [PATCH 63/65] python updates --- py/src/braintrust/framework.py | 4 ++++ py/src/braintrust/span_cache.py | 39 ++++++++++++++++++++------------- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py index b61f1826b..1b59478fd 100644 --- a/py/src/braintrust/framework.py +++ b/py/src/braintrust/framework.py @@ -1456,6 +1456,10 @@ async def ensure_spans_flushed(): await asyncio.get_event_loop().run_in_executor(None, flush_logger) + # Also flush OTEL spans if registered + if state: + await state.flush_otel() + experiment_id = None if experiment: try: diff --git a/py/src/braintrust/span_cache.py b/py/src/braintrust/span_cache.py index 9f5338e04..17148cdec 100644 --- a/py/src/braintrust/span_cache.py +++ b/py/src/braintrust/span_cache.py @@ -155,16 +155,22 @@ def _ensure_initialized(self) -> None: if 
self.disabled or self._initialized: return - # Create temporary file - unique_id = f"{int(os.times().elapsed * 1000000)}-{uuid.uuid4().hex[:8]}" - self._cache_file_path = os.path.join(tempfile.gettempdir(), f"braintrust-span-cache-{unique_id}.jsonl") + try: + # Create temporary file + unique_id = f"{int(os.times().elapsed * 1000000)}-{uuid.uuid4().hex[:8]}" + self._cache_file_path = os.path.join(tempfile.gettempdir(), f"braintrust-span-cache-{unique_id}.jsonl") - # Create the file - with open(self._cache_file_path, "w") as f: - pass + # Create the file + with open(self._cache_file_path, "w") as f: + pass - self._initialized = True - self._register_exit_handler() + self._initialized = True + self._register_exit_handler() + except Exception: + # Silently fail if filesystem is unavailable - cache is best-effort + # This can happen if temp directory is not writable or disk is full + self._explicitly_disabled = True + return def _register_exit_handler(self) -> None: """Register a handler to clean up the temp file on process exit.""" @@ -180,7 +186,8 @@ def cleanup_all_caches(): if cache._cache_file_path and os.path.exists(cache._cache_file_path): try: os.unlink(cache._cache_file_path) - except: + except Exception: + # Ignore cleanup errors - file might not exist or already deleted pass atexit.register(cleanup_all_caches) @@ -213,7 +220,8 @@ def _flush_write_buffer(self) -> None: f.write(json.dumps(record.to_dict()) + "\n") self._write_buffer.clear() except Exception: - # Silently fail - cache is best-effort + # Silently fail if write fails - cache is best-effort + # This can happen if disk is full or file permissions changed pass def get_by_root_span_id(self, root_span_id: str) -> Optional[list[CachedSpan]]: @@ -256,11 +264,12 @@ def get_by_root_span_id(self, root_span_id: str) -> Optional[list[CachedSpan]]: merge_dicts(span_map[record.span_id], record.data.to_dict()) else: span_map[record.span_id] = record.data.to_dict() - except: - # Skip malformed lines + except 
Exception: + # Skip malformed lines - may occur if file was corrupted or truncated pass - except: + except Exception: # Continue to check buffer even if disk read fails + # This can happen if file was deleted or permissions changed pass # Also check the in-memory write buffer for unflushed data @@ -319,8 +328,8 @@ def dispose(self) -> None: if self._cache_file_path and os.path.exists(self._cache_file_path): try: os.unlink(self._cache_file_path) - except: - # Ignore cleanup errors + except Exception: + # Ignore cleanup errors - file might not exist or already deleted pass self._cache_file_path = None From c97bf753f3315ff262c1f19ba7c392e171cdbd53 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Wed, 21 Jan 2026 10:16:31 -0800 Subject: [PATCH 64/65] handle playground logs in py --- py/src/braintrust/framework.py | 24 ++++++++++++++++++++++-- py/src/braintrust/span_identifier_v3.py | 21 +++++++++++++++++++++ py/src/braintrust/trace.py | 6 +++--- 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py index 1b59478fd..bccc10700 100644 --- a/py/src/braintrust/framework.py +++ b/py/src/braintrust/framework.py @@ -1483,9 +1483,29 @@ async def ensure_spans_flushed(): # The Span interface doesn't expose this but SpanImpl has it root_span_id_value = getattr(root_span, "root_span_id", root_span.id) + # Check if there's a parent in the context to determine object_type and object_id + parent_str = trace_state.current_parent.get() + parent_components = None + if parent_str: + from braintrust.span_identifier_v3 import SpanComponentsV3, span_object_type_v3_to_typed_string + + try: + parent_components = SpanComponentsV3.from_str(parent_str) + except Exception: + # If parsing fails, parent_components stays None + pass + + # Determine object_type and object_id based on parent or experiment + if parent_components: + trace_object_type = span_object_type_v3_to_typed_string(parent_components.object_type) + trace_object_id = 
parent_components.object_id or "" + else: + trace_object_type = "experiment" + trace_object_id = experiment_id or "" + trace = LocalTrace( - object_type="experiment", - object_id=experiment_id or "", + object_type=trace_object_type, + object_id=trace_object_id, root_span_id=root_span_id_value, ensure_spans_flushed=ensure_spans_flushed, state=trace_state, diff --git a/py/src/braintrust/span_identifier_v3.py b/py/src/braintrust/span_identifier_v3.py index ea850dcbc..d86903153 100644 --- a/py/src/braintrust/span_identifier_v3.py +++ b/py/src/braintrust/span_identifier_v3.py @@ -38,6 +38,27 @@ def __str__(self): }[self] +def span_object_type_v3_to_typed_string( + object_type: SpanObjectTypeV3, +) -> str: + """Convert SpanObjectTypeV3 enum to typed string literal. + + Args: + object_type: The SpanObjectTypeV3 enum value + + Returns: + One of "experiment", "project_logs", or "playground_logs" + """ + if object_type == SpanObjectTypeV3.EXPERIMENT: + return "experiment" + elif object_type == SpanObjectTypeV3.PROJECT_LOGS: + return "project_logs" + elif object_type == SpanObjectTypeV3.PLAYGROUND_LOGS: + return "playground_logs" + else: + raise ValueError(f"Unknown SpanObjectTypeV3: {object_type}") + + class InternalSpanComponentUUIDFields(Enum): OBJECT_ID = 1 ROW_ID = 2 diff --git a/py/src/braintrust/trace.py b/py/src/braintrust/trace.py index aab078ee7..57344ee16 100644 --- a/py/src/braintrust/trace.py +++ b/py/src/braintrust/trace.py @@ -56,7 +56,7 @@ class SpanFetcher(ObjectFetcher[dict[str, Any]]): def __init__( self, - object_type: str, + object_type: str, # Literal["experiment", "project_logs", "playground_logs"] object_id: str, root_span_id: str, state: BraintrustState, @@ -134,7 +134,7 @@ class CachedSpanFetcher: def __init__( self, - object_type: Optional[str] = None, + object_type: Optional[str] = None, # Literal["experiment", "project_logs", "playground_logs"] object_id: Optional[str] = None, root_span_id: Optional[str] = None, get_state: Optional[Callable[[], 
Awaitable[BraintrustState]]] = None, @@ -283,7 +283,7 @@ class LocalTrace(dict): def __init__( self, - object_type: str, + object_type: str, # Literal["experiment", "project_logs", "playground_logs"] object_id: str, root_span_id: str, ensure_spans_flushed: Optional[Callable[[], Awaitable[None]]], From a815eb5a6b7537c3324742b37f77639dcf95bb66 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Wed, 21 Jan 2026 10:33:13 -0800 Subject: [PATCH 65/65] lint --- py/src/braintrust/framework.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py index bccc10700..43ba78d23 100644 --- a/py/src/braintrust/framework.py +++ b/py/src/braintrust/framework.py @@ -1484,11 +1484,11 @@ async def ensure_spans_flushed(): root_span_id_value = getattr(root_span, "root_span_id", root_span.id) # Check if there's a parent in the context to determine object_type and object_id + from braintrust.span_identifier_v3 import SpanComponentsV3, span_object_type_v3_to_typed_string + parent_str = trace_state.current_parent.get() parent_components = None if parent_str: - from braintrust.span_identifier_v3 import SpanComponentsV3, span_object_type_v3_to_typed_string - try: parent_components = SpanComponentsV3.from_str(parent_str) except Exception: