From 04cb01772daead71f81122c2a4046e15ead6c4ce Mon Sep 17 00:00:00 2001 From: Olmo Maldonado Date: Thu, 29 Jan 2026 10:59:40 -0800 Subject: [PATCH 1/3] add set filtering function a general purpose to help customers decide if and when a span will go out. doesn't make any opinions about root/child, etc. --- js/src/logger.test.ts | 157 ++++++++++++++++++++++ js/src/logger.ts | 130 +++++++++++++++++- py/src/braintrust/logger.py | 111 +++++++++++++++- py/src/braintrust/test_logger.py | 218 +++++++++++++++++++++++++++++++ 4 files changed, 611 insertions(+), 5 deletions(-) diff --git a/js/src/logger.test.ts b/js/src/logger.test.ts index 7123985ed..009d77f8c 100644 --- a/js/src/logger.test.ts +++ b/js/src/logger.test.ts @@ -17,6 +17,7 @@ import { Attachment, deepCopyEvent, renderMessage, + setFilteringFunction, } from "./logger"; import { parseTemplateFormat, @@ -1197,3 +1198,159 @@ describe("sensitive data redaction", () => { expect(copy.input).toBe(""); }); }); + +describe("filtering functionality", () => { + let memoryLogger: any; + + beforeEach(() => { + _exportsForTestingOnly.simulateLoginForTests(); + memoryLogger = _exportsForTestingOnly.useTestBackgroundLogger(); + }); + + afterEach(() => { + setFilteringFunction(null); + _exportsForTestingOnly.clearTestBackgroundLogger(); + }); + + test("filter out events with langsmith:hidden tag", async () => { + const filteredSpanIds = new Set(); + + const filteringFunction = (event: any): any | null => { + const spanId = event.span_id; + if (spanId && filteredSpanIds.has(spanId)) { + return null; + } + + const rootTags = event.tags || []; + const metadataTags = event.metadata?.tags || []; + const allTags = [...rootTags, ...metadataTags]; + + if (allTags.includes("langsmith:hidden")) { + if (spanId) { + filteredSpanIds.add(spanId); + } + return null; + } + + return event; + }; + + setFilteringFunction(filteringFunction); + + const logger = initLogger({ + projectName: "test", + projectId: "test-project-id", + }); + + logger.log({ 
+ input: "Hidden input", + output: "Hidden output", + tags: ["langsmith:hidden"], + }); + + logger.log({ + input: "Normal input", + output: "Normal output", + tags: ["normal"], + }); + + await memoryLogger.flush(); + const events = await memoryLogger.drain(); + + expect(events).toHaveLength(1); + expect(events[0].input).toBe("Normal input"); + expect(events[0].output).toBe("Normal output"); + }); + + test("filter by score threshold", async () => { + const filteredSpanIds = new Set(); + + const filteringFunction = (event: any): any | null => { + const spanId = event.span_id; + if (spanId && filteredSpanIds.has(spanId)) { + return null; + } + + if (event.scores?.accuracy !== undefined && event.scores.accuracy < 0.5) { + if (spanId) { + filteredSpanIds.add(spanId); + } + return null; + } + return event; + }; + + setFilteringFunction(filteringFunction); + + const experiment = _exportsForTestingOnly.initTestExperiment({ + projectName: "test", + experimentName: "test-exp", + }); + + experiment.log({ + input: "bad input", + output: "bad output", + scores: { accuracy: 0.3 }, + }); + + experiment.log({ + input: "good input", + output: "good output", + scores: { accuracy: 0.8 }, + }); + + await memoryLogger.flush(); + const events = await memoryLogger.drain(); + + expect(events).toHaveLength(1); + expect(events[0].input).toBe("good input"); + expect(events[0].scores.accuracy).toBe(0.8); + }); + + test("filter nested spans by name", async () => { + const filteredSpanIds = new Set(); + + const filteringFunction = (event: any): any | null => { + const spanId = event.span_id; + if (spanId && filteredSpanIds.has(spanId)) { + return null; + } + + if (event.span_attributes?.name === "internal_helper") { + if (spanId) { + filteredSpanIds.add(spanId); + } + return null; + } + + return event; + }; + + setFilteringFunction(filteringFunction); + + const logger = initLogger({ + projectName: "test", + projectId: "test-project-id", + }); + const parent = logger.startSpan({ name: 
"parent_span" }); + parent.log({ input: "parent input", output: "parent output" }); + + const internalChild = parent.startSpan({ name: "internal_helper" }); + internalChild.log({ input: "hidden input", output: "hidden output" }); + internalChild.end(); + + const publicChild = parent.startSpan({ name: "public_helper" }); + publicChild.log({ input: "public input", output: "public output" }); + publicChild.end(); + + parent.end(); + + await memoryLogger.flush(); + const events = await memoryLogger.drain(); + + const inputEvents = events.filter((e: any) => e.input !== undefined); + expect(inputEvents).toHaveLength(2); + expect(inputEvents[0].input).toBe("parent input"); + expect(inputEvents[1].input).toBe("public input"); + }); +}); diff --git a/js/src/logger.ts b/js/src/logger.ts index b325f2c61..1c9a0a9c0 100644 --- a/js/src/logger.ts +++ b/js/src/logger.ts @@ -760,6 +760,14 @@ export class BraintrustState { this.bgLogger().setMaskingFunction(maskingFunction); } + public setFilteringFunction( + filteringFunction: + | ((event: BackgroundLogEvent) => BackgroundLogEvent | null) + | null, + ): void { + this.bgLogger().setFilteringFunction(filteringFunction); + } + public async login(loginParams: LoginOptions & { forceLogin?: boolean }) { if (this.apiUrl && !loginParams.forceLogin) { return; @@ -2351,11 +2359,19 @@ interface BackgroundLogger { setMaskingFunction( maskingFunction: ((value: unknown) => unknown) | null, ): void; + setFilteringFunction( + filteringFunction: + | ((event: BackgroundLogEvent) => BackgroundLogEvent | null) + | null, + ): void; } export class TestBackgroundLogger implements BackgroundLogger { private items: LazyValue[][] = []; private maskingFunction: ((value: unknown) => unknown) | null = null; + private filteringFunction: + | ((event: BackgroundLogEvent) => BackgroundLogEvent | null) + | null = null; log(items: LazyValue[]): void { this.items.push(items); @@ -2367,6 +2383,14 @@ export class TestBackgroundLogger implements BackgroundLogger { 
this.maskingFunction = maskingFunction; } + setFilteringFunction( + filteringFunction: + | ((event: BackgroundLogEvent) => BackgroundLogEvent | null) + | null, + ): void { + this.filteringFunction = filteringFunction; + } + async flush(): Promise { return Promise.resolve(); } @@ -2375,11 +2399,29 @@ export class TestBackgroundLogger implements BackgroundLogger { const items = this.items; this.items = []; - // get all the values + // get all the values and apply filtering const events: BackgroundLogEvent[] = []; for (const item of items) { for (const event of item) { - events.push(await event.get()); + const eventData = await event.get(); + + // Apply filtering function + if (this.filteringFunction) { + try { + const filtered = this.filteringFunction(eventData); + if (filtered === null) { + // Event was filtered out, skip it + continue; + } + events.push(filtered); + } catch (e) { + // If filtering fails, log the original event + events.push(eventData); + } + } else if (eventData !== null) { + // Skip null events (filtered out by HTTPBackgroundLogger) + events.push(eventData); + } } } @@ -2443,6 +2485,9 @@ class HTTPBackgroundLogger implements BackgroundLogger { private activeFlushError: unknown = undefined; private onFlushError?: (error: unknown) => void; private maskingFunction: ((value: unknown) => unknown) | null = null; + private filteringFunction: + | ((event: BackgroundLogEvent) => BackgroundLogEvent | null) + | null = null; public syncFlush: boolean = false; // 6 MB for the AWS lambda gateway (from our own testing). 
@@ -2543,12 +2588,41 @@ class HTTPBackgroundLogger implements BackgroundLogger { this.maskingFunction = maskingFunction; } + setFilteringFunction( + filteringFunction: + | ((event: BackgroundLogEvent) => BackgroundLogEvent | null) + | null, + ): void { + this.filteringFunction = filteringFunction; + } + log(items: LazyValue[]) { if (this._disabled) { return; } - const droppedItems = this.queue.push(...items); + // Wrap items with filtering if a filtering function is set + let filteredItems = items; + if (this.filteringFunction) { + filteredItems = items.map((item) => { + return new LazyValue(async () => { + try { + const eventData = await item.get(); + const filtered = this.filteringFunction!(eventData); + if (filtered === null) { + // Event was filtered out - return a marker that will be removed later + return null as any; + } + return filtered; + } catch (e) { + // If filtering fails, return the original event + return await item.get(); + } + }); + }); + } + + const droppedItems = this.queue.push(...filteredItems); if (!this.syncFlush) { this.triggerActiveFlush(); @@ -2682,7 +2756,10 @@ class HTTPBackgroundLogger implements BackgroundLogger { ): Promise<[BackgroundLogEvent[][], Attachment[]]> { for (let i = 0; i < this.numTries; ++i) { try { - const items = await Promise.all(wrappedItems.map((x) => x.get())); + const allItems = await Promise.all(wrappedItems.map((x) => x.get())); + + // Filter out null events (filtered out by filtering function) + const items = allItems.filter((item) => item !== null); // TODO(kevin): `extractAttachments` should ideally come after // `mergeRowBatch`, since merge-overwriting could result in some @@ -3840,6 +3917,51 @@ export function setMaskingFunction( _globalState.setMaskingFunction(maskingFunction); } +/** + * Set a global filtering function to control which events are logged to Braintrust. + * + * The function receives a log event object and should return null to skip logging that event. 
+ * You can also return a modified event, but for redacting sensitive data, prefer using + * setMaskingFunction() instead. + * + * This is useful for: + * - Filtering out events by tags (e.g., LangGraph's 'langsmith:hidden' tag) + * - Skipping zero-duration spans or other overhead + * - Filtering events by span name, score thresholds, or other criteria + * + * Example: + * ```typescript + * function myFilter(event) { + * // Skip events with 'langsmith:hidden' tag + * const rootTags = event.tags || []; + * const metadataTags = event.metadata?.tags || []; + * if ([...rootTags, ...metadataTags].includes('langsmith:hidden')) { + * return null; + * } + * + * // Skip zero-duration spans (events without timing metrics are kept) + * const metrics = event.metrics || {}; + * if (metrics.start !== undefined && metrics.start === metrics.end) { + * return null; + * } + * + * return event; + * } + * + * setFilteringFunction(myFilter); + * ``` + * + * @param filteringFunction A function that takes a log event and returns null to skip + * logging, or the event to log it. Set to null to disable filtering. + */ +export function setFilteringFunction( + filteringFunction: + | ((event: BackgroundLogEvent) => BackgroundLogEvent | null) + | null, +): void { + _globalState.setFilteringFunction(filteringFunction); +} + /** * Log into Braintrust. This will prompt you for your API token, which you can find at * https://www.braintrust.dev/app/token. This method is called automatically by `init()`. diff --git a/py/src/braintrust/logger.py b/py/src/braintrust/logger.py index 1c9795a50..255762d4b 100644 --- a/py/src/braintrust/logger.py +++ b/py/src/braintrust/logger.py @@ -392,6 +392,12 @@ def default_get_api_conn(): # different threads unintentionally use the same override. self._override_bg_logger = threading.local() + # Function for filtering/redacting log events before they are sent. 
+ # The function receives a log event dict and can: + # - Return the event (potentially modified) to log it + # - Return None to skip logging this event + self._filtering_function: Callable[[dict[str, Any]], dict[str, Any] | None] | None = None + self.reset_login_info() self._prompt_cache = PromptCache( @@ -589,6 +595,12 @@ def set_masking_function(self, masking_function: Callable[[Any], Any] | None) -> """Set the masking function on the background logger.""" self.global_bg_logger().set_masking_function(masking_function) + def set_filtering_function( + self, filtering_function: Callable[[dict[str, Any]], dict[str, Any] | None] | None + ) -> None: + """Set the filtering function on the background logger.""" + self.global_bg_logger().set_filtering_function(filtering_function) + _state: BraintrustState = None # type: ignore @@ -823,6 +835,7 @@ def __init__(self): self.lock = threading.Lock() self.logs = [] self.masking_function: Callable[[Any], Any] | None = None + self.filtering_function: Callable[[dict[str, Any]], dict[str, Any] | None] | None = None self.upload_attempts: list[BaseAttachment] = [] # Track upload attempts def enforce_queue_size_limit(self, enforce: bool) -> None: @@ -830,12 +843,28 @@ def enforce_queue_size_limit(self, enforce: bool) -> None: def log(self, *args: LazyValue[dict[str, Any]]) -> None: with self.lock: - self.logs.extend(args) + filtered_args = [] + for arg in args: + # Apply filtering function before adding to logs + if self.filtering_function: + filtered = _apply_filtering_function(arg, self) + if filtered is None: + # Event was filtered out, skip it + continue + arg = filtered + filtered_args.append(arg) + self.logs.extend(filtered_args) def set_masking_function(self, masking_function: Callable[[Any], Any] | None) -> None: """Set the masking function for the memory logger.""" self.masking_function = masking_function + def set_filtering_function( + self, filtering_function: Callable[[dict[str, Any]], dict[str, Any] | None] | None + ) -> 
None: + """Set the filtering function for the memory logger.""" + self.filtering_function = filtering_function + def flush(self, batch_size: int | None = None): """Flush the memory logger, extracting attachments and tracking upload attempts.""" with self.lock: @@ -905,6 +934,7 @@ class _HTTPBackgroundLogger: def __init__(self, api_conn: LazyValue[HTTPConnection]): self.api_conn = api_conn self.masking_function: Callable[[Any], Any] | None = None + self.filtering_function: Callable[[dict[str, Any]], dict[str, Any] | None] | None = None self.outfile = sys.stderr self.flush_lock = threading.RLock() @@ -971,6 +1001,14 @@ def log(self, *args: LazyValue[dict[str, Any]]) -> None: self._start() dropped_items = [] for event in args: + # Apply filtering function before adding to queue + if self.filtering_function: + filtered = _apply_filtering_function(event, self) + if filtered is None: + # Event was filtered out, skip it + continue + event = filtered + dropped = self.queue.put(event) dropped_items.extend(dropped) @@ -1909,6 +1947,39 @@ def login_to_state( return state +def _apply_filtering_function( + event: LazyValue[dict[str, Any]] | dict[str, Any], logger: _HTTPBackgroundLogger | _MemoryBackgroundLogger +) -> LazyValue[dict[str, Any]] | dict[str, Any] | None: + """ + Helper function to apply the filtering function to an event (lazy or eager). + Returns None if the event should be filtered out, otherwise returns the event. 
+ """ + if not logger.filtering_function: + return event + + try: + # Get the actual event data + if isinstance(event, LazyValue): + event_data = event.get() + else: + event_data = event + + # Apply the filtering function + filtered = logger.filtering_function(event_data) + + if filtered is None: + return None + + # Return in the same format as input + if isinstance(event, LazyValue): + return LazyValue(lambda: filtered, use_mutex=False) + else: + return filtered + except Exception: + # If filtering fails, log the original event + return event + + def set_masking_function(masking_function: Callable[[Any], Any] | None) -> None: """ Set a global masking function that will be applied to all logged data before sending to Braintrust. @@ -1920,6 +1991,44 @@ def set_masking_function(masking_function: Callable[[Any], Any] | None) -> None: _state.set_masking_function(masking_function) +def set_filtering_function( + filtering_function: Callable[[dict[str, Any]], dict[str, Any] | None] | None +) -> None: + """ + Set a global filtering function to control which events are logged to Braintrust. + + The function receives a log event dict and should return None to skip logging that event. + You can also return a modified event, but for redacting sensitive data, prefer using + set_masking_function() instead. 
+ + This is useful for: + - Filtering out events by tags (e.g., LangGraph's 'langsmith:hidden' tag) + - Skipping zero-duration spans or other overhead + - Filtering events by span name, score thresholds, or other criteria + + Example: + def my_filter(event): + # Skip events with 'langsmith:hidden' tag + root_tags = event.get('tags', []) or [] + metadata_tags = (event.get('metadata') or {}).get('tags', []) or [] + if 'langsmith:hidden' in root_tags + metadata_tags: + return None + + # Skip zero-duration spans (events without timing metrics are kept) + metrics = event.get('metrics') or {} + if metrics.get('start') is not None and metrics.get('start') == metrics.get('end'): + return None + + return event + + braintrust.set_filtering_function(my_filter) + + :param filtering_function: A callable that takes a log event dict and returns None to skip + logging, or the event to log it. Set to None to disable filtering. + """ + _state.set_filtering_function(filtering_function) + + def log(**event: Any) -> str: """ Log a single event to the current experiment. The event will be batched and uploaded behind the scenes. diff --git a/py/src/braintrust/test_logger.py b/py/src/braintrust/test_logger.py index a4ef97ab8..42f1e09d0 100644 --- a/py/src/braintrust/test_logger.py +++ b/py/src/braintrust/test_logger.py @@ -2187,6 +2187,224 @@ def broken_masking_function(data): braintrust.set_masking_function(None) +def test_filtering_function_logger(with_memory_logger, with_simulate_login): + """Test that filtering function can filter and redact events in Logger.""" + + # Track span IDs that should be filtered + filtered_span_ids = set() + + def filtering_function(event): + """ + Filter out events with 'langsmith:hidden' tag. + Redact sensitive metadata fields. + """ + # Skip events from filtered spans + span_id = event.get("span_id") + if span_id and span_id in filtered_span_ids: + return None + + # Skip events with langsmith:hidden tag + # Tags can be in two places depending on where in the processing we are: + # 1. 
At root level before processing (event["tags"]) + # 2. In metadata after processing (event["metadata"]["tags"]) + root_tags = event.get("tags", []) or [] + metadata_tags = event.get("metadata", {}).get("tags", []) or [] + all_tags = list(root_tags) + list(metadata_tags) + + if "langsmith:hidden" in all_tags: + # Track this span ID so we filter all its events + if span_id: + filtered_span_ids.add(span_id) + return None + + # Redact api_key from metadata + if "metadata" in event and "api_key" in event.get("metadata", {}): + event = event.copy() + event["metadata"] = event["metadata"].copy() + event["metadata"]["api_key"] = "***REDACTED***" + + return event + + # Set filtering function globally + braintrust.set_filtering_function(filtering_function) + + # Create test logger + test_logger = init_test_logger("test_project") + + # Log event that should be filtered out + test_logger.log( + input="Hidden input", + output="Hidden output", + tags=["langsmith:hidden"], + ) + + # Log event that should be redacted + test_logger.log( + input="Normal input", + output="Normal output", + metadata={"api_key": "secret123", "user": "testuser"}, + ) + + # Log normal event + test_logger.log( + input="Another input", + output="Another output", + tags=["normal"], + ) + + # Check the logged data + logs = with_memory_logger.pop() + + # Should only have 2 logs (the first one was filtered out) + assert len(logs) == 2 + + # First log should have redacted api_key + log1 = logs[0] + assert log1["input"] == "Normal input" + assert log1["output"] == "Normal output" + assert log1["metadata"]["api_key"] == "***REDACTED***" + assert log1["metadata"]["user"] == "testuser" + + # Second log should be unchanged + log2 = logs[1] + assert log2["input"] == "Another input" + assert log2["output"] == "Another output" + assert log2["tags"] == ["normal"] + + # Clean up + braintrust.set_filtering_function(None) + + +def test_filtering_function_experiment(with_memory_logger, with_simulate_login): + """Test that 
filtering function works with Experiment spans.""" + + # Track filtered span IDs so we can filter all events from that span + filtered_span_ids = set() + + def filtering_function(event): + """Filter out spans with score below threshold.""" + span_id = event.get("span_id") + + # Skip events from already-filtered spans + if span_id and span_id in filtered_span_ids: + return None + + # Skip spans with low scores + scores = event.get("scores", {}) + if "accuracy" in scores and scores["accuracy"] < 0.5: + if span_id: + filtered_span_ids.add(span_id) + return None + return event + + # Set filtering function globally + braintrust.set_filtering_function(filtering_function) + + # Create test experiment + experiment = init_test_exp("test_project", "test_experiment") + + # Log event with low score (should be filtered out) + experiment.log( + input="bad input", + output="bad output", + scores={"accuracy": 0.3}, + ) + + # Log event with good score + experiment.log( + input="good input", + output="good output", + scores={"accuracy": 0.8}, + ) + + # Log event without accuracy score (but with a different score) + experiment.log( + input="neutral input", + output="neutral output", + scores={"quality": 0.9}, + ) + + experiment.flush() + + # Check the logged data + logs = with_memory_logger.pop() + + # Should only have 2 logs (the first one with low score was filtered out) + assert len(logs) == 2 + + log1 = logs[0] + assert log1["input"] == "good input" + assert log1["scores"]["accuracy"] == 0.8 + + log2 = logs[1] + assert log2["input"] == "neutral input" + assert log2["scores"]["quality"] == 0.9 + + # Clean up + braintrust.set_filtering_function(None) + + +def test_filtering_function_with_child_spans(with_memory_logger, with_simulate_login): + """Test that filtering function works with nested spans.""" + + # Track filtered span IDs + filtered_span_ids = set() + + def filtering_function(event): + """Filter out spans with specific names.""" + span_id = event.get("span_id") + + # Skip 
events from already-filtered spans + if span_id and span_id in filtered_span_ids: + return None + + # Filter out spans named "internal_helper" + span_attrs = event.get("span_attributes", {}) + if span_attrs.get("name") == "internal_helper": + if span_id: + filtered_span_ids.add(span_id) + return None + + return event + + # Set filtering function globally + braintrust.set_filtering_function(filtering_function) + + # Create test logger + test_logger = init_test_logger("test_project") + + # Create a parent span + with test_logger.start_span(name="parent_span") as parent: + # Log to parent (should be kept) + parent.log(input="parent input", output="result1") + + # Create a child span that should be filtered out + with parent.start_span(name="internal_helper") as child: + child.log(input="hidden input", output="hidden output") + + # Create another child span that should be kept + with parent.start_span(name="public_helper") as child2: + child2.log(input="public input", output="result2") + + # Check the logged data + logs = with_memory_logger.pop() + + # Should have events from parent_span and public_helper, but not internal_helper + # span.log() on existing spans creates merge events, so we get 2 events + assert len(logs) == 2 + + # First should be from parent_span + assert logs[0]["input"] == "parent input" + assert logs[0]["output"] == "result1" + + # Second should be from public_helper + assert logs[1]["input"] == "public input" + assert logs[1]["output"] == "result2" + + # Clean up + braintrust.set_filtering_function(None) + + def test_attachment_unreadable_path_logs_warning(caplog): with caplog.at_level(logging.WARNING, logger="braintrust"): Attachment( From 5d8f4311e21af6f50b128df8e65905711df9f3cb Mon Sep 17 00:00:00 2001 From: Olmo Maldonado Date: Fri, 30 Jan 2026 15:44:20 -0800 Subject: [PATCH 2/3] add root -> parent -> child + sibling (filtered)-> descendant test --- js/src/logger.test.ts | 187 ++++++++++++-------- py/src/braintrust/test_logger.py | 294 
++++++++++++------------------- 2 files changed, 220 insertions(+), 261 deletions(-) diff --git a/js/src/logger.test.ts b/js/src/logger.test.ts index 009d77f8c..c24cbbbeb 100644 --- a/js/src/logger.test.ts +++ b/js/src/logger.test.ts @@ -1212,20 +1212,44 @@ describe("filtering functionality", () => { _exportsForTestingOnly.clearTestBackgroundLogger(); }); - test("filter out events with langsmith:hidden tag", async () => { + /** + * Test that filtering a middle span creates an orphan, NOT a reparented child. + * + * Given this tree: + * root + * parent + * child + * sibling <-- FILTERED + * descendant + * + * The user might EXPECT descendant to be reparented: + * root + * parent + * child + * descendant <-- moved up to parent + * + * But ACTUAL behavior is descendant becomes orphan under root: + * root + * parent + * child + * descendant <-- orphan (span_parents still points to filtered "sibling") + * + * This is because: + * 1. descendant's span_parents still contains sibling's span_id + * 2. sibling doesn't exist in the logged data + * 3. UI treats spans with missing parents as orphans under root + */ + test("filtering middle span creates orphan under root", async () => { const filteredSpanIds = new Set(); - const filteringFunction = (event: any): any | null => { + const filterSibling = (event: any): any | null => { const spanId = event.span_id; if (spanId && filteredSpanIds.has(spanId)) { return null; } - const rootTags = event.tags || []; - const metadataTags = event.metadata?.tags || []; - const allTags = [...rootTags, ...metadataTags]; - - if (allTags.includes("langsmith:hidden")) { + const spanName = event.span_attributes?.name ?? 
""; + if (spanName === "sibling") { if (spanId) { filteredSpanIds.add(spanId); } @@ -1235,88 +1259,87 @@ describe("filtering functionality", () => { return event; }; - setFilteringFunction(filteringFunction); + setFilteringFunction(filterSibling); const logger = initLogger({ projectName: "test", projectId: "test-project-id", }); - logger.log({ - input: "Hidden input", - output: "Hidden output", - tags: ["langsmith:hidden"], - }); + const root = logger.startSpan({ name: "root" }); + const rootSpanId = root.spanId; - logger.log({ - input: "Normal input", - output: "Normal output", - tags: ["normal"], - }); + const parent = root.startSpan({ name: "parent" }); + const parentSpanId = parent.spanId; - await memoryLogger.flush(); - const events = await memoryLogger.drain(); + const child = parent.startSpan({ name: "child" }); + child.log({ input: "child data" }); + child.end(); - expect(events).toHaveLength(1); - expect(events[0].input).toBe("Normal input"); - expect(events[0].output).toBe("Normal output"); - }); - - test("filter by score threshold", async () => { - const filteredSpanIds = new Set(); + const sibling = parent.startSpan({ name: "sibling" }); + const siblingSpanId = sibling.spanId; // This span will be filtered - const filteringFunction = (event: any): any | null => { - const spanId = event.span_id; - if (spanId && filteredSpanIds.has(spanId)) { - return null; - } + const descendant = sibling.startSpan({ name: "descendant" }); + descendant.log({ input: "descendant data" }); + const descendantSpanId = descendant.spanId; + descendant.end(); - if (event.scores?.accuracy !== undefined && event.scores.accuracy < 0.5) { - if (spanId) { - filteredSpanIds.add(spanId); - } - return null; - } - return event; - }; + sibling.end(); + parent.end(); + root.end(); - setFilteringFunction(filteringFunction); + await memoryLogger.flush(); + const events = await memoryLogger.drain(); - const experiment = _exportsForTestingOnly.initTestExperiment({ - projectName: "test", - 
experimentName: "test-exp", - }); + const loggedNames = events.map((e: any) => e.span_attributes?.name); + expect(loggedNames).toContain("root"); + expect(loggedNames).toContain("parent"); + expect(loggedNames).toContain("child"); + expect(loggedNames).not.toContain("sibling"); // Filtered out + expect(loggedNames).toContain("descendant"); // Still logged! - experiment.log({ - input: "bad input", - output: "bad output", - scores: { accuracy: 0.3 }, - }); + const descendantLog = events.find( + (e: any) => e.span_attributes?.name === "descendant", + ); - experiment.log({ - input: "good input", - output: "good output", - scores: { accuracy: 0.8 }, - }); + // Key assertion: descendant's span_parents still points to the FILTERED sibling + // This means the UI will see it as an orphan (parent doesn't exist) + expect(descendantLog.span_parents).toContain(siblingSpanId); - await memoryLogger.flush(); - const events = await memoryLogger.drain(); + // The descendant does NOT have parent_span_id pointing to "parent" + // It still thinks its parent is "sibling" (which was filtered) + expect(descendantLog.span_parents).not.toContain(parentSpanId); - expect(events).toHaveLength(1); - expect(events[0].input).toBe("good input"); - expect(events[0].scores.accuracy).toBe(0.8); + // All spans share the same root_span_id + expect(descendantLog.root_span_id).toBe(rootSpanId); }); - test("filter nested spans by name", async () => { + /** + * Test that users can implement cascading filter to avoid orphans. + * + * To filter "sibling" AND its descendants (avoiding orphans), users must + * track filtered span_ids and check span_parents in their filter function. 
+ */ + test("filtering with cascade to fix orphans", async () => { const filteredSpanIds = new Set(); - const filteringFunction = (event: any): any | null => { + const filterSiblingWithCascade = (event: any): any | null => { const spanId = event.span_id; - if (spanId && filteredSpanIds.has(spanId)) { - return null; + const spanParents = event.span_parents || []; + + // Check if any parent was filtered (cascade) + for (const parentId of spanParents) { + if (filteredSpanIds.has(parentId)) { + if (spanId) { + filteredSpanIds.add(spanId); + } + return null; + } } - if (event.span_attributes?.name === "internal_helper") { + // Check if this span should be filtered + const spanName = event.span_attributes?.name ?? ""; + if (spanName === "sibling") { if (spanId) { filteredSpanIds.add(spanId); } @@ -1326,31 +1349,39 @@ describe("filtering functionality", () => { return event; }; - setFilteringFunction(filteringFunction); + setFilteringFunction(filterSiblingWithCascade); const logger = initLogger({ projectName: "test", projectId: "test-project-id", }); - const parent = logger.startSpan({ name: "parent_span" }); - parent.log({ input: "parent input", output: "parent output" }); - const internalChild = parent.startSpan({ name: "internal_helper" }); - internalChild.log({ input: "hidden input", output: "hidden output" }); - internalChild.end(); + const root = logger.startSpan({ name: "root" }); + + const parent = root.startSpan({ name: "parent" }); + + const child = parent.startSpan({ name: "child" }); + child.log({ input: "child data" }); + child.end(); + + const sibling = parent.startSpan({ name: "sibling" }); - const publicChild = parent.startSpan({ name: "public_helper" }); - publicChild.log({ input: "public input", output: "public output" }); - publicChild.end(); + const descendant = sibling.startSpan({ name: "descendant" }); + descendant.log({ input: "descendant data" }); + descendant.end(); + sibling.end(); parent.end(); + root.end(); await memoryLogger.flush(); const 
events = await memoryLogger.drain(); - const inputEvents = events.filter((e: any) => e.input !== undefined); - expect(inputEvents).toHaveLength(2); - expect(inputEvents[0].input).toBe("parent input"); - expect(inputEvents[1].input).toBe("public input"); + const loggedNames = events.map((e: any) => e.span_attributes?.name); + expect(loggedNames).toContain("root"); + expect(loggedNames).toContain("parent"); + expect(loggedNames).toContain("child"); + expect(loggedNames).not.toContain("sibling"); // Filtered + expect(loggedNames).not.toContain("descendant"); // Also filtered (cascaded) }); }); diff --git a/py/src/braintrust/test_logger.py b/py/src/braintrust/test_logger.py index 42f1e09d0..924181868 100644 --- a/py/src/braintrust/test_logger.py +++ b/py/src/braintrust/test_logger.py @@ -2187,221 +2187,158 @@ def broken_masking_function(data): braintrust.set_masking_function(None) -def test_filtering_function_logger(with_memory_logger, with_simulate_login): - """Test that filtering function can filter and redact events in Logger.""" - - # Track span IDs that should be filtered +def test_filtering_middle_span_creates_orphan_under_root(with_memory_logger, with_simulate_login): + """ + Test that filtering a middle span creates an orphan, NOT a reparented child. + + Given this tree: + root + parent + child + sibling <-- FILTERED + descendant + + The user might EXPECT descendant to be reparented: + root + parent + child + descendant <-- moved up to parent + + But ACTUAL behavior is descendant becomes orphan under root: + root + parent + child + descendant <-- orphan (span_parents still points to filtered "sibling") + + This is because: + 1. descendant's span_parents still contains sibling's span_id + 2. sibling doesn't exist in the logged data + 3. UI treats spans with missing parents as orphans under root + """ filtered_span_ids = set() - def filtering_function(event): - """ - Filter out events with 'langsmith:hidden' tag. - Redact sensitive metadata fields. 
- """ - # Skip events from filtered spans + def filter_sibling(event): span_id = event.get("span_id") if span_id and span_id in filtered_span_ids: return None - # Skip events with langsmith:hidden tag - # Tags can be in two places depending on where in the processing we are: - # 1. At root level before processing (event["tags"]) - # 2. In metadata after processing (event["metadata"]["tags"]) - root_tags = event.get("tags", []) or [] - metadata_tags = event.get("metadata", {}).get("tags", []) or [] - all_tags = list(root_tags) + list(metadata_tags) - - if "langsmith:hidden" in all_tags: - # Track this span ID so we filter all its events + span_name = event.get("span_attributes", {}).get("name", "") + if span_name == "sibling": if span_id: filtered_span_ids.add(span_id) return None - # Redact api_key from metadata - if "metadata" in event and "api_key" in event.get("metadata", {}): - event = event.copy() - event["metadata"] = event["metadata"].copy() - event["metadata"]["api_key"] = "***REDACTED***" - return event - # Set filtering function globally - braintrust.set_filtering_function(filtering_function) + braintrust.set_filtering_function(filter_sibling) - # Create test logger test_logger = init_test_logger("test_project") - # Log event that should be filtered out - test_logger.log( - input="Hidden input", - output="Hidden output", - tags=["langsmith:hidden"], - ) - - # Log event that should be redacted - test_logger.log( - input="Normal input", - output="Normal output", - metadata={"api_key": "secret123", "user": "testuser"}, - ) - - # Log normal event - test_logger.log( - input="Another input", - output="Another output", - tags=["normal"], - ) - - # Check the logged data - logs = with_memory_logger.pop() - - # Should only have 2 logs (the first one was filtered out) - assert len(logs) == 2 - - # First log should have redacted api_key - log1 = logs[0] - assert log1["input"] == "Normal input" - assert log1["output"] == "Normal output" - assert 
log1["metadata"]["api_key"] == "***REDACTED***" - assert log1["metadata"]["user"] == "testuser" - - # Second log should be unchanged - log2 = logs[1] - assert log2["input"] == "Another input" - assert log2["output"] == "Another output" - assert log2["tags"] == ["normal"] - - # Clean up - braintrust.set_filtering_function(None) - - -def test_filtering_function_experiment(with_memory_logger, with_simulate_login): - """Test that filtering function works with Experiment spans.""" - - # Track filtered span IDs so we can filter all events from that span - filtered_span_ids = set() - - def filtering_function(event): - """Filter out spans with score below threshold.""" - span_id = event.get("span_id") - - # Skip events from already-filtered spans - if span_id and span_id in filtered_span_ids: - return None - - # Skip spans with low scores - scores = event.get("scores", {}) - if "accuracy" in scores and scores["accuracy"] < 0.5: - if span_id: - filtered_span_ids.add(span_id) - return None - return event + with test_logger.start_span(name="root") as root: + root_span_id = root.span_id - # Set filtering function globally - braintrust.set_filtering_function(filtering_function) + with root.start_span(name="parent") as parent: + parent_span_id = parent.span_id - # Create test experiment - experiment = init_test_exp("test_project", "test_experiment") + with parent.start_span(name="child") as child: + child.log(input="child data") - # Log event with low score (should be filtered out) - experiment.log( - input="bad input", - output="bad output", - scores={"accuracy": 0.3}, - ) + with parent.start_span(name="sibling") as sibling: + sibling_span_id = sibling.span_id # This span will be filtered - # Log event with good score - experiment.log( - input="good input", - output="good output", - scores={"accuracy": 0.8}, - ) + with sibling.start_span(name="descendant") as descendant: + descendant.log(input="descendant data") + descendant_span_id = descendant.span_id - # Log event without 
accuracy score (but with a different score) - experiment.log( - input="neutral input", - output="neutral output", - scores={"quality": 0.9}, - ) + logs = with_memory_logger.pop() - experiment.flush() + logged_names = [l.get("span_attributes", {}).get("name") for l in logs] + assert "root" in logged_names + assert "parent" in logged_names + assert "child" in logged_names + assert "sibling" not in logged_names # Filtered out + assert "descendant" in logged_names # Still logged! - # Check the logged data - logs = with_memory_logger.pop() + descendant_log = next(l for l in logs if l.get("span_attributes", {}).get("name") == "descendant") - # Should only have 2 logs (the first one with low score was filtered out) - assert len(logs) == 2 + # Key assertion: descendant's span_parents still points to the FILTERED sibling + # This means the UI will see it as an orphan (parent doesn't exist) + assert sibling_span_id in descendant_log["span_parents"] - log1 = logs[0] - assert log1["input"] == "good input" - assert log1["scores"]["accuracy"] == 0.8 + # The descendant does NOT have parent_span_id pointing to "parent" + # It still thinks its parent is "sibling" (which was filtered) + assert parent_span_id not in descendant_log["span_parents"] - log2 = logs[1] - assert log2["input"] == "neutral input" - assert log2["scores"]["quality"] == 0.9 + # All spans share the same root_span_id + assert descendant_log["root_span_id"] == root_span_id - # Clean up braintrust.set_filtering_function(None) -def test_filtering_function_with_child_spans(with_memory_logger, with_simulate_login): - """Test that filtering function works with nested spans.""" - - # Track filtered span IDs +def test_filtering_with_cascade_to_fix_orphans(with_memory_logger, with_simulate_login): + """ + Test that users can implement cascading filter to avoid orphans. + + To filter "sibling" AND its descendants (avoiding orphans), users must + track filtered span_ids and check span_parents in their filter function. 
+ + Given: + root + parent + child + sibling <-- FILTERED (and cascade to descendants) + descendant + + With cascading filter, result is: + root + parent + child + (descendant is also filtered, no orphans) + """ filtered_span_ids = set() - def filtering_function(event): - """Filter out spans with specific names.""" + def filter_sibling_with_cascade(event): span_id = event.get("span_id") - - # Skip events from already-filtered spans - if span_id and span_id in filtered_span_ids: - return None - - # Filter out spans named "internal_helper" - span_attrs = event.get("span_attributes", {}) - if span_attrs.get("name") == "internal_helper": + span_parents = event.get("span_parents", []) or [] + + # Check if any parent was filtered (cascade) + for parent_id in span_parents: + if parent_id in filtered_span_ids: + if span_id: + filtered_span_ids.add(span_id) + return None + + # Check if this span should be filtered + span_name = event.get("span_attributes", {}).get("name", "") + if span_name == "sibling": if span_id: filtered_span_ids.add(span_id) return None return event - # Set filtering function globally - braintrust.set_filtering_function(filtering_function) + braintrust.set_filtering_function(filter_sibling_with_cascade) - # Create test logger test_logger = init_test_logger("test_project") - # Create a parent span - with test_logger.start_span(name="parent_span") as parent: - # Log to parent (should be kept) - parent.log(input="parent input", output="result1") - - # Create a child span that should be filtered out - with parent.start_span(name="internal_helper") as child: - child.log(input="hidden input", output="hidden output") + with test_logger.start_span(name="root") as root: + with root.start_span(name="parent") as parent: + with parent.start_span(name="child") as child: + child.log(input="child data") - # Create another child span that should be kept - with parent.start_span(name="public_helper") as child2: - child2.log(input="public input", output="result2") + 
with parent.start_span(name="sibling") as sibling: + with sibling.start_span(name="descendant") as descendant: + descendant.log(input="descendant data") - # Check the logged data logs = with_memory_logger.pop() - # Should have events from parent_span and public_helper, but not internal_helper - # span.log() on existing spans creates merge events, so we get 2 events - assert len(logs) == 2 - - # First should be from parent_span - assert logs[0]["input"] == "parent input" - assert logs[0]["output"] == "result1" + logged_names = [l.get("span_attributes", {}).get("name") for l in logs] + assert "root" in logged_names + assert "parent" in logged_names + assert "child" in logged_names + assert "sibling" not in logged_names # Filtered + assert "descendant" not in logged_names # Also filtered (cascaded) - # Second should be from public_helper - assert logs[1]["input"] == "public input" - assert logs[1]["output"] == "result2" - - # Clean up braintrust.set_filtering_function(None) @@ -3227,7 +3164,7 @@ def test_extract_attachments_collects_and_replaces(): event = { "input": {"file": attachment1}, "output": {"file": attachment2}, - "metadata": {"files": [attachment1, ext_attachment]} + "metadata": {"files": [attachment1, ext_attachment]}, } attachments = [] @@ -3257,7 +3194,7 @@ def test_extract_attachments_preserves_identity(): event = { "input": attachment, "output": attachment, # Same instance - "metadata": {"file": attachment} # Same instance again + "metadata": {"file": attachment}, # Same instance again } attachments = [] @@ -3296,10 +3233,7 @@ def test_multiple_attachments_upload_tracked(with_memory_logger, with_simulate_l logger = init_test_logger(__name__) span = logger.start_span(name="test_span") - span.log( - input={"file1": attachment1}, - output={"file2": attachment2} - ) + span.log(input={"file1": attachment1}, output={"file2": attachment2}) span.end() logger.flush() @@ -3329,9 +3263,7 @@ def test_same_attachment_logged_twice_tracked_twice(with_memory_logger, 
with_sim def test_external_attachment_upload_tracked(with_memory_logger, with_simulate_login): """Test that ExternalAttachment upload is also tracked.""" ext_attachment = ExternalAttachment( - url="s3://bucket/key.pdf", - filename="external.pdf", - content_type="application/pdf" + url="s3://bucket/key.pdf", filename="external.pdf", content_type="application/pdf" ) logger = init_test_logger(__name__) @@ -3369,11 +3301,7 @@ def test_multiple_attachment_types_tracked(with_memory_logger, with_simulate_log logger = init_test_logger(__name__) span = logger.start_span(name="test_span") - span.log( - input=attachment, - output=json_attachment, - metadata={"file": ext_attachment} - ) + span.log(input=attachment, output=json_attachment, metadata={"file": ext_attachment}) span.end() logger.flush() From cdee484941072597fa9a48879f584a34f3ed068b Mon Sep 17 00:00:00 2001 From: Olmo Maldonado Date: Fri, 30 Jan 2026 15:59:34 -0800 Subject: [PATCH 3/3] add root (filtered) > child + sibling > descendant test --- js/src/logger.test.ts | 106 +++++++++++++++++++++++++++++++ py/src/braintrust/test_logger.py | 106 +++++++++++++++++++++++++++++++ 2 files changed, 212 insertions(+) diff --git a/js/src/logger.test.ts b/js/src/logger.test.ts index c24cbbbeb..47e21c53e 100644 --- a/js/src/logger.test.ts +++ b/js/src/logger.test.ts @@ -1384,4 +1384,110 @@ describe("filtering functionality", () => { expect(loggedNames).not.toContain("sibling"); // Filtered expect(loggedNames).not.toContain("descendant"); // Also filtered (cascaded) }); + + /** + * Test what happens when the ROOT span is filtered. + * + * Given: + * root <-- FILTERED + * parent + * child + * sibling + * descendant + * + * Result: All spans still logged, but parent becomes orphan. + * The internal hierarchy (parent->child, parent->sibling, sibling->descendant) is preserved. + * UI will create synthetic root and put parent under it as orphan. 
+ */ + test("filtering root span - descendants preserve internal hierarchy but parent becomes orphan", async () => { + const filteredSpanIds = new Set(); + + const filterRoot = (event: any): any | null => { + const spanId = event.span_id; + if (spanId && filteredSpanIds.has(spanId)) { + return null; + } + + const spanName = event.span_attributes?.name ?? ""; + if (spanName === "root") { + if (spanId) { + filteredSpanIds.add(spanId); + } + return null; + } + + return event; + }; + + setFilteringFunction(filterRoot); + + const logger = initLogger({ + projectName: "test", + projectId: "test-project-id", + }); + + const root = logger.startSpan({ name: "root" }); + const rootSpanId = root.spanId; + const rootRootSpanId = root.rootSpanId; + + const parent = root.startSpan({ name: "parent" }); + const parentSpanId = parent.spanId; + + const child = parent.startSpan({ name: "child" }); + child.log({ input: "child data" }); + child.end(); + + const sibling = parent.startSpan({ name: "sibling" }); + const siblingSpanId = sibling.spanId; + + const descendant = sibling.startSpan({ name: "descendant" }); + descendant.log({ input: "descendant data" }); + descendant.end(); + + sibling.end(); + parent.end(); + root.end(); + + await memoryLogger.flush(); + const events = await memoryLogger.drain(); + + // Verify root was filtered but all others are logged + const loggedNames = events.map((e: any) => e.span_attributes?.name); + expect(loggedNames).not.toContain("root"); // Filtered + expect(loggedNames).toContain("parent"); + expect(loggedNames).toContain("child"); + expect(loggedNames).toContain("sibling"); + expect(loggedNames).toContain("descendant"); + + // Get each span's log + const parentLog = events.find( + (e: any) => e.span_attributes?.name === "parent", + ); + const childLog = events.find( + (e: any) => e.span_attributes?.name === "child", + ); + const siblingLog = events.find( + (e: any) => e.span_attributes?.name === "sibling", + ); + const descendantLog = events.find( 
+ (e: any) => e.span_attributes?.name === "descendant", + ); + + // All spans still have root_span_id pointing to the filtered root + expect(parentLog.root_span_id).toBe(rootRootSpanId); + expect(childLog.root_span_id).toBe(rootRootSpanId); + expect(siblingLog.root_span_id).toBe(rootRootSpanId); + expect(descendantLog.root_span_id).toBe(rootRootSpanId); + + // Parent's span_parents points to filtered root (making it an orphan in UI) + expect(parentLog.span_parents).toContain(rootSpanId); + + // But the internal hierarchy is preserved: + // child and sibling are children of parent + expect(childLog.span_parents).toContain(parentSpanId); + expect(siblingLog.span_parents).toContain(parentSpanId); + + // descendant is child of sibling + expect(descendantLog.span_parents).toContain(siblingSpanId); + }); }); diff --git a/py/src/braintrust/test_logger.py b/py/src/braintrust/test_logger.py index 924181868..a1c361bbf 100644 --- a/py/src/braintrust/test_logger.py +++ b/py/src/braintrust/test_logger.py @@ -2342,6 +2342,112 @@ def filter_sibling_with_cascade(event): braintrust.set_filtering_function(None) +def test_filtering_root_span_all_descendants_become_orphans(with_memory_logger, with_simulate_login): + """ + Test what happens when the ROOT span is filtered. + + Given this tree: + root <-- FILTERED + parent + child + sibling + descendant + + Question: Will descendants maintain their relative structure? + parent + child + sibling + descendant + + Answer: YES. Only parent becomes an orphan; the hierarchy below it is preserved because: + 1. All spans have root_span_id pointing to the filtered root + 2. parent has span_parents pointing to root (which doesn't exist) + 3. UI will create a synthetic root and put parent under it as orphan + 4. child/sibling are children of parent (which exists), so they're NOT orphans + 5. 
descendant is child of sibling (which exists), so it's NOT an orphan + + So the structure will be: + [synthetic root] + parent <-- orphan (its span_parents points to filtered root) + child + sibling + descendant + + The internal hierarchy is preserved, but parent becomes an orphan. + """ + filtered_span_ids = set() + + def filter_root(event): + span_id = event.get("span_id") + if span_id and span_id in filtered_span_ids: + return None + + span_name = event.get("span_attributes", {}).get("name", "") + if span_name == "root": + if span_id: + filtered_span_ids.add(span_id) + return None + + return event + + braintrust.set_filtering_function(filter_root) + + test_logger = init_test_logger("test_project") + + with test_logger.start_span(name="root") as root: + root_span_id = root.span_id + root_root_span_id = root.root_span_id + + with root.start_span(name="parent") as parent: + parent_span_id = parent.span_id + + with parent.start_span(name="child") as child: + child.log(input="child data") + child_span_id = child.span_id + + with parent.start_span(name="sibling") as sibling: + sibling_span_id = sibling.span_id + + with sibling.start_span(name="descendant") as descendant: + descendant.log(input="descendant data") + descendant_span_id = descendant.span_id + + logs = with_memory_logger.pop() + + # Verify root was filtered but all others are logged + logged_names = [l.get("span_attributes", {}).get("name") for l in logs] + assert "root" not in logged_names # Filtered + assert "parent" in logged_names + assert "child" in logged_names + assert "sibling" in logged_names + assert "descendant" in logged_names + + # Get each span's log + parent_log = next(l for l in logs if l.get("span_attributes", {}).get("name") == "parent") + child_log = next(l for l in logs if l.get("span_attributes", {}).get("name") == "child") + sibling_log = next(l for l in logs if l.get("span_attributes", {}).get("name") == "sibling") + descendant_log = next(l for l in logs if l.get("span_attributes", 
{}).get("name") == "descendant") + + # All spans still have root_span_id pointing to the filtered root + assert parent_log["root_span_id"] == root_root_span_id + assert child_log["root_span_id"] == root_root_span_id + assert sibling_log["root_span_id"] == root_root_span_id + assert descendant_log["root_span_id"] == root_root_span_id + + # Parent's span_parents points to filtered root (making it an orphan in UI) + assert root_span_id in parent_log["span_parents"] + + # But the internal hierarchy is preserved: + # child and sibling are children of parent + assert parent_span_id in child_log["span_parents"] + assert parent_span_id in sibling_log["span_parents"] + + # descendant is child of sibling + assert sibling_span_id in descendant_log["span_parents"] + + braintrust.set_filtering_function(None) + + def test_attachment_unreadable_path_logs_warning(caplog): with caplog.at_level(logging.WARNING, logger="braintrust"): Attachment(