Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Tooling

- **Benchmark report scatter plots now assign a unique color per model.** `_render_pareto`, `_render_precision_recall_chart`, and `_render_token_efficiency_chart` were drawing from `tab20` with `i % 20`, so any run with more than 20 models silently reused colors and made the legend ambiguous. Added `_distinct_colors(n)` that concatenates `tab20 + tab20b + tab20c` (60 categorical colors) and falls back to evenly-spaced `hsv` past that. Benchmark-only change; not shipped in the runtime image, no version bump.

## [2.4.8] - 2026-05-15

### Security
Expand Down
158 changes: 79 additions & 79 deletions benchmarks/llm/results/report.md

Large diffs are not rendered by default.

176 changes: 88 additions & 88 deletions benchmarks/llm/results/report_assets/agreement.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
356 changes: 178 additions & 178 deletions benchmarks/llm/results/report_assets/alignment.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
218 changes: 109 additions & 109 deletions benchmarks/llm/results/report_assets/boundary.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
102 changes: 51 additions & 51 deletions benchmarks/llm/results/report_assets/calibration.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
258 changes: 129 additions & 129 deletions benchmarks/llm/results/report_assets/compliance.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
98 changes: 49 additions & 49 deletions benchmarks/llm/results/report_assets/detection_by_length.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
98 changes: 49 additions & 49 deletions benchmarks/llm/results/report_assets/detection_by_position.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
110 changes: 55 additions & 55 deletions benchmarks/llm/results/report_assets/episodes.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
462 changes: 231 additions & 231 deletions benchmarks/llm/results/report_assets/latency_tail.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
425 changes: 284 additions & 141 deletions benchmarks/llm/results/report_assets/pareto.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
702 changes: 352 additions & 350 deletions benchmarks/llm/results/report_assets/parser_stress.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
433 changes: 288 additions & 145 deletions benchmarks/llm/results/report_assets/precision_recall.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
507 changes: 325 additions & 182 deletions benchmarks/llm/results/report_assets/token_efficiency.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
176 changes: 88 additions & 88 deletions benchmarks/llm/results/report_assets/trial_variance.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
32 changes: 25 additions & 7 deletions benchmarks/llm/src/benchmark/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -1016,6 +1016,25 @@ def _avg_f1(stats: ModelStats) -> float:
return stats.avg_f1


# Names of the matplotlib colormaps whose color lists _distinct_colors
# concatenates, in this order, to build its categorical palette.
_CATEGORICAL_CMAPS = ("tab20", "tab20b", "tab20c")  # 20 colors each = 60 total


def _distinct_colors(n: int) -> list[tuple[float, ...]]:
"""Return n visually distinct RGBA tuples, never repeating.

Concatenating the tab20 family gives 60 categorical colors with good
perceptual contrast. Past that, fall back to evenly-spaced hsv samples
so each model still gets a unique color, accepting weaker contrast."""
import matplotlib.pyplot as plt
palette: list[tuple[float, ...]] = []
for name in _CATEGORICAL_CMAPS:
palette.extend(plt.get_cmap(name).colors)
if n <= len(palette):
return palette[:n]
cmap = plt.get_cmap("hsv")
return [cmap(i / n) for i in range(n)]


def _render_pareto(stats: dict[str, ModelStats], path: Path) -> None:
"""Distinct color per model, legend rendered as a real matplotlib legend
below the plot so each model's color sits next to its name."""
Expand All @@ -1027,13 +1046,12 @@ def _render_pareto(stats: dict[str, ModelStats], path: Path) -> None:
points = [(s, f1) for s, f1 in points if not (f1 == 0 and s.total_episode_cost == 0)]
points.sort(key=lambda t: (-t[1], t[0].total_episode_cost)) # rank by F1 desc, then cost asc

cmap = plt.get_cmap("tab20")
colors = _distinct_colors(len(points))
fig, ax = plt.subplots(figsize=(11, 9))
for i, (s, f1) in enumerate(points):
color = cmap(i % 20)
ax.scatter(
s.total_episode_cost, f1,
s=180, color=color,
s=180, color=colors[i],
edgecolors="black", linewidths=0.7, zorder=3,
label=f"{s.model} (F1 {f1:.3f}, ${s.total_episode_cost:.4f}/ep)",
)
Expand Down Expand Up @@ -1683,7 +1701,7 @@ def _render_precision_recall_chart(stats: dict[str, ModelStats], path: Path) ->
return
points.sort(key=lambda t: -(2 * t[1] * t[2] / (t[1] + t[2]) if (t[1] + t[2]) > 0 else 0)) # F1 desc

cmap = plt.get_cmap("tab20")
colors = _distinct_colors(len(points))
fig, ax = plt.subplots(figsize=(11, 9))

# F1 isocurves: for each target F1, plot the curve precision*recall*2 / (p+r) = F1
Expand All @@ -1699,7 +1717,7 @@ def _render_precision_recall_chart(stats: dict[str, ModelStats], path: Path) ->

for i, (s, p, r) in enumerate(points):
f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0
ax.scatter(r, p, s=180, color=cmap(i % 20),
ax.scatter(r, p, s=180, color=colors[i],
edgecolors="black", linewidths=0.7, zorder=3,
label=f"{s.model} (P {p:.2f}, R {r:.2f}, F1 {f1:.2f})")

Expand Down Expand Up @@ -1775,10 +1793,10 @@ def _render_token_efficiency_chart(stats: dict[str, ModelStats], path: Path) ->
return
points.sort(key=lambda t: -t[1])

cmap = plt.get_cmap("tab20")
colors = _distinct_colors(len(points))
fig, ax = plt.subplots(figsize=(11, 8))
for i, (s, f1) in enumerate(points):
ax.scatter(s.tokens_per_detected_ad, f1, s=180, color=cmap(i % 20),
ax.scatter(s.tokens_per_detected_ad, f1, s=180, color=colors[i],
edgecolors="black", linewidths=0.7, zorder=3,
label=f"{s.model} (F1 {f1:.2f}, {s.tokens_per_detected_ad:.0f} tok/ad)")
ax.set_xscale("log")
Expand Down
Loading