Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
454 changes: 208 additions & 246 deletions benchmarks/llm/results/report.md

Large diffs are not rendered by default.

785 changes: 395 additions & 390 deletions benchmarks/llm/results/report_assets/agreement.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1,896 changes: 913 additions & 983 deletions benchmarks/llm/results/report_assets/alignment.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
218 changes: 109 additions & 109 deletions benchmarks/llm/results/report_assets/boundary.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1,114 changes: 507 additions & 607 deletions benchmarks/llm/results/report_assets/calibration.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
232 changes: 116 additions & 116 deletions benchmarks/llm/results/report_assets/compliance.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1,136 changes: 526 additions & 610 deletions benchmarks/llm/results/report_assets/detection_by_length.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1,136 changes: 526 additions & 610 deletions benchmarks/llm/results/report_assets/detection_by_position.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
110 changes: 55 additions & 55 deletions benchmarks/llm/results/report_assets/episodes.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
462 changes: 231 additions & 231 deletions benchmarks/llm/results/report_assets/latency_tail.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
282 changes: 141 additions & 141 deletions benchmarks/llm/results/report_assets/pareto.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
826 changes: 412 additions & 414 deletions benchmarks/llm/results/report_assets/parser_stress.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
290 changes: 145 additions & 145 deletions benchmarks/llm/results/report_assets/precision_recall.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
364 changes: 182 additions & 182 deletions benchmarks/llm/results/report_assets/token_efficiency.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
396 changes: 198 additions & 198 deletions benchmarks/llm/results/report_assets/trial_variance.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
33 changes: 24 additions & 9 deletions benchmarks/llm/src/benchmark/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,19 +135,22 @@ def render(
active = {mid: s for mid, s in by_model.items() if mid not in deprecated_ids}
deprecated = {mid: s for mid, s in by_model.items() if mid in deprecated_ids}

extras_active = extras.without(deprecated_ids)
calls_active = calls if not deprecated_ids else [r for r in calls if r["model"] not in deprecated_ids]

sections = [
_render_how_to_read(),
_render_tldr(active, episodes),
_render_charts_section(),
_render_failures(calls),
_render_failures(calls_active),
_render_accuracy_breakdown(active),
_render_boundary_accuracy(active),
_render_calibration_table(extras.calibration),
_render_calibration_table(extras_active.calibration),
_render_latency_tail(active),
_render_token_efficiency(active),
_render_trial_variance(active),
_render_cross_model_agreement(extras.agreement, active),
_render_detection_buckets(extras.detection_buckets),
_render_cross_model_agreement(extras_active.agreement, active),
_render_detection_buckets(extras_active.detection_buckets),
_render_quick_comparison(active, episodes),
"---",
"## Detailed Results",
Expand All @@ -171,22 +174,22 @@ def render(
_render_pareto(active, assets_dir / "pareto.svg")
_render_compliance(active, assets_dir / "compliance.svg")
_render_episode_heatmap(active, episodes, assets_dir / "episodes.svg")
_render_calibration_chart(extras.calibration, assets_dir / "calibration.svg")
_render_calibration_chart(extras_active.calibration, assets_dir / "calibration.svg")
_render_latency_tail_chart(active, assets_dir / "latency_tail.svg")
_render_agreement_chart(extras.agreement, len(active), assets_dir / "agreement.svg")
_render_alignment_chart(extras.agreement, len(active), assets_dir / "alignment.svg")
_render_agreement_chart(extras_active.agreement, len(active), assets_dir / "agreement.svg")
_render_alignment_chart(extras_active.agreement, len(active), assets_dir / "alignment.svg")
_render_precision_recall_chart(active, assets_dir / "precision_recall.svg")
_render_boundary_chart(active, assets_dir / "boundary.svg")
_render_token_efficiency_chart(active, assets_dir / "token_efficiency.svg")
_render_trial_variance_chart(active, assets_dir / "trial_variance.svg")
_render_detection_bucket_chart(
extras.detection_buckets, "length",
extras_active.detection_buckets, "length",
["short (<30s)", "medium (30-90s)", "long (>=90s)"],
"Detection rate by ad length (rows sorted by overall detection rate, descending)",
assets_dir / "detection_by_length.svg",
)
_render_detection_bucket_chart(
extras.detection_buckets, "position",
extras_active.detection_buckets, "position",
["pre-roll (<10%)", "mid-roll (10-90%)", "post-roll (>90%)"],
"Detection rate by ad position (rows sorted by overall detection rate, descending)",
assets_dir / "detection_by_position.svg",
Expand All @@ -200,6 +203,18 @@ class _Extras:
calibration: dict[str, list[tuple[float, bool]]] # model -> [(confidence, is_tp), ...]
agreement: dict[tuple[str, int], dict[str, int]] # (episode, window_idx) -> {model: n_predicted_ads}
detection_buckets: dict[str, dict[str, dict[str, list[bool]]]]

def without(self, model_ids: set[str]) -> "_Extras":
    """Return these extras with every entry for ``model_ids`` filtered out.

    When ``model_ids`` is empty the same instance is returned unchanged
    (no copy is made). Otherwise a new ``_Extras`` is built in which:

    * ``calibration`` and ``detection_buckets`` lose the top-level keys
      that appear in ``model_ids``;
    * each per-window dict in ``agreement`` loses the excluded models,
      while the window key itself is always kept, even if its dict ends
      up empty.
    """
    if not model_ids:
        # Nothing to exclude: hand back this very object.
        return self

    def _drop_excluded(per_model):
        # Shallow filter of a model-keyed mapping; values are shared, not copied.
        return {name: val for name, val in per_model.items() if name not in model_ids}

    return _Extras(
        calibration=_drop_excluded(self.calibration),
        agreement={
            window: _drop_excluded(counts)
            for window, counts in self.agreement.items()
        },
        detection_buckets=_drop_excluded(self.detection_buckets),
    )
# Shape of the `detection_buckets` field: detection_buckets[model][bucket_kind][bucket_label] -> list of bool (was each truth-ad in this bucket detected?)


Expand Down
Loading