Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 93 additions & 27 deletions viewer/index.v2.html
Original file line number Diff line number Diff line change
Expand Up @@ -413,14 +413,36 @@
font-size: 0.78rem; color: var(--muted); white-space: nowrap;
}
.domain-bar-filter-toggle {
margin-left: auto;
white-space: nowrap;
border-radius: 8px;
padding: 7px 12px;
font-size: 0.78rem;
font-weight: 600;
min-height: 32px;
}
.domain-bar-sort-inline {
margin-left: auto;
display: inline-flex; align-items: center; gap: 6px;
font-size: 0.78rem; color: var(--muted);
white-space: nowrap;
}
.domain-bar-sort-inline > span { font-weight: 600; color: #35423e; }
.domain-bar-sort-inline select {
border: 1px solid var(--line);
background: var(--panel);
color: var(--ink);
border-radius: 8px;
padding: 6px 8px;
font-size: 0.78rem;
font-weight: 600;
min-height: 32px;
cursor: pointer;
}
.domain-bar-sort-inline select:hover { border-color: var(--line-strong); background: #ebe7de; }
.domain-bar-sort-inline select:focus-visible {
outline: 2px solid rgba(29, 91, 80, 0.3);
outline-offset: 2px;
}
.domain-bar-filter-toggle[aria-expanded="true"] {
background: var(--panel-alt);
border-color: #7f9087;
Expand Down Expand Up @@ -1303,6 +1325,13 @@ <h1 id="heroTitle">BullshitBench</h1>
<div class="domain-bar-pills" id="domainBarPills"></div>
<span class="domain-bar-sep"></span>
<span class="filter-count" id="domainBarSummary">Overall</span>
<label class="domain-bar-sort-inline" title="Choose which outcome drives the main-view rankings">
<span>Sort by</span>
<select id="domainBarSortSelect" aria-label="Primary ranking metric">
<option value="green">Clear Pushback</option>
<option value="red">Accepted Nonsense</option>
</select>
</label>
<button type="button" id="domainBarFilterToggle" class="domain-bar-filter-toggle alt" aria-controls="filtersCollapsible" aria-expanded="false">Show Filters</button>
<button type="button" id="domainBarPinToggle" class="domain-bar-pin-toggle" aria-pressed="false" aria-label="Pin Domain Scope banner to top" title="Pin Domain Scope banner to top">
<svg viewBox="0 0 16 16" aria-hidden="true">
Expand Down Expand Up @@ -1680,6 +1709,39 @@ <h2 data-bench-heading="Response Viewer">BullshitBench: Response Viewer</h2>
};
const CATEGORY_ORDER = ["green", "amber", "red", "error"];

// Per-metric metadata for the primary ranking selector.
//   label        - human-readable metric name
//   rateField    - name of the rate field read from a summarized row
//   lbKey        - leaderboard sort key used as this metric's default
//   strongestDir - puts the best model at the top: desc for green
//                  (higher = better), asc for red (lower = better).
//                  Keeps the "strongest model wins" framing.
const PRIMARY_METRIC_META = {
  green: { label: "Clear Pushback", rateField: "greenRate", lbKey: "greenRate", strongestDir: "desc" },
  red: { label: "Accepted Nonsense", rateField: "redRate", lbKey: "redRate", strongestDir: "asc" },
};
const DEFAULT_PRIMARY_METRIC = "green";

/**
 * Look up the metadata entry for a primary ranking metric.
 *
 * Only the table's own keys count as known metrics. A plain `obj[key]`
 * lookup would also resolve inherited names such as "constructor" or
 * "toString" to truthy Object.prototype members, skipping the fallback
 * and handing callers a value with no label/rateField/lbKey.
 *
 * @param {string} metric - requested metric key (e.g. "green" or "red").
 * @returns {{label: string, rateField: string, lbKey: string, strongestDir: string}}
 *   the entry for `metric`, or the default metric's entry when `metric`
 *   is missing or unknown.
 */
function primaryMetricMeta(metric) {
  const isKnown = Object.prototype.hasOwnProperty.call(PRIMARY_METRIC_META, metric);
  return PRIMARY_METRIC_META[isKnown ? metric : DEFAULT_PRIMARY_METRIC];
}
/**
 * Build a sort comparator that ranks rows "strongest first" for a metric.
 *
 * The metric is resolved once, when the comparator is created: the explicit
 * `metric` argument wins, then S.primaryMetric, then DEFAULT_PRIMARY_METRIC.
 * Any resolved value other than "red" uses the green ordering.
 *
 *   red   : redRate ascending, then greenRate descending, then amberRate descending
 *   green : greenRate descending, then amberRate descending, then redRate ascending
 *
 * Rows that tie on all three rates are ordered by `localeCompare` on the
 * stringified `labelKey` field (a falsy value compares as "").
 *
 * @param {string} [labelKey="label"] - row field used for the final tie-break.
 * @param {string} [metric] - metric override; falls back as described above.
 * @returns {(a: object, b: object) => number} comparator for Array.prototype.sort.
 */
function primaryMetricComparator(labelKey = "label", metric = undefined) {
  const active = metric || S.primaryMetric || DEFAULT_PRIMARY_METRIC;
  // Each entry is [rate field, whether the higher value should sort first].
  const rateOrder = active === "red"
    ? [["redRate", false], ["greenRate", true], ["amberRate", true]]
    : [["greenRate", true], ["amberRate", true], ["redRate", false]];
  const labelOf = (row) => String(row[labelKey] || "");

  return (a, b) => {
    for (const [field, higherFirst] of rateOrder) {
      const delta = higherFirst ? b[field] - a[field] : a[field] - b[field];
      // A zero or NaN difference is falsy and falls through to the next key.
      if (delta) return delta;
    }
    return labelOf(a).localeCompare(labelOf(b));
  };
}
/**
 * Default leaderboard sort state for a primary metric.
 *
 * @param {string} metric - primary metric key, resolved through primaryMetricMeta.
 * @returns {{key: string, direction: string}} the metric's `lbKey` as the sort
 *   key and its `strongestDir` as the sort direction.
 */
function defaultLbSortForMetric(metric) {
  const { lbKey: key, strongestDir: direction } = primaryMetricMeta(metric);
  return { key, direction };
}

const ORG_COLORS = {
anthropic: "#f97316", openai: "#1f9d55", google: "#1a73e8",
meta: "#0866ff",
Expand Down Expand Up @@ -1748,7 +1810,9 @@ <h2 data-bench-heading="Response Viewer">BullshitBench: Response Viewer</h2>
topVariantsOnly: false,
heroPinned: false,
domainPinned: false,
drilldown: null, lbSort: { key: "greenRate", direction: "desc" },
drilldown: null,
primaryMetric: DEFAULT_PRIMARY_METRIC,
lbSort: defaultLbSortForMetric(DEFAULT_PRIMARY_METRIC),
tokenSort: "total",
techniqueSeed: Math.floor(Math.random() * 1e9),
legendExpanded: {},
Expand Down Expand Up @@ -2689,12 +2753,7 @@ <h2 data-bench-heading="Response Viewer">BullshitBench: Response Viewer</h2>
...s,
};
});
rows.sort((a,b)=>
(b.greenRate-a.greenRate) ||
(b.amberRate-a.amberRate) ||
(a.redRate-b.redRate) ||
a.label.localeCompare(b.label)
);
rows.sort(primaryMetricComparator("label"));
rows.forEach((row, idx) => {
row.rank = idx + 1;
});
Expand Down Expand Up @@ -2757,7 +2816,8 @@ <h2 data-bench-heading="Response Viewer">BullshitBench: Response Viewer</h2>
});
return { mk, label:prettyModel(sample), org:modelOrg(sample), overall, byDomain };
});
modelData.sort((a,b)=>b.overall.greenRate-a.overall.greenRate||a.label.localeCompare(b.label));
const overallCmp = primaryMetricComparator("label");
modelData.sort((a,b)=>overallCmp({...a.overall, label:a.label}, {...b.overall, label:b.label}));

const heatColor = (rate) => {
if(rate===null) return "#f3f6f4";
Expand Down Expand Up @@ -4297,13 +4357,8 @@ <h3 style="margin:0 0 6px;">${esc(prettyModel(sample))} -- ${esc(domLabel)}</h3>
avgCost,
};
});
// Baseline ranking
const baseline=[...rows].sort((a,b)=>
(b.greenRate-a.greenRate) ||
(b.amberRate-a.amberRate) ||
(a.redRate-b.redRate) ||
a.model.localeCompare(b.model)
);
const baselineCmp = primaryMetricComparator("model");
const baseline=[...rows].sort(baselineCmp);
baseline.forEach((r,i)=>r.rank=i+1);
// Apply user sort
const {key,direction}=S.lbSort;
Expand All @@ -4313,14 +4368,7 @@ <h3 style="margin:0 0 6px;">${esc(prettyModel(sample))} -- ${esc(domLabel)}</h3>
let cmp=0;
if(numKeys.has(key)) cmp=Number(a[key]||0)-Number(b[key]||0);
else cmp=String(a[key]||"").localeCompare(String(b[key]||""));
return cmp!==0
? cmp*dir
: (
(b.greenRate-a.greenRate) ||
(b.amberRate-a.amberRate) ||
(a.redRate-b.redRate) ||
a.model.localeCompare(b.model)
);
return cmp!==0 ? cmp*dir : baselineCmp(a,b);
});

const lbBody = document.getElementById("lbBody");
Expand Down Expand Up @@ -4362,14 +4410,15 @@ <h3 style="margin:0 0 6px;">${esc(prettyModel(sample))} -- ${esc(domLabel)}</h3>
const qGroups = groupBy(filtered, r=>r.question_id||"");
const qEntries = [...qGroups.entries()].map(([qid,rows])=>{
const s=summarize(rows), sample=rows[0]||{};
return { qid, rows, sample, greenRate:s.greenRate };
}).sort((a,b)=>b.greenRate-a.greenRate||a.qid.localeCompare(b.qid));
return { qid, rows, sample, greenRate:s.greenRate, amberRate:s.amberRate, redRate:s.redRate };
}).sort(primaryMetricComparator("qid"));

const metricMeta = primaryMetricMeta(S.primaryMetric);
const qSel=document.getElementById("compareQuestion");
const prev=qSel.value;
qSel.innerHTML=qEntries.map(e=>{
const qm=questionMeta(e.qid);
const parts = [e.qid, fmtPct(e.greenRate)];
const parts = [e.qid, fmtPct(e[metricMeta.rateField])];
const scope = questionScopeLabel(qm);
if (scope) parts.push(scope);
if (qm.difficulty) parts.push(DIFFICULTY_LABELS[qm.difficulty] || qm.difficulty);
Expand Down Expand Up @@ -4632,6 +4681,19 @@ <h4 class="title"><span class="org-dot" style="background:${orgColor(org)};"></s
renderAll();
});

// Primary ranking metric selector: mirror the current state into the
// dropdown, then re-rank whenever the user picks a different metric.
const sortSelect = document.getElementById("domainBarSortSelect");
if (sortSelect) {
  sortSelect.value = S.primaryMetric;
  sortSelect.addEventListener("change", () => {
    const picked = sortSelect.value;
    // Values missing from the metric table fall back to the default metric.
    const next = PRIMARY_METRIC_META[picked] ? picked : DEFAULT_PRIMARY_METRIC;
    if (next !== S.primaryMetric) {
      S.primaryMetric = next;
      // Changing the metric also resets the leaderboard sort to that metric's default.
      S.lbSort = defaultLbSortForMetric(next);
      renderAll();
    }
  });
}

// Sticky filter toggle button
const filterToggleBtn = document.getElementById("domainBarFilterToggle");
const filterDetails = document.getElementById("filtersCollapsible");
Expand Down Expand Up @@ -4689,6 +4751,10 @@ <h4 class="title"><span class="org-dot" style="background:${orgColor(org)};"></s
document.getElementById("leaderboardRecentSelect").value=LEADERBOARD_WINDOW_DEFAULT;
[1,2,3].forEach(i=>{if(!document.getElementById(`judge${i}Toggle`).disabled) document.getElementById(`judge${i}Toggle`).checked=true;});
syncJudges(); S.hiddenModels.clear(); S.topVariantsOnly = false; S.leaderboardRecentWindow = LEADERBOARD_WINDOW_DEFAULT; S.modelChartFocusOrg = "all";
S.primaryMetric = DEFAULT_PRIMARY_METRIC;
S.lbSort = defaultLbSortForMetric(DEFAULT_PRIMARY_METRIC);
const sortSel = document.getElementById("domainBarSortSelect");
if (sortSel) sortSel.value = DEFAULT_PRIMARY_METRIC;
[...document.querySelectorAll('input[name="scoreFilter"]')].forEach(n=>n.checked=true);
refreshTopVariantToggle();
renderModelToggles(); renderAll();
Expand Down