Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 73 additions & 14 deletions benchmarks/locomo_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,15 +136,23 @@ async def _ollama_generate(client: httpx.AsyncClient, url: str, model: str,


async def _judge(client: httpx.AsyncClient, url: str, model: str,
question: str, reference: str, predicted: str) -> float:
question: str, reference: str, predicted: str) -> float | None:
"""Return 1.0/0.0 on a YES/NO judgment, or None if the judge call itself
failed (timeout, network, HTTP error).

None is deliberately distinct from 0.0 — a transport failure is not a
wrong-answer verdict. Callers must treat None as "metric unavailable"
and exclude it from averages so that infra flakiness doesn't bias the
Judge score downward.
"""
try:
reply = await _ollama_generate(
client, url, model,
JUDGE_PROMPT.format(question=question, reference=reference, predicted=predicted),
temperature=0.0,
)
except Exception:
return 0.0
return None
head = reply.strip().upper().split()
return 1.0 if head and head[0].startswith("YES") else 0.0

Expand Down Expand Up @@ -233,45 +241,90 @@ async def _process_qa(qa: dict, conv_id: str, vmem: VectorMemory, client: httpx.

context = _build_context(hits)
t1 = time.time()
generation_error: str | None = None
try:
predicted = await _ollama_generate(
client, ollama_url, model,
ANSWER_PROMPT.format(context=context, question=question),
)
except Exception as exc:
predicted = f"[generation_error: {exc}]"
predicted = ""
generation_error = f"{type(exc).__name__}: {exc}"
gen_ms = (time.time() - t1) * 1000.0

judge = await _judge(client, ollama_url, model, question, reference, predicted)
# If generation failed, do NOT compute scores on a synthetic error string —
# that folds infra failures into the benchmark averages. Record None so
# _summary can exclude the row from F1/BLEU/Judge means. Retrieval-side
# metrics (evidence_hits, retrieval_ms) are still valid, keep them.
if generation_error is None:
judge = await _judge(client, ollama_url, model, question, reference, predicted)
f1_val = round(_f1(predicted, reference), 4)
bleu_val = round(_bleu1(predicted, reference), 4)
judge_val = round(judge, 4) if judge is not None else None
else:
f1_val = None
bleu_val = None
judge_val = None

return {
row: dict = {
"conversation_id": conv_id,
"question": question,
"reference": reference,
"predicted": predicted,
"category": category,
"f1": round(_f1(predicted, reference), 4),
"bleu1": round(_bleu1(predicted, reference), 4),
"judge": round(judge, 4),
"f1": f1_val,
"bleu1": bleu_val,
"judge": judge_val,
"retrieval_ms": round(retrieval_ms, 2),
"gen_ms": round(gen_ms, 2),
"evidence_hits": _evidence_hits(hits, evidence),
"evidence_total": len(evidence),
}
if generation_error is not None:
row["error"] = generation_error
return row


def _summary(rows: list[dict]) -> dict:
"""Aggregate per-category stats, skipping None-valued metrics.

    Judge, F1, BLEU-1 can be None on a row for two reasons:
    (1) generation failed (the whole row has an `error` field), which nulls
    all three metrics, or (2) the judge LLM call itself failed, which nulls
    only the Judge value.
Similarly, `evidence_hits`/`evidence_total` can be None when the memory
backend (e.g. mem0) doesn't round-trip per-turn IDs. In all these cases
the row is excluded from the corresponding mean so infra/adapter
artefacts don't bias the headline numbers.

Also reports `scored_count` (rows contributing to each metric) alongside
`count` (total rows) so callers can see the denominator honestly.
"""
if not rows:
return {"count": 0, "f1": 0.0, "bleu1": 0.0, "judge": 0.0, "retrieval_recall": 0.0}
return {"count": 0, "f1": 0.0, "bleu1": 0.0, "judge": 0.0,
"retrieval_recall": 0.0, "judge_scored": 0, "recall_scored": 0}
n = len(rows)
hit_rows = [r for r in rows if r.get("evidence_total", 0) > 0]
recall = (sum(1 for r in hit_rows if r["evidence_hits"] > 0) / len(hit_rows)) if hit_rows else 0.0

f1_vals = [r["f1"] for r in rows if r.get("f1") is not None]
bleu_vals = [r["bleu1"] for r in rows if r.get("bleu1") is not None]
judge_vals = [r["judge"] for r in rows if r.get("judge") is not None]

recall_rows = [
r for r in rows
if r.get("evidence_total") is not None and r.get("evidence_hits") is not None
and r["evidence_total"] > 0
]
recall = (
sum(1 for r in recall_rows if r["evidence_hits"] > 0) / len(recall_rows)
) if recall_rows else 0.0

return {
"count": n,
"f1": round(sum(r["f1"] for r in rows) / n, 4),
"bleu1": round(sum(r["bleu1"] for r in rows) / n, 4),
"judge": round(sum(r["judge"] for r in rows) / n, 4),
"f1": round(sum(f1_vals) / len(f1_vals), 4) if f1_vals else 0.0,
"bleu1": round(sum(bleu_vals) / len(bleu_vals), 4) if bleu_vals else 0.0,
"judge": round(sum(judge_vals) / len(judge_vals), 4) if judge_vals else 0.0,
"retrieval_recall": round(recall, 4),
"judge_scored": len(judge_vals),
"recall_scored": len(recall_rows),
}


Expand Down Expand Up @@ -339,6 +392,12 @@ async def _guarded(qa: dict, conv_id: str, vmem: VectorMemory,
print(f" qa failed: {e!r}", flush=True)
return None
if outcome is not None:
# Rows with an `error` field had a generation failure — count
# them as failed so the JSON's failed_qa reflects reality, not
# zero just because _process_qa swallowed the exception.
if outcome.get("error"):
async with progress_lock:
failed_qa += 1
async with progress_lock:
total_seen += 1
if total_seen % 25 == 0:
Expand Down
67 changes: 32 additions & 35 deletions benchmarks/mem0_locomo_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,14 @@ async def _process_qa_mem0(
category = int(qa.get("category", 0))
evidence = qa.get("evidence", []) or []

# mem0 stored facts don't carry original LoCoMo dia_id metadata, so we
# cannot match against the gold evidence list. Report both hits and total
# as None — the metric is _unavailable_, not zero. _summary in the taosmd
# runner (and any downstream scorecard builder) skips None-valued rows
# from retrieval_recall so this artefact doesn't publish a fake 0.0.
# TODO: wire dia_id pass-through if mem0 adds metadata preservation.
EVIDENCE_UNAVAILABLE = {"evidence_hits": None, "evidence_total": None}
Comment on lines +303 to +309
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

These None sentinels currently break _summary() in this file.

Every returned row now carries evidence_total=None, but the local _summary() still does r.get("evidence_total", 0) > 0, so aggregation will raise TypeError on the first mem0 result. The new f1/bleu1/judge=None paths also conflict with its raw sum(...) / n averages. Please port the benchmarks/locomo_runner.py::_summary filtering/counting logic here before merging.

Also applies to: 318-330, 348-369

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@benchmarks/mem0_locomo_runner.py` around lines 303 - 309, The _summary
aggregation in this file is failing because rows now contain None sentinels
(EVIDENCE_UNAVAILABLE) for evidence fields and None for f1/bleu1/judge, so
change the _summary implementation to mirror the safe filtering/counting used in
benchmarks/locomo_runner.py: skip rows where evidence_total is None or where
metric values are None before summing/averaging, treat None as "unavailable" not
zero, and compute counts using explicit filters (e.g., only include rows with
r.get("evidence_total") and r.get("f1") is not None) so sums/averages use
numeric values only; update all places referencing evidence_total/evidence_hits
and f1/bleu1/judge in _summary to use this guarded logic and preserve
EVIDENCE_UNAVAILABLE behavior.


t0 = time.time()
try:
# mem0.search is synchronous; offload to thread pool to avoid blocking.
Expand All @@ -313,67 +321,56 @@ async def _process_qa_mem0(
"reference": reference,
"predicted": "",
"category": category,
"f1": 0.0,
"bleu1": 0.0,
"judge": 0.0,
"f1": None,
"bleu1": None,
"judge": None,
"retrieval_ms": 0.0,
"gen_ms": 0.0,
# evidence_hits/evidence_total not computable without dia_id metadata from
# mem0's stored facts (mem0 doesn't preserve per-turn dia_ids). Set to 0.
# TODO: if mem0 exposes metadata round-trip, wire dia_id here.
"evidence_hits": 0,
"evidence_total": len(evidence),
"error": str(exc),
**EVIDENCE_UNAVAILABLE,
"error": f"{type(exc).__name__}: {exc}",
}
retrieval_ms = (time.time() - t0) * 1000.0

context = "\n---\n".join(context_chunks) if context_chunks else ""

t1 = time.time()
generation_error: str | None = None
try:
predicted = await _ollama_generate(
client, ollama_url, model,
ANSWER_PROMPT.format(context=context, question=question),
)
except Exception as exc:
predicted = ""
gen_ms = (time.time() - t1) * 1000.0
return {
"conversation_id": conv_id,
"question": question,
"reference": reference,
"predicted": predicted,
"category": category,
"f1": 0.0,
"bleu1": 0.0,
"judge": 0.0,
"retrieval_ms": round(retrieval_ms, 2),
"gen_ms": round(gen_ms, 2),
"evidence_hits": 0,
"evidence_total": len(evidence),
"error": str(exc),
}
generation_error = f"{type(exc).__name__}: {exc}"
gen_ms = (time.time() - t1) * 1000.0

judge = await _judge(client, ollama_url, model, question, reference, predicted)
if generation_error is None:
judge = await _judge(client, ollama_url, model, question, reference, predicted)
f1_val = round(_f1(predicted, reference), 4)
bleu_val = round(_bleu1(predicted, reference), 4)
judge_val = round(judge, 4) if judge is not None else None
Comment on lines +348 to +352
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Judge transport failures are still treated as wrong answers here.

This branch expects _judge() to return None, but the local implementation still returns 0.0 on exceptions. As written, mem0 judge outages will continue to depress the Judge average instead of being excluded.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@benchmarks/mem0_locomo_runner.py` around lines 348 - 352, The code treats
judge outages as valid zeros because _judge currently returns 0.0 on exceptions;
update the implementation so transport/remote failures return None (not 0.0),
and keep this call site logic (the variable judge and the rounding into
judge_val) intact; specifically modify the _judge(...) function to catch
transport/HTTP errors and return None on those failure paths (rather than 0.0)
so judge_val becomes None and outages are excluded from averages.

else:
f1_val = None
bleu_val = None
judge_val = None

return {
row: dict = {
"conversation_id": conv_id,
"question": question,
"reference": reference,
"predicted": predicted,
"category": category,
"f1": round(_f1(predicted, reference), 4),
"bleu1": round(_bleu1(predicted, reference), 4),
"judge": round(judge, 4),
"f1": f1_val,
"bleu1": bleu_val,
"judge": judge_val,
"retrieval_ms": round(retrieval_ms, 2),
"gen_ms": round(gen_ms, 2),
# mem0 stored facts don't carry original dia_id metadata, so we cannot
# match against the LoCoMo gold evidence list. Set evidence_hits to 0.
# TODO: wire dia_id if mem0 adds metadata pass-through in a future release.
"evidence_hits": 0,
"evidence_total": len(evidence),
**EVIDENCE_UNAVAILABLE,
}
if generation_error is not None:
row["error"] = generation_error
return row


# ---------------------------------------------------------------------------
Expand Down