diff --git a/benchmarks/locomo_runner.py b/benchmarks/locomo_runner.py
index 11e421d..1ba4423 100644
--- a/benchmarks/locomo_runner.py
+++ b/benchmarks/locomo_runner.py
@@ -136,7 +136,15 @@ async def _ollama_generate(client: httpx.AsyncClient, url: str, model: str,
 
 
 async def _judge(client: httpx.AsyncClient, url: str, model: str,
-                 question: str, reference: str, predicted: str) -> float:
+                 question: str, reference: str, predicted: str) -> float | None:
+    """Return 1.0/0.0 on a YES/NO judgment, or None if the judge call itself
+    failed (timeout, network, HTTP error).
+
+    None is deliberately distinct from 0.0 — a transport failure is not a
+    wrong-answer verdict. Callers must treat None as "metric unavailable"
+    and exclude it from averages so that infra flakiness doesn't bias the
+    Judge score downward.
+    """
     try:
         reply = await _ollama_generate(
             client, url, model,
@@ -144,7 +152,7 @@ async def _judge(client: httpx.AsyncClient, url: str, model: str,
             temperature=0.0,
         )
     except Exception:
-        return 0.0
+        return None
     head = reply.strip().upper().split()
     return 1.0 if head and head[0].startswith("YES") else 0.0
 
@@ -233,45 +241,90 @@ async def _process_qa(qa: dict, conv_id: str, vmem: VectorMemory, client: httpx.
     context = _build_context(hits)
 
     t1 = time.time()
+    generation_error: str | None = None
     try:
         predicted = await _ollama_generate(
             client, ollama_url, model,
             ANSWER_PROMPT.format(context=context, question=question),
         )
     except Exception as exc:
-        predicted = f"[generation_error: {exc}]"
+        predicted = ""
+        generation_error = f"{type(exc).__name__}: {exc}"
     gen_ms = (time.time() - t1) * 1000.0
 
-    judge = await _judge(client, ollama_url, model, question, reference, predicted)
+    # If generation failed, do NOT compute scores on a synthetic error string —
+    # that folds infra failures into the benchmark averages. Record None so
+    # _summary can exclude the row from F1/BLEU/Judge means. Retrieval-side
+    # metrics (evidence_hits, retrieval_ms) are still valid, keep them.
+    if generation_error is None:
+        judge = await _judge(client, ollama_url, model, question, reference, predicted)
+        f1_val = round(_f1(predicted, reference), 4)
+        bleu_val = round(_bleu1(predicted, reference), 4)
+        judge_val = round(judge, 4) if judge is not None else None
+    else:
+        f1_val = None
+        bleu_val = None
+        judge_val = None
 
-    return {
+    row: dict = {
         "conversation_id": conv_id,
         "question": question,
         "reference": reference,
         "predicted": predicted,
         "category": category,
-        "f1": round(_f1(predicted, reference), 4),
-        "bleu1": round(_bleu1(predicted, reference), 4),
-        "judge": round(judge, 4),
+        "f1": f1_val,
+        "bleu1": bleu_val,
+        "judge": judge_val,
         "retrieval_ms": round(retrieval_ms, 2),
         "gen_ms": round(gen_ms, 2),
         "evidence_hits": _evidence_hits(hits, evidence),
         "evidence_total": len(evidence),
     }
+    if generation_error is not None:
+        row["error"] = generation_error
+    return row
 
 
 def _summary(rows: list[dict]) -> dict:
+    """Aggregate per-category stats, skipping None-valued metrics.
+
+    Judge, F1, BLEU-1 can each be None on a row for two reasons:
+      (1) generation failed (the whole row has an `error` field), or
+      (2) the judge LLM call timed out.
+    Similarly, `evidence_hits`/`evidence_total` can be None when the memory
+    backend (e.g. mem0) doesn't round-trip per-turn IDs. In all these cases
+    the row is excluded from the corresponding mean so infra/adapter
+    artefacts don't bias the headline numbers.
+
+    Also reports `judge_scored`/`recall_scored` (rows contributing to each
+    metric) alongside `count` (total rows) so callers see the denominator.
+    """
     if not rows:
-        return {"count": 0, "f1": 0.0, "bleu1": 0.0, "judge": 0.0, "retrieval_recall": 0.0}
+        return {"count": 0, "f1": 0.0, "bleu1": 0.0, "judge": 0.0,
+                "retrieval_recall": 0.0, "judge_scored": 0, "recall_scored": 0}
     n = len(rows)
-    hit_rows = [r for r in rows if r.get("evidence_total", 0) > 0]
-    recall = (sum(1 for r in hit_rows if r["evidence_hits"] > 0) / len(hit_rows)) if hit_rows else 0.0
+
+    f1_vals = [r["f1"] for r in rows if r.get("f1") is not None]
+    bleu_vals = [r["bleu1"] for r in rows if r.get("bleu1") is not None]
+    judge_vals = [r["judge"] for r in rows if r.get("judge") is not None]
+
+    recall_rows = [
+        r for r in rows
+        if r.get("evidence_total") is not None and r.get("evidence_hits") is not None
+        and r["evidence_total"] > 0
+    ]
+    recall = (
+        sum(1 for r in recall_rows if r["evidence_hits"] > 0) / len(recall_rows)
+    ) if recall_rows else 0.0
+
     return {
         "count": n,
-        "f1": round(sum(r["f1"] for r in rows) / n, 4),
-        "bleu1": round(sum(r["bleu1"] for r in rows) / n, 4),
-        "judge": round(sum(r["judge"] for r in rows) / n, 4),
+        "f1": round(sum(f1_vals) / len(f1_vals), 4) if f1_vals else 0.0,
+        "bleu1": round(sum(bleu_vals) / len(bleu_vals), 4) if bleu_vals else 0.0,
+        "judge": round(sum(judge_vals) / len(judge_vals), 4) if judge_vals else 0.0,
         "retrieval_recall": round(recall, 4),
+        "judge_scored": len(judge_vals),
+        "recall_scored": len(recall_rows),
     }
@@ -339,6 +392,12 @@ async def _guarded(qa: dict, conv_id: str, vmem: VectorMemory,
         print(f" qa failed: {e!r}", flush=True)
         return None
     if outcome is not None:
+        # Rows with an `error` field had a generation failure — count
+        # them as failed so the JSON's failed_qa reflects reality, not
+        # zero just because _process_qa swallowed the exception.
+        if outcome.get("error"):
+            async with progress_lock:
+                failed_qa += 1
         async with progress_lock:
             total_seen += 1
             if total_seen % 25 == 0:
diff --git a/benchmarks/mem0_locomo_runner.py b/benchmarks/mem0_locomo_runner.py
index 0795142..9c75459 100644
--- a/benchmarks/mem0_locomo_runner.py
+++ b/benchmarks/mem0_locomo_runner.py
@@ -300,6 +300,14 @@ async def _process_qa_mem0(
     category = int(qa.get("category", 0))
     evidence = qa.get("evidence", []) or []
 
+    # mem0 stored facts don't carry original LoCoMo dia_id metadata, so we
+    # cannot match against the gold evidence list. Report both hits and total
+    # as None — the metric is _unavailable_, not zero. _summary in the taosmd
+    # runner (and any downstream scorecard builder) skips None-valued rows
+    # from retrieval_recall so this artefact doesn't publish a fake 0.0.
+    # TODO: wire dia_id pass-through if mem0 adds metadata preservation.
+    EVIDENCE_UNAVAILABLE = {"evidence_hits": None, "evidence_total": None}
+
     t0 = time.time()
     try:
         # mem0.search is synchronous; offload to thread pool to avoid blocking.
@@ -313,23 +321,20 @@ async def _process_qa_mem0(
             "reference": reference,
             "predicted": "",
             "category": category,
-            "f1": 0.0,
-            "bleu1": 0.0,
-            "judge": 0.0,
+            "f1": None,
+            "bleu1": None,
+            "judge": None,
             "retrieval_ms": 0.0,
             "gen_ms": 0.0,
-            # evidence_hits/evidence_total not computable without dia_id metadata from
-            # mem0's stored facts (mem0 doesn't preserve per-turn dia_ids). Set to 0.
-            # TODO: if mem0 exposes metadata round-trip, wire dia_id here.
-            "evidence_hits": 0,
-            "evidence_total": len(evidence),
-            "error": str(exc),
+            **EVIDENCE_UNAVAILABLE,
+            "error": f"{type(exc).__name__}: {exc}",
         }
 
     retrieval_ms = (time.time() - t0) * 1000.0
     context = "\n---\n".join(context_chunks) if context_chunks else ""
 
     t1 = time.time()
+    generation_error: str | None = None
     try:
         predicted = await _ollama_generate(
             client, ollama_url, model,
@@ -337,43 +342,35 @@ async def _process_qa_mem0(
         )
     except Exception as exc:
         predicted = ""
-        gen_ms = (time.time() - t1) * 1000.0
-        return {
-            "conversation_id": conv_id,
-            "question": question,
-            "reference": reference,
-            "predicted": predicted,
-            "category": category,
-            "f1": 0.0,
-            "bleu1": 0.0,
-            "judge": 0.0,
-            "retrieval_ms": round(retrieval_ms, 2),
-            "gen_ms": round(gen_ms, 2),
-            "evidence_hits": 0,
-            "evidence_total": len(evidence),
-            "error": str(exc),
-        }
+        generation_error = f"{type(exc).__name__}: {exc}"
 
     gen_ms = (time.time() - t1) * 1000.0
-    judge = await _judge(client, ollama_url, model, question, reference, predicted)
+    if generation_error is None:
+        judge = await _judge(client, ollama_url, model, question, reference, predicted)
+        f1_val = round(_f1(predicted, reference), 4)
+        bleu_val = round(_bleu1(predicted, reference), 4)
+        judge_val = round(judge, 4) if judge is not None else None
+    else:
+        f1_val = None
+        bleu_val = None
+        judge_val = None
 
-    return {
+    row: dict = {
         "conversation_id": conv_id,
         "question": question,
         "reference": reference,
         "predicted": predicted,
         "category": category,
-        "f1": round(_f1(predicted, reference), 4),
-        "bleu1": round(_bleu1(predicted, reference), 4),
-        "judge": round(judge, 4),
+        "f1": f1_val,
+        "bleu1": bleu_val,
+        "judge": judge_val,
         "retrieval_ms": round(retrieval_ms, 2),
         "gen_ms": round(gen_ms, 2),
-        # mem0 stored facts don't carry original dia_id metadata, so we cannot
-        # match against the LoCoMo gold evidence list. Set evidence_hits to 0.
-        # TODO: wire dia_id if mem0 adds metadata pass-through in a future release.
-        "evidence_hits": 0,
-        "evidence_total": len(evidence),
+        **EVIDENCE_UNAVAILABLE,
     }
+    if generation_error is not None:
+        row["error"] = generation_error
+    return row
 
 
 # ---------------------------------------------------------------------------