Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 73 additions & 14 deletions benchmarks/locomo_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,15 +136,23 @@ async def _ollama_generate(client: httpx.AsyncClient, url: str, model: str,


async def _judge(client: httpx.AsyncClient, url: str, model: str,
question: str, reference: str, predicted: str) -> float:
question: str, reference: str, predicted: str) -> float | None:
"""Return 1.0/0.0 on a YES/NO judgment, or None if the judge call itself
failed (timeout, network, HTTP error).

None is deliberately distinct from 0.0 — a transport failure is not a
wrong-answer verdict. Callers must treat None as "metric unavailable"
and exclude it from averages so that infra flakiness doesn't bias the
Judge score downward.
"""
try:
reply = await _ollama_generate(
client, url, model,
JUDGE_PROMPT.format(question=question, reference=reference, predicted=predicted),
temperature=0.0,
)
except Exception:
return 0.0
return None
head = reply.strip().upper().split()
return 1.0 if head and head[0].startswith("YES") else 0.0

Expand Down Expand Up @@ -233,45 +241,90 @@ async def _process_qa(qa: dict, conv_id: str, vmem: VectorMemory, client: httpx.

context = _build_context(hits)
t1 = time.time()
generation_error: str | None = None
try:
predicted = await _ollama_generate(
client, ollama_url, model,
ANSWER_PROMPT.format(context=context, question=question),
)
except Exception as exc:
predicted = f"[generation_error: {exc}]"
predicted = ""
generation_error = f"{type(exc).__name__}: {exc}"
gen_ms = (time.time() - t1) * 1000.0

judge = await _judge(client, ollama_url, model, question, reference, predicted)
# If generation failed, do NOT compute scores on a synthetic error string —
# that folds infra failures into the benchmark averages. Record None so
# _summary can exclude the row from F1/BLEU/Judge means. Retrieval-side
# metrics (evidence_hits, retrieval_ms) are still valid, keep them.
if generation_error is None:
judge = await _judge(client, ollama_url, model, question, reference, predicted)
f1_val = round(_f1(predicted, reference), 4)
bleu_val = round(_bleu1(predicted, reference), 4)
judge_val = round(judge, 4) if judge is not None else None
else:
f1_val = None
bleu_val = None
judge_val = None

return {
row: dict = {
"conversation_id": conv_id,
"question": question,
"reference": reference,
"predicted": predicted,
"category": category,
"f1": round(_f1(predicted, reference), 4),
"bleu1": round(_bleu1(predicted, reference), 4),
"judge": round(judge, 4),
"f1": f1_val,
"bleu1": bleu_val,
"judge": judge_val,
"retrieval_ms": round(retrieval_ms, 2),
"gen_ms": round(gen_ms, 2),
"evidence_hits": _evidence_hits(hits, evidence),
"evidence_total": len(evidence),
}
if generation_error is not None:
row["error"] = generation_error
return row


def _summary(rows: list[dict]) -> dict:
"""Aggregate per-category stats, skipping None-valued metrics.

    Judge, F1, BLEU-1 can be None on a row for two reasons:
    (1) generation failed (the whole row has an `error` field), which nulls
    all three metrics, or (2) the judge LLM call itself failed, which nulls
    only the Judge value.
Similarly, `evidence_hits`/`evidence_total` can be None when the memory
backend (e.g. mem0) doesn't round-trip per-turn IDs. In all these cases
the row is excluded from the corresponding mean so infra/adapter
artefacts don't bias the headline numbers.

Also reports `scored_count` (rows contributing to each metric) alongside
`count` (total rows) so callers can see the denominator honestly.
"""
if not rows:
return {"count": 0, "f1": 0.0, "bleu1": 0.0, "judge": 0.0, "retrieval_recall": 0.0}
return {"count": 0, "f1": 0.0, "bleu1": 0.0, "judge": 0.0,
"retrieval_recall": 0.0, "judge_scored": 0, "recall_scored": 0}
n = len(rows)
hit_rows = [r for r in rows if r.get("evidence_total", 0) > 0]
recall = (sum(1 for r in hit_rows if r["evidence_hits"] > 0) / len(hit_rows)) if hit_rows else 0.0

f1_vals = [r["f1"] for r in rows if r.get("f1") is not None]
bleu_vals = [r["bleu1"] for r in rows if r.get("bleu1") is not None]
judge_vals = [r["judge"] for r in rows if r.get("judge") is not None]

recall_rows = [
r for r in rows
if r.get("evidence_total") is not None and r.get("evidence_hits") is not None
and r["evidence_total"] > 0
]
recall = (
sum(1 for r in recall_rows if r["evidence_hits"] > 0) / len(recall_rows)
) if recall_rows else 0.0

return {
"count": n,
"f1": round(sum(r["f1"] for r in rows) / n, 4),
"bleu1": round(sum(r["bleu1"] for r in rows) / n, 4),
"judge": round(sum(r["judge"] for r in rows) / n, 4),
"f1": round(sum(f1_vals) / len(f1_vals), 4) if f1_vals else 0.0,
"bleu1": round(sum(bleu_vals) / len(bleu_vals), 4) if bleu_vals else 0.0,
"judge": round(sum(judge_vals) / len(judge_vals), 4) if judge_vals else 0.0,
"retrieval_recall": round(recall, 4),
"judge_scored": len(judge_vals),
"recall_scored": len(recall_rows),
}


Expand Down Expand Up @@ -339,6 +392,12 @@ async def _guarded(qa: dict, conv_id: str, vmem: VectorMemory,
print(f" qa failed: {e!r}", flush=True)
return None
if outcome is not None:
# Rows with an `error` field had a generation failure — count
# them as failed so the JSON's failed_qa reflects reality, not
# zero just because _process_qa swallowed the exception.
if outcome.get("error"):
async with progress_lock:
failed_qa += 1
async with progress_lock:
total_seen += 1
if total_seen % 25 == 0:
Expand Down
67 changes: 32 additions & 35 deletions benchmarks/mem0_locomo_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,14 @@ async def _process_qa_mem0(
category = int(qa.get("category", 0))
evidence = qa.get("evidence", []) or []

# mem0 stored facts don't carry original LoCoMo dia_id metadata, so we
# cannot match against the gold evidence list. Report both hits and total
# as None — the metric is _unavailable_, not zero. _summary in the taosmd
# runner (and any downstream scorecard builder) skips None-valued rows
# from retrieval_recall so this artefact doesn't publish a fake 0.0.
# TODO: wire dia_id pass-through if mem0 adds metadata preservation.
EVIDENCE_UNAVAILABLE = {"evidence_hits": None, "evidence_total": None}
Comment on lines +303 to +309
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

These None sentinels currently break _summary() in this file.

Every returned row now carries evidence_total=None, but the local _summary() still does r.get("evidence_total", 0) > 0, so aggregation will raise TypeError on the first mem0 result. The new f1/bleu1/judge=None paths also conflict with its raw sum(...) / n averages. Please port the benchmarks/locomo_runner.py::_summary filtering/counting logic here before merging.

Also applies to: 318-330, 348-369

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@benchmarks/mem0_locomo_runner.py` around lines 303 - 309, The _summary
aggregation in this file is failing because rows now contain None sentinels
(EVIDENCE_UNAVAILABLE) for evidence fields and None for f1/bleu1/judge, so
change the _summary implementation to mirror the safe filtering/counting used in
benchmarks/locomo_runner.py: skip rows where evidence_total is None or where
metric values are None before summing/averaging, treat None as "unavailable" not
zero, and compute counts using explicit filters (e.g., only include rows with
r.get("evidence_total") and r.get("f1") is not None) so sums/averages use
numeric values only; update all places referencing evidence_total/evidence_hits
and f1/bleu1/judge in _summary to use this guarded logic and preserve
EVIDENCE_UNAVAILABLE behavior.


t0 = time.time()
try:
# mem0.search is synchronous; offload to thread pool to avoid blocking.
Expand All @@ -313,67 +321,56 @@ async def _process_qa_mem0(
"reference": reference,
"predicted": "",
"category": category,
"f1": 0.0,
"bleu1": 0.0,
"judge": 0.0,
"f1": None,
"bleu1": None,
"judge": None,
"retrieval_ms": 0.0,
"gen_ms": 0.0,
# evidence_hits/evidence_total not computable without dia_id metadata from
# mem0's stored facts (mem0 doesn't preserve per-turn dia_ids). Set to 0.
# TODO: if mem0 exposes metadata round-trip, wire dia_id here.
"evidence_hits": 0,
"evidence_total": len(evidence),
"error": str(exc),
**EVIDENCE_UNAVAILABLE,
"error": f"{type(exc).__name__}: {exc}",
}
retrieval_ms = (time.time() - t0) * 1000.0

context = "\n---\n".join(context_chunks) if context_chunks else ""

t1 = time.time()
generation_error: str | None = None
try:
predicted = await _ollama_generate(
client, ollama_url, model,
ANSWER_PROMPT.format(context=context, question=question),
)
except Exception as exc:
predicted = ""
gen_ms = (time.time() - t1) * 1000.0
return {
"conversation_id": conv_id,
"question": question,
"reference": reference,
"predicted": predicted,
"category": category,
"f1": 0.0,
"bleu1": 0.0,
"judge": 0.0,
"retrieval_ms": round(retrieval_ms, 2),
"gen_ms": round(gen_ms, 2),
"evidence_hits": 0,
"evidence_total": len(evidence),
"error": str(exc),
}
generation_error = f"{type(exc).__name__}: {exc}"
gen_ms = (time.time() - t1) * 1000.0

judge = await _judge(client, ollama_url, model, question, reference, predicted)
if generation_error is None:
judge = await _judge(client, ollama_url, model, question, reference, predicted)
f1_val = round(_f1(predicted, reference), 4)
bleu_val = round(_bleu1(predicted, reference), 4)
judge_val = round(judge, 4) if judge is not None else None
Comment on lines +348 to +352
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Judge transport failures are still treated as wrong answers here.

This branch expects _judge() to return None, but the local implementation still returns 0.0 on exceptions. As written, mem0 judge outages will continue to depress the Judge average instead of being excluded.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@benchmarks/mem0_locomo_runner.py` around lines 348 - 352, The code treats
judge outages as valid zeros because _judge currently returns 0.0 on exceptions;
update the implementation so transport/remote failures return None (not 0.0),
and keep this call site logic (the variable judge and the rounding into
judge_val) intact; specifically modify the _judge(...) function to catch
transport/HTTP errors and return None on those failure paths (rather than 0.0)
so judge_val becomes None and outages are excluded from averages.

else:
f1_val = None
bleu_val = None
judge_val = None

return {
row: dict = {
"conversation_id": conv_id,
"question": question,
"reference": reference,
"predicted": predicted,
"category": category,
"f1": round(_f1(predicted, reference), 4),
"bleu1": round(_bleu1(predicted, reference), 4),
"judge": round(judge, 4),
"f1": f1_val,
"bleu1": bleu_val,
"judge": judge_val,
"retrieval_ms": round(retrieval_ms, 2),
"gen_ms": round(gen_ms, 2),
# mem0 stored facts don't carry original dia_id metadata, so we cannot
# match against the LoCoMo gold evidence list. Set evidence_hits to 0.
# TODO: wire dia_id if mem0 adds metadata pass-through in a future release.
"evidence_hits": 0,
"evidence_total": len(evidence),
**EVIDENCE_UNAVAILABLE,
}
if generation_error is not None:
row["error"] = generation_error
return row


# ---------------------------------------------------------------------------
Expand Down