Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
316 changes: 310 additions & 6 deletions agents/paper_completeness.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion agents/paper_orchestra_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -1840,7 +1840,7 @@ def _resolve_template_id(fmt: str) -> str:
"backend": "paper_orchestra",
}
page_budget_warning = None
latex_sanity_report = latex_sanity_check(main_tex)
latex_sanity_report = latex_sanity_check(main_tex, state=state)
_write(bundle_dir / "main.tex", main_tex)
_write(
bundle_dir / "latex_sanity_report.json",
Expand Down
3 changes: 3 additions & 0 deletions cla-signers.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
},
{
"github": "hitome0123"
},
{
"github": "Protocol-zero-0"
}
]
}
38 changes: 38 additions & 0 deletions tests/fixtures/m1_p_eq_one.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"state": {
"result_packet": {
"run_id": "run-m1-p-one",
"claim_id": "claim-m1-p-one",
"claim_text": "The method improves utility.",
"metric_name": "exact_match",
"verdict": "inconclusive",
"p_value": 1.0,
"effect_size": 0.0,
"benchmark_summary": {
"primary_metric": "exact_match",
"per_method": {
"Candidate": {"exact_match": 0.705, "n": 200},
"Baseline": {"exact_match": 0.655, "n": 200}
},
"seed_variance": {
"Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}}
},
"seeds": [0, 1, 2],
"statistical_tests": "p=1.0000"
}
}
},
"manifest": {
"datasets": [{"name": "GSM8K", "split": "test", "num_test": 200}],
"models": [{"name": "Qwen", "prompt_template": "qa", "decoding": "greedy"}],
"baselines": ["Baseline", "Candidate"],
"metrics": ["exact_match"],
"seeds": [0, 1, 2],
"hardware": "CPU test fixture",
"statistical_tests": "p=1.0000",
"latency": {"per_method": {"Candidate": 1.0, "Baseline": 0.8}},
"token_cost": {"per_method": {"Candidate": 100, "Baseline": 80}},
"ablation": [{"name": "no_guard"}],
"artifacts": {"benchmark_summary": "benchmark_summary.json"}
}
}
38 changes: 38 additions & 0 deletions tests/fixtures/m1_refuted.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"state": {
"result_packet": {
"run_id": "run-m1-refuted",
"claim_id": "claim-m1-refuted",
"claim_text": "The method improves utility.",
"metric_name": "exact_match",
"verdict": "refuted",
"p_value": 0.0123,
"effect_size": -0.01,
"benchmark_summary": {
"primary_metric": "exact_match",
"per_method": {
"Candidate": {"exact_match": 0.605, "n": 200},
"Baseline": {"exact_match": 0.655, "n": 200}
},
"seed_variance": {
"Candidate": {"mean": 0.605, "std": 0.012, "per_seed": {"0": 0.61, "1": 0.60, "2": 0.605}}
},
"seeds": [0, 1, 2],
"statistical_tests": "paired permutation p=0.0123"
}
}
},
"manifest": {
"datasets": [{"name": "GSM8K", "split": "test", "num_test": 200}],
"models": [{"name": "Qwen", "prompt_template": "qa", "decoding": "greedy"}],
"baselines": ["Baseline", "Candidate"],
"metrics": ["exact_match"],
"seeds": [0, 1, 2],
"hardware": "CPU test fixture",
"statistical_tests": "paired permutation p=0.0123",
"latency": {"per_method": {"Candidate": 1.0, "Baseline": 0.8}},
"token_cost": {"per_method": {"Candidate": 100, "Baseline": 80}},
"ablation": [{"name": "no_guard"}],
"artifacts": {"benchmark_summary": "benchmark_summary.json"}
}
}
40 changes: 40 additions & 0 deletions tests/fixtures/m1_significant.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{
"state": {
"result_packet": {
"run_id": "run-m1-significant",
"claim_id": "claim-m1-significant",
"claim_text": "The method improves utility.",
"metric_name": "exact_match",
"verdict": "confirmed",
"p_value": 0.0123,
"effect_size": 0.045,
"benchmark_summary": {
"primary_metric": "exact_match",
"per_method": {
"Candidate": {"exact_match": 0.705, "n": 200},
"Baseline": {"exact_match": 0.655, "n": 200}
},
"seed_variance": {
"Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}}
},
"seeds": [0, 1, 2],
"repro_ci": [0.01, 0.08],
"kept_ci": [0.0, 0.07],
"statistical_tests": "paired permutation p=0.0123"
}
}
},
"manifest": {
"datasets": [{"name": "GSM8K", "split": "test", "num_test": 200}],
"models": [{"name": "Qwen", "prompt_template": "qa", "decoding": "greedy"}],
"baselines": ["Baseline", "Candidate"],
"metrics": ["exact_match"],
"seeds": [0, 1, 2],
"hardware": "CPU test fixture",
"statistical_tests": "paired permutation p=0.0123",
"latency": {"per_method": {"Candidate": 1.0, "Baseline": 0.8}},
"token_cost": {"per_method": {"Candidate": 100, "Baseline": 80}},
"ablation": [{"name": "no_guard"}],
"artifacts": {"benchmark_summary": "benchmark_summary.json"}
}
}
7 changes: 7 additions & 0 deletions tests/fixtures/m2a_clean.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
\documentclass{article}
\begin{document}
Does the method help?
\begin{verbatim}
Table ??
\end{verbatim}
\end{document}
4 changes: 4 additions & 0 deletions tests/fixtures/m2a_unresolved.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
\documentclass{article}
\begin{document}
Table ?? reports the unresolved result.
\end{document}
6 changes: 6 additions & 0 deletions tests/fixtures/m2b_clean.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
\documentclass{article}
\begin{document}
\[
\frac{{a}}{{b}}
\]
\end{document}
6 changes: 6 additions & 0 deletions tests/fixtures/m2b_scaffold.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
\documentclass{article}
\begin{document}
\begin{figure}
\caption{Generated plot command {{plot_cmd}}.}
\end{figure}
\end{document}
7 changes: 7 additions & 0 deletions tests/fixtures/m2c_clean.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
\documentclass{article}
\title{CGGR Results on GSM8K}
\begin{document}
\begin{figure}
\caption{CGGR trajectory on GSM8K with CPU evaluation.}
\end{figure}
\end{document}
7 changes: 7 additions & 0 deletions tests/fixtures/m2c_contaminated.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
\documentclass{article}
\title{OtherMethod Results}
\begin{document}
\begin{figure}
\caption{OtherMethod trajectory.}
\end{figure}
\end{document}
6 changes: 6 additions & 0 deletions tests/fixtures/m2d_clean.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
\documentclass{article}
\begin{document}
This section introduces the benchmark setup.
The result section then describes the measured effects.
Finally, the discussion names limitations without repeating a template.
\end{document}
7 changes: 7 additions & 0 deletions tests/fixtures/m2d_repeat.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
\documentclass{article}
\begin{document}
This repeated boilerplate sentence should not appear in every generated paragraph.
This repeated boilerplate sentence should not appear in every generated paragraph.
This repeated boilerplate sentence should not appear in every generated paragraph.
This repeated boilerplate sentence should not appear in every generated paragraph.
\end{document}
8 changes: 8 additions & 0 deletions tests/fixtures/m4_clean.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
\documentclass{article}
\begin{document}
\begin{abstract}
Candidate reaches 70.5\% exact match with p=0.0123 and effect 0.045.
\end{abstract}
\section{Conclusion}
The reproduced mean is 70.5\% with interval endpoints 0.01 and 0.08.
\end{document}
22 changes: 22 additions & 0 deletions tests/fixtures/m4_clean_ledger.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"schema_version": "1.0",
"run_id": "run-m4-clean",
"claim_id": "claim-m4-clean",
"metric": "exact_match",
"alpha": 0.05,
"verdict": "confirmed",
"p_value": 0.0123,
"effect_size": 0.045,
"confidence": 0.9877,
"repro_ci": [0.01, 0.08],
"kept_ci": [0.0, 0.07],
"per_method": {
"Candidate": {"exact_match": 0.705, "n": 200},
"Baseline": {"exact_match": 0.655, "n": 200}
},
"seed_variance": {
"Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}}
},
"seeds": [0, 1, 2],
"provenance": {"code_commit": "abc123", "dataset": "GSM8K", "command": "pytest"}
}
10 changes: 10 additions & 0 deletions tests/fixtures/m4_method_only.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
\documentclass{article}
\begin{document}
\begin{abstract}
This paper describes the experiment.
\end{abstract}
\section{Method}
The method section mentions a calibration constant 0.777 that is not in the ledger.
\section{Conclusion}
The conclusion contains no numeric claim.
\end{document}
21 changes: 21 additions & 0 deletions tests/fixtures/m4_missing_p_ledger.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"schema_version": "1.0",
"run_id": "run-m4-missing-p",
"claim_id": "claim-m4-missing-p",
"metric": "exact_match",
"alpha": 0.05,
"verdict": "confirmed",
"effect_size": 0.045,
"confidence": 0.99,
"repro_ci": [0.02, 0.08],
"kept_ci": [0.0, 0.07],
"per_method": {
"Candidate": {"exact_match": 0.705, "n": 200},
"Baseline": {"exact_match": 0.655, "n": 200}
},
"seed_variance": {
"Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}}
},
"seeds": [0, 1, 2],
"provenance": {"code_commit": "abc123", "dataset": "GSM8K", "command": "pytest"}
}
22 changes: 22 additions & 0 deletions tests/fixtures/m4_refuted_ledger.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"schema_version": "1.0",
"run_id": "run-m4-refuted",
"claim_id": "claim-m4-refuted",
"metric": "exact_match",
"alpha": 0.05,
"verdict": "refuted",
"p_value": 0.0123,
"effect_size": -0.01,
"confidence": 0.9877,
"repro_ci": [-0.08, -0.01],
"kept_ci": [-0.07, 0.0],
"per_method": {
"Candidate": {"exact_match": 0.605, "n": 200},
"Baseline": {"exact_match": 0.655, "n": 200}
},
"seed_variance": {
"Candidate": {"mean": 0.605, "std": 0.012, "per_seed": {"0": 0.61, "1": 0.60, "2": 0.605}}
},
"seeds": [0, 1, 2],
"provenance": {"code_commit": "abc123", "dataset": "GSM8K", "command": "pytest"}
}
8 changes: 8 additions & 0 deletions tests/fixtures/m4_refuted_positive_claim.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
\documentclass{article}
\begin{document}
\begin{abstract}
Candidate significantly improves exact match over the baseline.
\end{abstract}
\section{Conclusion}
The method outperforms the baseline.
\end{document}
8 changes: 8 additions & 0 deletions tests/fixtures/m4_unsourced_number.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
\documentclass{article}
\begin{document}
\begin{abstract}
Candidate significantly improves exact match with p=0.01.
\end{abstract}
\section{Conclusion}
The conclusion repeats p=0.01.
\end{document}
8 changes: 8 additions & 0 deletions tests/fixtures/m4_whitelist_numbers.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
\documentclass{article}
\begin{document}
\begin{abstract}
Table 2 summarizes the result over 5 seeds.
\end{abstract}
\section{Discussion}
Figure 3 shows the setup.
\end{document}
71 changes: 71 additions & 0 deletions tests/test_latex_sanity_m2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from pathlib import Path

from agents.paper_completeness import latex_sanity_check


# Directory of fixture files used by these tests: the "fixtures" folder that
# sits next to this test module.
FIXTURES = Path(__file__).parent / "fixtures"


def _tex(name: str) -> str:
    """Return the UTF-8 decoded contents of the fixture file called *name*.

    The file is looked up directly inside the ``FIXTURES`` directory.
    """
    fixture_path = FIXTURES / name
    return fixture_path.read_text(encoding="utf-8")


def _rules(report: dict) -> set[str]:
return {hit.get("rule") or hit.get("kind") for hit in report.get("hits", [])}


def test_m2a_unresolved_reference_fails_with_line_location():
    """The ``m2a_unresolved.tex`` fixture must fail with a located hit.

    Checks that the report is not ok, that an ``unresolved_reference`` rule is
    among the hits, and that at least one hit carries a truthy
    ``location["line"]``.
    """
    outcome = latex_sanity_check(_tex("m2a_unresolved.tex"))

    assert outcome["ok"] is False
    assert "unresolved_reference" in _rules(outcome)
    located_hits = [
        entry for entry in outcome["hits"] if entry.get("location", {}).get("line")
    ]
    assert located_hits


def test_m2a_single_question_and_verbatim_question_marks_pass():
    """The ``m2a_clean.tex`` fixture must produce an ok report."""
    source = _tex("m2a_clean.tex")
    outcome = latex_sanity_check(source)

    assert outcome["ok"] is True


def test_m2b_scaffold_placeholder_regex_fails_but_latex_braces_pass():
    """``m2b_scaffold.tex`` must fail on a template placeholder; ``m2b_clean.tex`` must pass.

    Both fixtures are checked before any assertion runs, scaffold first.
    """
    scaffold_report, braces_report = (
        latex_sanity_check(_tex(fixture))
        for fixture in ("m2b_scaffold.tex", "m2b_clean.tex")
    )

    assert scaffold_report["ok"] is False
    assert "template_placeholder" in _rules(scaffold_report)
    assert braces_report["ok"] is True


def test_m2b_existing_placeholder_forbidden_term_still_blocks():
    """Inline text containing the word "placeholder" must still be rejected.

    The report must be not ok and at least one hit must have the value
    ``"placeholder"``.
    """
    outcome = latex_sanity_check("This caption contains placeholder text.")

    assert outcome["ok"] is False
    hit_values = [entry.get("value") for entry in outcome["hits"]]
    assert "placeholder" in hit_values


def test_m2c_cross_run_method_token_fails_by_set_membership():
    """``m2c_contaminated.tex`` checked against method ``CGGR`` must fail.

    The report must be not ok, include a ``cross_run_identity`` rule, and have
    at least one hit whose value is ``"OtherMethod"``.
    """
    run_state = {"method_name": "CGGR"}
    outcome = latex_sanity_check(_tex("m2c_contaminated.tex"), state=run_state)

    assert outcome["ok"] is False
    assert "cross_run_identity" in _rules(outcome)
    hit_values = [entry.get("value") for entry in outcome["hits"]]
    assert "OtherMethod" in hit_values


def test_m2c_method_token_and_abbreviation_whitelist_pass():
    """``m2c_clean.tex`` checked against method ``CGGR`` must produce an ok report."""
    run_state = {"method_name": "CGGR"}
    outcome = latex_sanity_check(_tex("m2c_clean.tex"), state=run_state)

    assert outcome["ok"] is True


def test_m2d_repeated_boilerplate_sentence_fails():
    """``m2d_repeat.tex`` must fail with a ``boilerplate_repetition`` rule hit."""
    source = _tex("m2d_repeat.tex")
    outcome = latex_sanity_check(source)

    assert outcome["ok"] is False
    triggered = _rules(outcome)
    assert "boilerplate_repetition" in triggered


def test_m2d_clean_sentences_pass():
    """``m2d_clean.tex`` must produce an ok report."""
    source = _tex("m2d_clean.tex")
    outcome = latex_sanity_check(source)

    assert outcome["ok"] is True
Loading