billion-token-one-task · Protocol-zero-0 · Jun 6, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/agents/paper_completeness.py b/agents/paper_completeness.py
diff --git a/agents/paper_orchestra_pipeline.py b/agents/paper_orchestra_pipeline.py
@@ -1840,7 +1840,7 @@ def _resolve_template_id(fmt: str) -> str:
                 "backend": "paper_orchestra",
             }
         page_budget_warning = None
-        latex_sanity_report = latex_sanity_check(main_tex)
+        latex_sanity_report = latex_sanity_check(main_tex, state=state)
         _write(bundle_dir / "main.tex", main_tex)
         _write(
             bundle_dir / "latex_sanity_report.json",

diff --git a/cla-signers.json b/cla-signers.json
@@ -6,6 +6,9 @@
     },
     {
       "github": "hitome0123"
+    },
+    {
+      "github": "Protocol-zero-0"
     }
   ]
 }
diff --git a/tests/fixtures/m1_p_eq_one.json b/tests/fixtures/m1_p_eq_one.json
@@ -0,0 +1,38 @@
+{
+  "state": {
+    "result_packet": {
+      "run_id": "run-m1-p-one",
+      "claim_id": "claim-m1-p-one",
+      "claim_text": "The method improves utility.",
+      "metric_name": "exact_match",
+      "verdict": "inconclusive",
+      "p_value": 1.0,
+      "effect_size": 0.0,
+      "benchmark_summary": {
+        "primary_metric": "exact_match",
+        "per_method": {
+          "Candidate": {"exact_match": 0.705, "n": 200},
+          "Baseline": {"exact_match": 0.655, "n": 200}
+        },
+        "seed_variance": {
+          "Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}}
+        },
+        "seeds": [0, 1, 2],
+        "statistical_tests": "p=1.0000"
+      }
+    }
+  },
+  "manifest": {
+    "datasets": [{"name": "GSM8K", "split": "test", "num_test": 200}],
+    "models": [{"name": "Qwen", "prompt_template": "qa", "decoding": "greedy"}],
+    "baselines": ["Baseline", "Candidate"],
+    "metrics": ["exact_match"],
+    "seeds": [0, 1, 2],
+    "hardware": "CPU test fixture",
+    "statistical_tests": "p=1.0000",
+    "latency": {"per_method": {"Candidate": 1.0, "Baseline": 0.8}},
+    "token_cost": {"per_method": {"Candidate": 100, "Baseline": 80}},
+    "ablation": [{"name": "no_guard"}],
+    "artifacts": {"benchmark_summary": "benchmark_summary.json"}
+  }
+}
diff --git a/tests/fixtures/m1_refuted.json b/tests/fixtures/m1_refuted.json
@@ -0,0 +1,38 @@
+{
+  "state": {
+    "result_packet": {
+      "run_id": "run-m1-refuted",
+      "claim_id": "claim-m1-refuted",
+      "claim_text": "The method improves utility.",
+      "metric_name": "exact_match",
+      "verdict": "refuted",
+      "p_value": 0.0123,
+      "effect_size": -0.01,
+      "benchmark_summary": {
+        "primary_metric": "exact_match",
+        "per_method": {
+          "Candidate": {"exact_match": 0.605, "n": 200},
+          "Baseline": {"exact_match": 0.655, "n": 200}
+        },
+        "seed_variance": {
+          "Candidate": {"mean": 0.605, "std": 0.012, "per_seed": {"0": 0.61, "1": 0.60, "2": 0.605}}
+        },
+        "seeds": [0, 1, 2],
+        "statistical_tests": "paired permutation p=0.0123"
+      }
+    }
+  },
+  "manifest": {
+    "datasets": [{"name": "GSM8K", "split": "test", "num_test": 200}],
+    "models": [{"name": "Qwen", "prompt_template": "qa", "decoding": "greedy"}],
+    "baselines": ["Baseline", "Candidate"],
+    "metrics": ["exact_match"],
+    "seeds": [0, 1, 2],
+    "hardware": "CPU test fixture",
+    "statistical_tests": "paired permutation p=0.0123",
+    "latency": {"per_method": {"Candidate": 1.0, "Baseline": 0.8}},
+    "token_cost": {"per_method": {"Candidate": 100, "Baseline": 80}},
+    "ablation": [{"name": "no_guard"}],
+    "artifacts": {"benchmark_summary": "benchmark_summary.json"}
+  }
+}
diff --git a/tests/fixtures/m1_significant.json b/tests/fixtures/m1_significant.json
@@ -0,0 +1,40 @@
+{
+  "state": {
+    "result_packet": {
+      "run_id": "run-m1-significant",
+      "claim_id": "claim-m1-significant",
+      "claim_text": "The method improves utility.",
+      "metric_name": "exact_match",
+      "verdict": "confirmed",
+      "p_value": 0.0123,
+      "effect_size": 0.045,
+      "benchmark_summary": {
+        "primary_metric": "exact_match",
+        "per_method": {
+          "Candidate": {"exact_match": 0.705, "n": 200},
+          "Baseline": {"exact_match": 0.655, "n": 200}
+        },
+        "seed_variance": {
+          "Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}}
+        },
+        "seeds": [0, 1, 2],
+        "repro_ci": [0.01, 0.08],
+        "kept_ci": [0.0, 0.07],
+        "statistical_tests": "paired permutation p=0.0123"
+      }
+    }
+  },
+  "manifest": {
+    "datasets": [{"name": "GSM8K", "split": "test", "num_test": 200}],
+    "models": [{"name": "Qwen", "prompt_template": "qa", "decoding": "greedy"}],
+    "baselines": ["Baseline", "Candidate"],
+    "metrics": ["exact_match"],
+    "seeds": [0, 1, 2],
+    "hardware": "CPU test fixture",
+    "statistical_tests": "paired permutation p=0.0123",
+    "latency": {"per_method": {"Candidate": 1.0, "Baseline": 0.8}},
+    "token_cost": {"per_method": {"Candidate": 100, "Baseline": 80}},
+    "ablation": [{"name": "no_guard"}],
+    "artifacts": {"benchmark_summary": "benchmark_summary.json"}
+  }
+}
diff --git a/tests/fixtures/m2a_clean.tex b/tests/fixtures/m2a_clean.tex
@@ -0,0 +1,7 @@
+\documentclass{article}
+\begin{document}
+Does the method help?
+\begin{verbatim}
+Table ??
+\end{verbatim}
+\end{document}
diff --git a/tests/fixtures/m2a_unresolved.tex b/tests/fixtures/m2a_unresolved.tex
@@ -0,0 +1,4 @@
+\documentclass{article}
+\begin{document}
+Table ?? reports the unresolved result.
+\end{document}
diff --git a/tests/fixtures/m2b_clean.tex b/tests/fixtures/m2b_clean.tex
@@ -0,0 +1,6 @@
+\documentclass{article}
+\begin{document}
+\[
+\frac{{a}}{{b}}
+\]
+\end{document}
diff --git a/tests/fixtures/m2b_scaffold.tex b/tests/fixtures/m2b_scaffold.tex
@@ -0,0 +1,6 @@
+\documentclass{article}
+\begin{document}
+\begin{figure}
+\caption{Generated plot command {{plot_cmd}}.}
+\end{figure}
+\end{document}
diff --git a/tests/fixtures/m2c_clean.tex b/tests/fixtures/m2c_clean.tex
@@ -0,0 +1,7 @@
+\documentclass{article}
+\title{CGGR Results on GSM8K}
+\begin{document}
+\begin{figure}
+\caption{CGGR trajectory on GSM8K with CPU evaluation.}
+\end{figure}
+\end{document}
diff --git a/tests/fixtures/m2c_contaminated.tex b/tests/fixtures/m2c_contaminated.tex
@@ -0,0 +1,7 @@
+\documentclass{article}
+\title{OtherMethod Results}
+\begin{document}
+\begin{figure}
+\caption{OtherMethod trajectory.}
+\end{figure}
+\end{document}
diff --git a/tests/fixtures/m2d_clean.tex b/tests/fixtures/m2d_clean.tex
@@ -0,0 +1,6 @@
+\documentclass{article}
+\begin{document}
+This section introduces the benchmark setup.
+The result section then describes the measured effects.
+Finally, the discussion names limitations without repeating a template.
+\end{document}
diff --git a/tests/fixtures/m2d_repeat.tex b/tests/fixtures/m2d_repeat.tex
@@ -0,0 +1,7 @@
+\documentclass{article}
+\begin{document}
+This repeated boilerplate sentence should not appear in every generated paragraph.
+This repeated boilerplate sentence should not appear in every generated paragraph.
+This repeated boilerplate sentence should not appear in every generated paragraph.
+This repeated boilerplate sentence should not appear in every generated paragraph.
+\end{document}
diff --git a/tests/fixtures/m4_clean.tex b/tests/fixtures/m4_clean.tex
@@ -0,0 +1,8 @@
+\documentclass{article}
+\begin{document}
+\begin{abstract}
+Candidate reaches 70.5\% exact match with p=0.0123 and effect 0.045.
+\end{abstract}
+\section{Conclusion}
+The reproduced mean is 70.5\% with interval endpoints 0.01 and 0.08.
+\end{document}
diff --git a/tests/fixtures/m4_clean_ledger.json b/tests/fixtures/m4_clean_ledger.json
@@ -0,0 +1,22 @@
+{
+  "schema_version": "1.0",
+  "run_id": "run-m4-clean",
+  "claim_id": "claim-m4-clean",
+  "metric": "exact_match",
+  "alpha": 0.05,
+  "verdict": "confirmed",
+  "p_value": 0.0123,
+  "effect_size": 0.045,
+  "confidence": 0.9877,
+  "repro_ci": [0.01, 0.08],
+  "kept_ci": [0.0, 0.07],
+  "per_method": {
+    "Candidate": {"exact_match": 0.705, "n": 200},
+    "Baseline": {"exact_match": 0.655, "n": 200}
+  },
+  "seed_variance": {
+    "Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}}
+  },
+  "seeds": [0, 1, 2],
+  "provenance": {"code_commit": "abc123", "dataset": "GSM8K", "command": "pytest"}
+}
diff --git a/tests/fixtures/m4_method_only.tex b/tests/fixtures/m4_method_only.tex
@@ -0,0 +1,10 @@
+\documentclass{article}
+\begin{document}
+\begin{abstract}
+This paper describes the experiment.
+\end{abstract}
+\section{Method}
+The method section mentions a calibration constant 0.777 that is not in the ledger.
+\section{Conclusion}
+The conclusion contains no numeric claim.
+\end{document}
diff --git a/tests/fixtures/m4_missing_p_ledger.json b/tests/fixtures/m4_missing_p_ledger.json
@@ -0,0 +1,21 @@
+{
+  "schema_version": "1.0",
+  "run_id": "run-m4-missing-p",
+  "claim_id": "claim-m4-missing-p",
+  "metric": "exact_match",
+  "alpha": 0.05,
+  "verdict": "confirmed",
+  "effect_size": 0.045,
+  "confidence": 0.99,
+  "repro_ci": [0.02, 0.08],
+  "kept_ci": [0.0, 0.07],
+  "per_method": {
+    "Candidate": {"exact_match": 0.705, "n": 200},
+    "Baseline": {"exact_match": 0.655, "n": 200}
+  },
+  "seed_variance": {
+    "Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}}
+  },
+  "seeds": [0, 1, 2],
+  "provenance": {"code_commit": "abc123", "dataset": "GSM8K", "command": "pytest"}
+}
diff --git a/tests/fixtures/m4_refuted_ledger.json b/tests/fixtures/m4_refuted_ledger.json
@@ -0,0 +1,22 @@
+{
+  "schema_version": "1.0",
+  "run_id": "run-m4-refuted",
+  "claim_id": "claim-m4-refuted",
+  "metric": "exact_match",
+  "alpha": 0.05,
+  "verdict": "refuted",
+  "p_value": 0.0123,
+  "effect_size": -0.01,
+  "confidence": 0.9877,
+  "repro_ci": [-0.08, -0.01],
+  "kept_ci": [-0.07, 0.0],
+  "per_method": {
+    "Candidate": {"exact_match": 0.605, "n": 200},
+    "Baseline": {"exact_match": 0.655, "n": 200}
+  },
+  "seed_variance": {
+    "Candidate": {"mean": 0.605, "std": 0.012, "per_seed": {"0": 0.61, "1": 0.60, "2": 0.605}}
+  },
+  "seeds": [0, 1, 2],
+  "provenance": {"code_commit": "abc123", "dataset": "GSM8K", "command": "pytest"}
+}
diff --git a/tests/fixtures/m4_refuted_positive_claim.tex b/tests/fixtures/m4_refuted_positive_claim.tex
@@ -0,0 +1,8 @@
+\documentclass{article}
+\begin{document}
+\begin{abstract}
+Candidate significantly improves exact match over the baseline.
+\end{abstract}
+\section{Conclusion}
+The method outperforms the baseline.
+\end{document}
diff --git a/tests/fixtures/m4_unsourced_number.tex b/tests/fixtures/m4_unsourced_number.tex
@@ -0,0 +1,8 @@
+\documentclass{article}
+\begin{document}
+\begin{abstract}
+Candidate significantly improves exact match with p=0.01.
+\end{abstract}
+\section{Conclusion}
+The conclusion repeats p=0.01.
+\end{document}
diff --git a/tests/fixtures/m4_whitelist_numbers.tex b/tests/fixtures/m4_whitelist_numbers.tex
@@ -0,0 +1,8 @@
+\documentclass{article}
+\begin{document}
+\begin{abstract}
+Table 2 summarizes the result over 5 seeds.
+\end{abstract}
+\section{Discussion}
+Figure 3 shows the setup.
+\end{document}
diff --git a/tests/test_latex_sanity_m2.py b/tests/test_latex_sanity_m2.py
@@ -0,0 +1,71 @@
+from pathlib import Path
+
+from agents.paper_completeness import latex_sanity_check
+
+
+FIXTURES = Path(__file__).parent / "fixtures"
+
+
+def _tex(name: str) -> str:
+    return (FIXTURES / name).read_text(encoding="utf-8")
+
+
+def _rules(report: dict) -> set[str]:
+    return {hit.get("rule") or hit.get("kind") for hit in report.get("hits", [])}
+
+
+def test_m2a_unresolved_reference_fails_with_line_location():
+    report = latex_sanity_check(_tex("m2a_unresolved.tex"))
+
+    assert report["ok"] is False
+    assert "unresolved_reference" in _rules(report)
+    assert any(hit.get("location", {}).get("line") for hit in report["hits"])
+
+
+def test_m2a_single_question_and_verbatim_question_marks_pass():
+    report = latex_sanity_check(_tex("m2a_clean.tex"))
+
+    assert report["ok"] is True
+
+
+def test_m2b_scaffold_placeholder_regex_fails_but_latex_braces_pass():
+    bad = latex_sanity_check(_tex("m2b_scaffold.tex"))
+    clean = latex_sanity_check(_tex("m2b_clean.tex"))
+
+    assert bad["ok"] is False
+    assert "template_placeholder" in _rules(bad)
+    assert clean["ok"] is True
+
+
+def test_m2b_existing_placeholder_forbidden_term_still_blocks():
+    report = latex_sanity_check("This caption contains placeholder text.")
+
+    assert report["ok"] is False
+    assert any(hit.get("value") == "placeholder" for hit in report["hits"])
+
+
+def test_m2c_cross_run_method_token_fails_by_set_membership():
+    report = latex_sanity_check(_tex("m2c_contaminated.tex"), state={"method_name": "CGGR"})
+
+    assert report["ok"] is False
+    assert "cross_run_identity" in _rules(report)
+    assert any(hit.get("value") == "OtherMethod" for hit in report["hits"])
+
+
+def test_m2c_method_token_and_abbreviation_whitelist_pass():
+    report = latex_sanity_check(_tex("m2c_clean.tex"), state={"method_name": "CGGR"})
+
+    assert report["ok"] is True
+
+
+def test_m2d_repeated_boilerplate_sentence_fails():
+    report = latex_sanity_check(_tex("m2d_repeat.tex"))
+
+    assert report["ok"] is False
+    assert "boilerplate_repetition" in _rules(report)
+
+
+def test_m2d_clean_sentences_pass():
+    report = latex_sanity_check(_tex("m2d_clean.tex"))
+
+    assert report["ok"] is True
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,6 +6,9 @@ @@
         },
         {
           "github": "hitome0123"
+        },
+        {
+          "github": "Protocol-zero-0"
         }
       ]
     }