From 60fcc096683d886ce57f41588900b64bf802fc55 Mon Sep 17 00:00:00 2001
From: Protocol Zero <257158451+Protocol-zero-0@users.noreply.github.com>
Date: Thu, 4 Jun 2026 20:40:14 +0000
Subject: [PATCH 1/7] Stabilize vnext manuscript regression fixture

---
 tests/test_vnext_manuscript.py | 95 +++++++++++++++++++++++++++++++---
 1 file changed, 89 insertions(+), 6 deletions(-)
diff --git a/tests/test_vnext_manuscript.py b/tests/test_vnext_manuscript.py
index 88c429f..53b9eb8 100644
--- a/tests/test_vnext_manuscript.py
+++ b/tests/test_vnext_manuscript.py
@@ -225,7 +225,18 @@ def _write_complete_benchmark_packet(self):
         (results_dir / "ablation_table.json").write_text(json.dumps(benchmark_summary["ablation_table"]), encoding="utf-8")
         (results_dir / "latency_tokens_table.json").write_text(json.dumps(benchmark_summary["latency_tokens_table"]), encoding="utf-8")
 
-    def _stub_orchestra(self, state, literature_block, paper_ids, iterations, *, figures_dir, baseline, metric_name):
+    def _stub_orchestra(
+        self,
+        state,
+        literature_block,
+        paper_ids,
+        iterations,
+        *,
+        figures_dir,
+        baseline,
+        metric_name,
+        template_id=None,
+    ):
         figures_dir.mkdir(parents=True, exist_ok=True)
         (figures_dir / "fig_metric_trajectory.svg").write_text(
             '<svg xmlns="http://www.w3.org/2000/svg"><text>metric</text></svg>',
@@ -256,10 +267,39 @@ def _stub_orchestra(self, state, literature_block, paper_ids, iterations, *, fig
                 "abstract": "Abstract text.",
                 "introduction": "Introduction text with \\cite{cite_a}.",
                 "method": "Method text.",
-                "experiments": "Experiments text.",
+                "experiments": (
+                    "Experiments text.\\n"
+                    "\\begin{figure}[t]\\n"
+                    "\\centering\\n"
+                    "\\includegraphics[width=0.9\\linewidth]{figures/fig_metric_trajectory.svg}\\n"
+                    "\\caption{Metric trajectory.}\\n"
+                    "\\label{fig:metric_trajectory}\\n"
+                    "\\end{figure}"
+                ),
                 "discussion": "Discussion text.",
             },
-            "refinement_full_text": "",
+            "refinement_full_text": (
+                "\\documentclass{article}\n"
+                "\\usepackage{graphicx}\n"
+                "\\title{Auto Manuscript Insight}\n"
+                "\\begin{document}\n"
+                "\\maketitle\n"
+                "\\begin{abstract}Abstract text.\\end{abstract}\n"
+                "\\section{Introduction}Intro with \\cite{cite_a}.\n"
+                "\\section{Related Work}Related work with \\cite{cite_a}.\n"
+                "\\section{Method}Method text.\n"
+                "\\section{Experiments}Experiments text.\n"
+                "\\begin{figure}[t]\n"
+                "\\centering\n"
+                "\\includegraphics[width=0.9\\linewidth]{figures/fig_metric_trajectory.svg}\n"
+                "\\caption{Metric trajectory.}\n"
+                "\\label{fig:metric_trajectory}\n"
+                "\\end{figure}\n"
+                "\\section{Discussion}Discussion text.\n"
+                "\\bibliographystyle{plain}\n"
+                "\\bibliography{references}\n"
+                "\\end{document}"
+            ),
             "agentreview_worklog": [],
             "bibtex": "@misc{cite_a,\n  title = {Verified Paper},\n  author = {Author One},\n  year = {2024}\n}\n",
             "bib_keys": ["cite_a"],
@@ -286,7 +326,23 @@ def _stub_orchestra(self, state, literature_block, paper_ids, iterations, *, fig
     @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline")
     def test_generate_submission_bundle_creates_verified_bundle_files_and_db_rows(self, run_full):
         run_full.side_effect = self._stub_orchestra
-        result = generate_submission_bundle(1, bundle_formats=["conference"])
+        with mock.patch(
+            "agents.manuscript_submission_enrichment.apply_venue_gates_with_retry",
+            side_effect=lambda main_tex, **_: (
+                main_tex,
+                {"pass": True, "final": {"pass": True, "template_id": "iclr2026"}},
+            ),
+        ), mock.patch(
+            "agents.manuscript_page_budget.apply_exact_page_budget",
+            side_effect=lambda main_tex, *_args, **_kwargs: (
+                main_tex,
+                {"pass": True, "main_body_pages": 9, "total_pdf_pages": 10},
+            ),
+        ), mock.patch(
+            "agents.manuscript_page_budget.page_budget_blockers",
+            return_value=[],
+        ):
+            result = generate_submission_bundle(1, bundle_formats=["conference"])
         self.assertIn("manuscript_run_id", result)
         self.assertEqual(result["backend"], "paper_orchestra")
         bundle = database.fetchone("SELECT * FROM submission_bundles WHERE manuscript_run_id=?", (result["manuscript_run_id"],))
@@ -371,7 +427,17 @@ def test_generate_submission_bundle_blocks_benchmark_plan_without_artifact_manif
 
     @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline")
     def test_generate_submission_bundle_blocks_placeholder_figure_assets(self, run_full):
-        def _stub_with_placeholder(state, literature_block, paper_ids, iterations, *, figures_dir, baseline, metric_name):
+        def _stub_with_placeholder(
+            state,
+            literature_block,
+            paper_ids,
+            iterations,
+            *,
+            figures_dir,
+            baseline,
+            metric_name,
+            template_id=None,
+        ):
             out = self._stub_orchestra(
                 state,
                 literature_block,
@@ -380,6 +446,7 @@ def _stub_with_placeholder(state, literature_block, paper_ids, iterations, *, fi
                 figures_dir=figures_dir,
                 baseline=baseline,
                 metric_name=metric_name,
+                template_id=template_id,
             )
             (figures_dir / "fig_metric_trajectory.svg").write_text(
                 '<svg xmlns="http://www.w3.org/2000/svg"><text>Diagram placeholder: failed API figure.</text></svg>',
@@ -389,7 +456,23 @@ def _stub_with_placeholder(state, literature_block, paper_ids, iterations, *, fi
 
         run_full.side_effect = _stub_with_placeholder
 
-        result = generate_submission_bundle(1, bundle_formats=["conference"])
+        with mock.patch(
+            "agents.manuscript_submission_enrichment.apply_venue_gates_with_retry",
+            side_effect=lambda main_tex, **_: (
+                main_tex,
+                {"pass": True, "final": {"pass": True, "template_id": "iclr2026"}},
+            ),
+        ), mock.patch(
+            "agents.manuscript_page_budget.apply_exact_page_budget",
+            side_effect=lambda main_tex, *_args, **_kwargs: (
+                main_tex,
+                {"pass": True, "main_body_pages": 9, "total_pdf_pages": 10},
+            ),
+        ), mock.patch(
+            "agents.manuscript_page_budget.page_budget_blockers",
+            return_value=[],
+        ):
+            result = generate_submission_bundle(1, bundle_formats=["conference"])
 
         self.assertIn("error", result)
         self.assertIn("placeholder", " ".join(result.get("submission_blockers") or []).lower())

From 062cf5523d84e4461e0c3046733e5b4e3eefb378 Mon Sep 17 00:00:00 2001
From: Protocol Zero <257158451+Protocol-zero-0@users.noreply.github.com>
Date: Thu, 4 Jun 2026 20:44:34 +0000
Subject: [PATCH 2/7] Fix significance evidence ledger gate (#24)

---
 agents/paper_completeness.py        |  71 ++++++++++++++++--
 tests/fixtures/m1_p_eq_one.json     |  38 ++++++++++
 tests/fixtures/m1_refuted.json      |  38 ++++++++++
 tests/fixtures/m1_significant.json  |  40 ++++++++++
 tests/test_paper_completeness_m1.py | 110 ++++++++++++++++++++++++++++
 5 files changed, 292 insertions(+), 5 deletions(-)
 create mode 100644 tests/fixtures/m1_p_eq_one.json
 create mode 100644 tests/fixtures/m1_refuted.json
 create mode 100644 tests/fixtures/m1_significant.json
 create mode 100644 tests/test_paper_completeness_m1.py

diff --git a/agents/paper_completeness.py b/agents/paper_completeness.py
index f3b8d0e..42061d7 100644
--- a/agents/paper_completeness.py
+++ b/agents/paper_completeness.py
@@ -145,6 +145,62 @@ def _numeric(value: Any) -> float | None:
         return None
 
 
+def _significance_alpha() -> float:
+    """Return the alpha from DEEPGRAPH_SIGNIFICANCE_ALPHA, or 0.05 when it is unset or outside (0, 1)."""
+    alpha = _numeric(os.environ.get("DEEPGRAPH_SIGNIFICANCE_ALPHA"))
+    # An alpha of 1 or more would let nearly every p-value count as significant; the chained
+    # comparison is also False for NaN, so an unusable override falls back to the default.
+    return alpha if alpha is not None and 0 < alpha < 1 else 0.05
+
+
+def build_evidence_ledger(packet: dict, benchmark_summary: dict, *, alpha: float, provenance: dict | None = None) -> dict:
+    """Assemble the evidence ledger for one result packet and its benchmark summary.
+
+    Both inputs are normalised with ``_as_dict``. ``confidence`` is ``1 - p_value`` rounded to
+    ten decimals, or ``None`` when no numeric p-value is available. ``alpha`` is stored as given.
+    """
+    source = _as_dict(packet)
+    summary = _as_dict(benchmark_summary)
+    p_value = _numeric(source.get("p_value"))
+    if p_value is None:
+        confidence = None
+    else:
+        confidence = round(1.0 - p_value, 10)
+    effect = _first_present(source.get("effect_size"), source.get("effect_pct"))
+    metric = _first_present(source.get("metric_name"), summary.get("primary_metric"), summary.get("metric_name"))
+    # Prefer seeds listed in the summary; otherwise defer to _seed_list(summary).
+    seeds = _as_list(_first_present(summary.get("seeds"), summary.get("seed_values"))) or _seed_list(summary)
+    return {
+        "schema_version": "1.0",
+        "run_id": _text(source.get("run_id")),
+        "claim_id": _text(source.get("claim_id")),
+        "metric": _text(metric),
+        "alpha": alpha,
+        "verdict": _text(source.get("verdict")),
+        "p_value": p_value,
+        "effect_size": _numeric(effect),
+        "confidence": confidence,
+        "repro_ci": _as_list(_first_present(source.get("repro_ci"), summary.get("repro_ci"))),
+        "kept_ci": _as_list(_first_present(source.get("kept_ci"), summary.get("kept_ci"))),
+        "per_method": _as_dict(summary.get("per_method")),
+        "seed_variance": _as_dict(summary.get("seed_variance")),
+        "seeds": seeds,
+        "provenance": _as_dict(provenance),
+    }
+
+
+def _ledger_supports_significance(ledger: dict[str, Any]) -> bool:
+    """Return True only for a confirmed/reproduced verdict whose p-value lies in [0, alpha)."""
+    p_value = _numeric(ledger.get("p_value"))
+    alpha = _numeric(ledger.get("alpha"))
+    if p_value is None or alpha is None:
+        return False
+    # A negative p-value is malformed input and must not pass the gate as "significant".
+    if not 0.0 <= p_value < alpha:
+        return False
+    return _lower(ledger.get("verdict")) in {"confirmed", "reproduced"}
+
+
 def _infer_model_size(name: str) -> str:
     match = re.search(r"(\d+(?:\.\d+)?\s*[bB])", name or "")
     return match.group(1).replace(" ", "") if match else ""
@@ -586,9 +642,9 @@ def build_claim_evidence_matrix(state: dict[str, Any], manifest: dict[str, Any])
     packet, summary, _artifact_manifest, _contract = _packet_parts(state)
     per_method = _as_dict(summary.get("per_method"))
     seeds = _as_list(manifest.get("seeds"))
-    p_text = _lower(manifest.get("statistical_tests"))
     has_multi_seed = len(seeds) >= 3
-    has_significance = "p=" in p_text or "bootstrap" in p_text or "permutation" in p_text
+    ledger = build_evidence_ledger(packet, summary, alpha=_significance_alpha())
+    has_significance = _ledger_supports_significance(ledger)
     rows = [
         {
             "claim": "Improves utility",
@@ -634,8 +690,8 @@ def build_claim_evidence_matrix(state: dict[str, Any], manifest: dict[str, Any])
                 "claim": _text(packet.get("claim_text"))[:280],
                 "required_evidence": "mapped quantitative artifact in result_packet or benchmark_summary",
                 "current_evidence": "present" if len(per_method) >= 2 else "missing benchmark comparison",
-                "can_appear_in_abstract": bool(len(per_method) >= 2 and has_multi_seed),
-                "allowed_sections": ["Abstract", "Introduction", "Conclusion"] if len(per_method) >= 2 and has_multi_seed else ["Motivation", "Limitations"],
+                "can_appear_in_abstract": bool(len(per_method) >= 2 and has_multi_seed and has_significance),
+                "allowed_sections": ["Abstract", "Introduction", "Conclusion"] if len(per_method) >= 2 and has_multi_seed and has_significance else ["Motivation", "Limitations"],
             },
         )
     return rows
@@ -742,6 +798,7 @@ def build_reviewer_report(
     blockers: list[str],
 ) -> dict[str, Any]:
     packet, summary, _artifact_manifest, _contract = _packet_parts(state)
+    ledger = build_evidence_ledger(packet, summary, alpha=_significance_alpha())
     answers: list[dict[str, Any]] = []
 
     def add(question: str, yes: bool, evidence: str) -> None:
@@ -750,7 +807,11 @@ def add(question: str, yes: bool, evidence: str) -> None:
     add("What is the exact dataset?", bool(_as_list(manifest.get("datasets")) and not any("dataset" in b.lower() for b in blockers[:3])), str(manifest.get("datasets") or "missing"))
     add("What is the exact model?", bool(_as_list(manifest.get("models")) and not any("model" in b.lower() for b in blockers)), str(manifest.get("models") or "missing"))
     add("What is the exact baseline?", len(_as_list(manifest.get("baselines"))) >= 2, ", ".join(_as_list(manifest.get("baselines"))))
-    add("Is the improvement statistically significant?", bool(_text(manifest.get("statistical_tests")) and not any("statistical" in b.lower() for b in blockers)), _text(manifest.get("statistical_tests")))
+    add(
+        "Is the improvement statistically significant?",
+        bool(_ledger_supports_significance(ledger) and not any("statistical" in b.lower() for b in blockers)),
+        f"p={ledger.get('p_value')}; alpha={ledger.get('alpha')}; verdict={ledger.get('verdict')}",
+    )
     add("Is there more than one benchmark?", len(_as_list(manifest.get("datasets"))) > 1, str(len(_as_list(manifest.get("datasets")))))
     add("Are compute savings actually measured?", bool(manifest.get("latency") or manifest.get("token_cost")), "latency/token payload present" if (manifest.get("latency") or manifest.get("token_cost")) else "missing")
     add("Is the proposed gate better than confidence/disagreement routing?", any("Beats confidence" in row.get("claim", "") and row.get("can_appear_in_abstract") for row in claim_matrix), "claim-evidence matrix")
diff --git a/tests/fixtures/m1_p_eq_one.json b/tests/fixtures/m1_p_eq_one.json
new file mode 100644
index 0000000..099a315
--- /dev/null
+++ b/tests/fixtures/m1_p_eq_one.json
@@ -0,0 +1,38 @@
+{
+  "state": {
+    "result_packet": {
+      "run_id": "run-m1-p-one",
+      "claim_id": "claim-m1-p-one",
+      "claim_text": "The method improves utility.",
+      "metric_name": "exact_match",
+      "verdict": "inconclusive",
+      "p_value": 1.0,
+      "effect_size": 0.0,
+      "benchmark_summary": {
+        "primary_metric": "exact_match",
+        "per_method": {
+          "Candidate": {"exact_match": 0.705, "n": 200},
+          "Baseline": {"exact_match": 0.655, "n": 200}
+        },
+        "seed_variance": {
+          "Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}}
+        },
+        "seeds": [0, 1, 2],
+        "statistical_tests": "p=1.0000"
+      }
+    }
+  },
+  "manifest": {
+    "datasets": [{"name": "GSM8K", "split": "test", "num_test": 200}],
+    "models": [{"name": "Qwen", "prompt_template": "qa", "decoding": "greedy"}],
+    "baselines": ["Baseline", "Candidate"],
+    "metrics": ["exact_match"],
+    "seeds": [0, 1, 2],
+    "hardware": "CPU test fixture",
+    "statistical_tests": "p=1.0000",
+    "latency": {"per_method": {"Candidate": 1.0, "Baseline": 0.8}},
+    "token_cost": {"per_method": {"Candidate": 100, "Baseline": 80}},
+    "ablation": [{"name": "no_guard"}],
+    "artifacts": {"benchmark_summary": "benchmark_summary.json"}
+  }
+}
diff --git a/tests/fixtures/m1_refuted.json b/tests/fixtures/m1_refuted.json
new file mode 100644
index 0000000..a0743e6
--- /dev/null
+++ b/tests/fixtures/m1_refuted.json
@@ -0,0 +1,38 @@
+{
+  "state": {
+    "result_packet": {
+      "run_id": "run-m1-refuted",
+      "claim_id": "claim-m1-refuted",
+      "claim_text": "The method improves utility.",
+      "metric_name": "exact_match",
+      "verdict": "refuted",
+      "p_value": 0.0123,
+      "effect_size": -0.01,
+      "benchmark_summary": {
+        "primary_metric": "exact_match",
+        "per_method": {
+          "Candidate": {"exact_match": 0.605, "n": 200},
+          "Baseline": {"exact_match": 0.655, "n": 200}
+        },
+        "seed_variance": {
+          "Candidate": {"mean": 0.605, "std": 0.012, "per_seed": {"0": 0.61, "1": 0.60, "2": 0.605}}
+        },
+        "seeds": [0, 1, 2],
+        "statistical_tests": "paired permutation p=0.0123"
+      }
+    }
+  },
+  "manifest": {
+    "datasets": [{"name": "GSM8K", "split": "test", "num_test": 200}],
+    "models": [{"name": "Qwen", "prompt_template": "qa", "decoding": "greedy"}],
+    "baselines": ["Baseline", "Candidate"],
+    "metrics": ["exact_match"],
+    "seeds": [0, 1, 2],
+    "hardware": "CPU test fixture",
+    "statistical_tests": "paired permutation p=0.0123",
+    "latency": {"per_method": {"Candidate": 1.0, "Baseline": 0.8}},
+    "token_cost": {"per_method": {"Candidate": 100, "Baseline": 80}},
+    "ablation": [{"name": "no_guard"}],
+    "artifacts": {"benchmark_summary": "benchmark_summary.json"}
+  }
+}
diff --git a/tests/fixtures/m1_significant.json b/tests/fixtures/m1_significant.json
new file mode 100644
index 0000000..3da4701
--- /dev/null
+++ b/tests/fixtures/m1_significant.json
@@ -0,0 +1,40 @@
+{
+  "state": {
+    "result_packet": {
+      "run_id": "run-m1-significant",
+      "claim_id": "claim-m1-significant",
+      "claim_text": "The method improves utility.",
+      "metric_name": "exact_match",
+      "verdict": "confirmed",
+      "p_value": 0.0123,
+      "effect_size": 0.045,
+      "benchmark_summary": {
+        "primary_metric": "exact_match",
+        "per_method": {
+          "Candidate": {"exact_match": 0.705, "n": 200},
+          "Baseline": {"exact_match": 0.655, "n": 200}
+        },
+        "seed_variance": {
+          "Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}}
+        },
+        "seeds": [0, 1, 2],
+        "repro_ci": [0.01, 0.08],
+        "kept_ci": [0.0, 0.07],
+        "statistical_tests": "paired permutation p=0.0123"
+      }
+    }
+  },
+  "manifest": {
+    "datasets": [{"name": "GSM8K", "split": "test", "num_test": 200}],
+    "models": [{"name": "Qwen", "prompt_template": "qa", "decoding": "greedy"}],
+    "baselines": ["Baseline", "Candidate"],
+    "metrics": ["exact_match"],
+    "seeds": [0, 1, 2],
+    "hardware": "CPU test fixture",
+    "statistical_tests": "paired permutation p=0.0123",
+    "latency": {"per_method": {"Candidate": 1.0, "Baseline": 0.8}},
+    "token_cost": {"per_method": {"Candidate": 100, "Baseline": 80}},
+    "ablation": [{"name": "no_guard"}],
+    "artifacts": {"benchmark_summary": "benchmark_summary.json"}
+  }
+}
diff --git a/tests/test_paper_completeness_m1.py b/tests/test_paper_completeness_m1.py
new file mode 100644
index 0000000..170de8f
--- /dev/null
+++ b/tests/test_paper_completeness_m1.py
@@ -0,0 +1,110 @@
+import json
+from pathlib import Path
+
+from agents.paper_completeness import (
+    audit_evidence_completeness,
+    build_claim_evidence_matrix,
+    build_evidence_ledger,
+    build_reviewer_report,
+)
+
+
+FIXTURES = Path(__file__).parent / "fixtures"
+
+
+def _load_fixture(name: str) -> dict:
+    return json.loads((FIXTURES / name).read_text(encoding="utf-8"))
+
+
+def _improves_row(matrix: list[dict]) -> dict:
+    return next(row for row in matrix if row["claim"] == "Improves utility")
+
+
+def _claim_row(matrix: list[dict], claim: str) -> dict:
+    return next(row for row in matrix if row["claim"].startswith(claim))
+
+
+def _significance_answer(report: dict) -> dict:
+    return next(
+        row
+        for row in report["checklist"]
+        if row["question"] == "Is the improvement statistically significant?"
+    )
+
+
+def test_m1_p_eq_one_is_not_significant_even_with_p_text():
+    payload = _load_fixture("m1_p_eq_one.json")
+    matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"])
+
+    assert _improves_row(matrix)["can_appear_in_abstract"] is False
+    assert _claim_row(matrix, "The method improves utility")["can_appear_in_abstract"] is False
+
+
+def test_m1_confirmed_p_below_default_alpha_can_appear(monkeypatch):
+    monkeypatch.delenv("DEEPGRAPH_SIGNIFICANCE_ALPHA", raising=False)  # ambient override must not replace the default alpha
+    payload = _load_fixture("m1_significant.json")
+    matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"])
+    assert _improves_row(matrix)["can_appear_in_abstract"] is True
+
+
+def test_m1_refuted_verdict_blocks_abstract_claim_even_with_low_p():
+    payload = _load_fixture("m1_refuted.json")
+    matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"])
+
+    assert _improves_row(matrix)["can_appear_in_abstract"] is False
+    assert _claim_row(matrix, "The method improves utility")["can_appear_in_abstract"] is False
+
+
+def test_m1_alpha_env_override_controls_significance(monkeypatch):
+    payload = _load_fixture("m1_significant.json")
+    payload["state"]["result_packet"]["p_value"] = 0.049
+    payload["manifest"]["statistical_tests"] = "p=0.049"
+    monkeypatch.setenv("DEEPGRAPH_SIGNIFICANCE_ALPHA", "0.01")
+
+    matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"])
+
+    assert _improves_row(matrix)["can_appear_in_abstract"] is False
+
+
+def test_m1_missing_p_value_is_not_significant_and_does_not_crash():
+    payload = _load_fixture("m1_significant.json")
+    payload["state"]["result_packet"]["p_value"] = None
+    payload["manifest"]["statistical_tests"] = ""
+
+    matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"])
+
+    assert _improves_row(matrix)["can_appear_in_abstract"] is False
+
+
+def test_m1_reviewer_significance_answer_uses_numeric_p_but_preserves_existence_gate():
+    payload = _load_fixture("m1_p_eq_one.json")
+    matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"])
+    report = build_reviewer_report(payload["state"], payload["manifest"], matrix, blockers=[])
+
+    assert _significance_answer(report)["answer"] == "No"
+    audit = audit_evidence_completeness(payload["state"])
+    assert not any("Statistical test or confidence interval" in blocker for blocker in audit["blockers"])
+
+
+def test_m1_build_evidence_ledger_minimal_schema():
+    payload = _load_fixture("m1_significant.json")
+    packet = payload["state"]["result_packet"]
+    summary = packet["benchmark_summary"]
+
+    ledger = build_evidence_ledger(
+        packet,
+        summary,
+        alpha=0.01,
+        provenance={"command": "pytest tests/test_paper_completeness_m1.py"},
+    )
+
+    assert ledger["schema_version"] == "1.0"
+    assert ledger["alpha"] == 0.01
+    assert ledger["verdict"] == "confirmed"
+    assert ledger["p_value"] == 0.0123
+    assert ledger["effect_size"] == 0.045
+    assert ledger["confidence"] == 0.9877
+    assert ledger["per_method"]["Candidate"]["exact_match"] == 0.705
+    assert ledger["seed_variance"]["Candidate"]["per_seed"] == {"0": 0.71, "1": 0.7, "2": 0.705}
+    assert ledger["seeds"] == [0, 1, 2]
+    assert ledger["provenance"]["command"].startswith("pytest")

From 28b2c6355f25bb2c24b81c437039f75fb6629111 Mon Sep 17 00:00:00 2001
From: Protocol Zero <257158451+Protocol-zero-0@users.noreply.github.com>
Date: Thu, 4 Jun 2026 20:47:53 +0000
Subject: [PATCH 3/7] Add EvidenceLedger traceability checks (#27)

---
 agents/paper_completeness.py                 | 146 +++++++++++++++++++
 tests/fixtures/m4_clean.tex                  |   8 +
 tests/fixtures/m4_clean_ledger.json          |  22 +++
 tests/fixtures/m4_method_only.tex            |  10 ++
 tests/fixtures/m4_missing_p_ledger.json      |  21 +++
 tests/fixtures/m4_refuted_ledger.json        |  22 +++
 tests/fixtures/m4_refuted_positive_claim.tex |   8 +
 tests/fixtures/m4_unsourced_number.tex       |   8 +
 tests/fixtures/m4_whitelist_numbers.tex      |   8 +
 tests/test_paper_completeness_m4.py          |  58 ++++++++
 10 files changed, 311 insertions(+)
 create mode 100644 tests/fixtures/m4_clean.tex
 create mode 100644 tests/fixtures/m4_clean_ledger.json
 create mode 100644 tests/fixtures/m4_method_only.tex
 create mode 100644 tests/fixtures/m4_missing_p_ledger.json
 create mode 100644 tests/fixtures/m4_refuted_ledger.json
 create mode 100644 tests/fixtures/m4_refuted_positive_claim.tex
 create mode 100644 tests/fixtures/m4_unsourced_number.tex
 create mode 100644 tests/fixtures/m4_whitelist_numbers.tex
 create mode 100644 tests/test_paper_completeness_m4.py

diff --git a/agents/paper_completeness.py b/agents/paper_completeness.py
index 42061d7..6e72905 100644
--- a/agents/paper_completeness.py
+++ b/agents/paper_completeness.py
@@ -201,6 +201,152 @@ def _ledger_supports_significance(ledger: dict[str, Any]) -> bool:
     )
 
 
+EVIDENCE_LEDGER_REQUIRED_FIELDS = (
+    "schema_version",
+    "run_id",
+    "claim_id",
+    "metric",
+    "alpha",
+    "verdict",
+    "p_value",
+    "effect_size",
+    "confidence",
+    "repro_ci",
+    "kept_ci",
+    "per_method",
+    "seed_variance",
+    "seeds",
+    "provenance",
+)
+
+TRACEABILITY_POSITIVE_TERMS = (
+    "improves",
+    "improve",
+    "outperforms",
+    "outperform",
+    "significant",
+    "significantly",
+    "superior",
+    "beats",
+    "surpasses",
+)
+
+TRACEABILITY_NUMBER_RE = re.compile(r"(?<!\w)-?\d+\.?\d*%?")  # (?<!\w): skip digits inside identifiers (GSM8K) and range hyphens (0.01-0.08)
+
+
+def validate_evidence_ledger(ledger: dict) -> list[dict]:
+    """List one ``schema_error`` violation per required ledger field that is absent.
+
+    Only key presence is checked; the order follows EVIDENCE_LEDGER_REQUIRED_FIELDS.
+    """
+    present = _as_dict(ledger)
+    return [
+        {
+            "rule": "schema_error",
+            "location": {"section": "ledger", "line": 0},
+            "snippet": f"missing required field: {name}",
+            "value": name,
+        }
+        for name in EVIDENCE_LEDGER_REQUIRED_FIELDS if name not in present
+    ]
+
+
+def _traceable_sections(main_tex: str) -> list[tuple[str, str, int]]:
+    """Return (label, body, body start offset) for the abstract and each numbered or starred conclusion(s)/discussion section."""
+    text = main_tex or ""
+    sections: list[tuple[str, str, int]] = []
+    abstract = re.search(r"\\begin\{abstract\}(.*?)\\end\{abstract\}", text, re.DOTALL | re.IGNORECASE)
+    if abstract:
+        sections.append(("abstract", abstract.group(1), abstract.start(1)))
+    matches = list(re.finditer(r"\\section\*?\{([^}]*)\}", text, re.IGNORECASE))  # "\*?" also accepts unnumbered headings
+    for idx, match in enumerate(matches):
+        name = _lower(match.group(1))
+        if name not in {"conclusion", "conclusions", "discussion"}:
+            continue
+        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)  # a section runs to the next heading or EOF
+        sections.append(("discussion" if name == "discussion" else "conclusion", text[match.end():end], match.end()))
+    return sections
+
+
+def _ledger_numeric_sources(ledger: dict[str, Any]) -> list[float]:
+    """Gather the ledger numbers that manuscript text is allowed to quote.
+
+    Order: p_value, effect_size, repro_ci then kept_ci entries, each per_method value stored
+    under the ledger metric (skipped when no metric is set), then each seed_variance mean.
+    Values that ``_numeric`` maps to None are left out.
+    """
+    metric = _text(ledger.get("metric"))
+    raw: list[Any] = [ledger.get("p_value"), ledger.get("effect_size")]
+    raw.extend(_as_list(ledger.get("repro_ci")))
+    raw.extend(_as_list(ledger.get("kept_ci")))
+    per_method = _as_dict(ledger.get("per_method"))
+    # Each per-method row is looked up by the metric name.
+    if metric:
+        raw.extend(_as_dict(row).get(metric) for row in per_method.values())
+    seed_variance = _as_dict(ledger.get("seed_variance"))
+    raw.extend(_as_dict(stats).get("mean") for stats in seed_variance.values())
+    # Convert once per entry and keep only the ones that yield a number.
+    converted = (_numeric(entry) for entry in raw)
+    return [number for number in converted if number is not None]
+
+
+def _traceable_number(token: str, sources: list[float]) -> bool:
+    """Tell whether a number token equals some ledger source within 1e-6; "%" tokens are divided by 100."""
+    percent = token.endswith("%")
+    value = _numeric(token[:-1]) if percent else _numeric(token)
+    if value is None:
+        return True  # tokens that do not convert to a number are not reported
+    target = value / 100.0 if percent else value
+    return any(abs(target - source) <= 1e-6 for source in sources)
+
+
+def _skip_traceability_number(line: str, start: int, end: int) -> bool:
+    """Tell whether the number at line[start:end] is exempt from the traceability check.
+
+    Exempt: numbers right after "Table", "Figure" or "Section", and numbers right before "seed(s)".
+    """
+    after_reference = re.search(r"(?:Table|Figure|Section)\s*$", line[:start], re.IGNORECASE)
+    before_seeds = re.match(r"\s*seeds?\b", line[end:], re.IGNORECASE)
+    return bool(after_reference or before_seeds)
+
+
+def assert_traceable(main_tex: str, ledger: dict) -> list[dict]:
+    """Return ledger schema violations plus traceability violations found in the abstract, conclusion(s) and discussion."""
+    ledger = _as_dict(ledger)
+    violations = validate_evidence_ledger(ledger)
+    sources = _ledger_numeric_sources(ledger)
+    negative_verdict = _lower(ledger.get("verdict")) in {"refuted", "inconclusive"}
+    positive_re = re.compile(r"\b(" + "|".join(re.escape(term) for term in TRACEABILITY_POSITIVE_TERMS) + r")\b", re.IGNORECASE)
+    text = main_tex or ""
+    for section, body, offset in _traceable_sections(text):
+        for rel_line, line in enumerate(body.splitlines(), start=1):
+            absolute_line = text.count("\n", 0, offset) + rel_line
+            if negative_verdict and positive_re.search(line):
+                violations.append(
+                    {
+                        "rule": "positive_claim_with_negative_verdict",
+                        "location": {"section": section, "line": absolute_line},
+                        "snippet": line.strip(),
+                        "value": ledger.get("verdict"),
+                    }
+                )
+            for match in TRACEABILITY_NUMBER_RE.finditer(line):
+                if _skip_traceability_number(line, match.start(), match.end()):
+                    continue
+                # LaTeX writes a literal percent sign as "\%"; fold it into the token so 70.5\% is read as a percentage.
+                token = match.group(0) + ("%" if line.startswith("\\%", match.end()) else "")
+                if not _traceable_number(token, sources):
+                    violations.append(
+                        {
+                            "rule": "unsourced_number",
+                            "location": {"section": section, "line": absolute_line},
+                            "snippet": line.strip(),
+                            "value": token[:-1] if token.endswith("%") else token,
+                        }
+                    )
+    return violations
+
+
 def _infer_model_size(name: str) -> str:
     match = re.search(r"(\d+(?:\.\d+)?\s*[bB])", name or "")
     return match.group(1).replace(" ", "") if match else ""
diff --git a/tests/fixtures/m4_clean.tex b/tests/fixtures/m4_clean.tex
new file mode 100644
index 0000000..df2b41e
--- /dev/null
+++ b/tests/fixtures/m4_clean.tex
@@ -0,0 +1,8 @@
+\documentclass{article}
+\begin{document}
+\begin{abstract}
+Candidate reaches 70.5% exact match with p=0.0123 and effect 0.045.
+\end{abstract}
+\section{Conclusion}
+The reproduced mean is 70.5% with interval endpoints 0.01 and 0.08.
+\end{document}
diff --git a/tests/fixtures/m4_clean_ledger.json b/tests/fixtures/m4_clean_ledger.json
new file mode 100644
index 0000000..875c817
--- /dev/null
+++ b/tests/fixtures/m4_clean_ledger.json
@@ -0,0 +1,22 @@
+{
+  "schema_version": "1.0",
+  "run_id": "run-m4-clean",
+  "claim_id": "claim-m4-clean",
+  "metric": "exact_match",
+  "alpha": 0.05,
+  "verdict": "confirmed",
+  "p_value": 0.0123,
+  "effect_size": 0.045,
+  "confidence": 0.9877,
+  "repro_ci": [0.01, 0.08],
+  "kept_ci": [0.0, 0.07],
+  "per_method": {
+    "Candidate": {"exact_match": 0.705, "n": 200},
+    "Baseline": {"exact_match": 0.655, "n": 200}
+  },
+  "seed_variance": {
+    "Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}}
+  },
+  "seeds": [0, 1, 2],
+  "provenance": {"code_commit": "abc123", "dataset": "GSM8K", "command": "pytest"}
+}
diff --git a/tests/fixtures/m4_method_only.tex b/tests/fixtures/m4_method_only.tex
new file mode 100644
index 0000000..924f5e2
--- /dev/null
+++ b/tests/fixtures/m4_method_only.tex
@@ -0,0 +1,10 @@
+\documentclass{article}
+\begin{document}
+\begin{abstract}
+This paper describes the experiment.
+\end{abstract}
+\section{Method}
+The method section mentions a calibration constant 0.777 that is not in the ledger.
+\section{Conclusion}
+The conclusion contains no numeric claim.
+\end{document}
diff --git a/tests/fixtures/m4_missing_p_ledger.json b/tests/fixtures/m4_missing_p_ledger.json
new file mode 100644
index 0000000..269c894
--- /dev/null
+++ b/tests/fixtures/m4_missing_p_ledger.json
@@ -0,0 +1,21 @@
+{
+  "schema_version": "1.0",
+  "run_id": "run-m4-missing-p",
+  "claim_id": "claim-m4-missing-p",
+  "metric": "exact_match",
+  "alpha": 0.05,
+  "verdict": "confirmed",
+  "effect_size": 0.045,
+  "confidence": 0.99,
+  "repro_ci": [0.02, 0.08],
+  "kept_ci": [0.0, 0.07],
+  "per_method": {
+    "Candidate": {"exact_match": 0.705, "n": 200},
+    "Baseline": {"exact_match": 0.655, "n": 200}
+  },
+  "seed_variance": {
+    "Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}}
+  },
+  "seeds": [0, 1, 2],
+  "provenance": {"code_commit": "abc123", "dataset": "GSM8K", "command": "pytest"}
+}
diff --git a/tests/fixtures/m4_refuted_ledger.json b/tests/fixtures/m4_refuted_ledger.json
new file mode 100644
index 0000000..69ccf56
--- /dev/null
+++ b/tests/fixtures/m4_refuted_ledger.json
@@ -0,0 +1,22 @@
+{
+  "schema_version": "1.0",
+  "run_id": "run-m4-refuted",
+  "claim_id": "claim-m4-refuted",
+  "metric": "exact_match",
+  "alpha": 0.05,
+  "verdict": "refuted",
+  "p_value": 0.0123,
+  "effect_size": -0.01,
+  "confidence": 0.9877,
+  "repro_ci": [-0.08, -0.01],
+  "kept_ci": [-0.07, 0.0],
+  "per_method": {
+    "Candidate": {"exact_match": 0.605, "n": 200},
+    "Baseline": {"exact_match": 0.655, "n": 200}
+  },
+  "seed_variance": {
+    "Candidate": {"mean": 0.605, "std": 0.012, "per_seed": {"0": 0.61, "1": 0.60, "2": 0.605}}
+  },
+  "seeds": [0, 1, 2],
+  "provenance": {"code_commit": "abc123", "dataset": "GSM8K", "command": "pytest"}
+}
diff --git a/tests/fixtures/m4_refuted_positive_claim.tex b/tests/fixtures/m4_refuted_positive_claim.tex
new file mode 100644
index 0000000..9d7bced
--- /dev/null
+++ b/tests/fixtures/m4_refuted_positive_claim.tex
@@ -0,0 +1,8 @@
+\documentclass{article}
+\begin{document}
+\begin{abstract}
+Candidate significantly improves exact match over the baseline.
+\end{abstract}
+\section{Conclusion}
+The method outperforms the baseline.
+\end{document}
diff --git a/tests/fixtures/m4_unsourced_number.tex b/tests/fixtures/m4_unsourced_number.tex
new file mode 100644
index 0000000..9683f9a
--- /dev/null
+++ b/tests/fixtures/m4_unsourced_number.tex
@@ -0,0 +1,8 @@
+\documentclass{article}
+\begin{document}
+\begin{abstract}
+Candidate significantly improves exact match with p=0.01.
+\end{abstract}
+\section{Conclusion}
+The conclusion repeats p=0.01.
+\end{document}
diff --git a/tests/fixtures/m4_whitelist_numbers.tex b/tests/fixtures/m4_whitelist_numbers.tex
new file mode 100644
index 0000000..8c00fa8
--- /dev/null
+++ b/tests/fixtures/m4_whitelist_numbers.tex
@@ -0,0 +1,8 @@
+\documentclass{article}
+\begin{document}
+\begin{abstract}
+Table 2 summarizes the result over 5 seeds.
+\end{abstract}
+\section{Discussion}
+Figure 3 shows the setup.
+\end{document}
diff --git a/tests/test_paper_completeness_m4.py b/tests/test_paper_completeness_m4.py
new file mode 100644
index 0000000..5722f84
--- /dev/null
+++ b/tests/test_paper_completeness_m4.py
@@ -0,0 +1,58 @@
+import json
+from pathlib import Path
+
+from agents.paper_completeness import assert_traceable, validate_evidence_ledger
+
+
+FIXTURES = Path(__file__).parent / "fixtures"
+
+
+def _tex(name: str) -> str:
+    """Read a LaTeX fixture as UTF-8 text."""
+    return FIXTURES.joinpath(name).read_text(encoding="utf-8")
+
+
+def _ledger(name: str) -> dict:
+    """Read a JSON ledger fixture and parse it."""
+    return json.loads(FIXTURES.joinpath(name).read_text(encoding="utf-8"))
+
+
+def test_m4_unsourced_abstract_number_reports_violation_when_ledger_lacks_value():
+    """``p=0.01`` in the manuscript with no ``p_value`` in the ledger yields ``unsourced_number``."""
+    found = assert_traceable(_tex("m4_unsourced_number.tex"), _ledger("m4_missing_p_ledger.json"))
+    assert any(v["rule"] == "unsourced_number" and v["value"] == "0.01" for v in found)
+
+
+def test_m4_refuted_verdict_blocks_positive_abstract_and_conclusion_claims():
+    """With a ``refuted`` verdict, positive claims in the abstract and the conclusion are both flagged."""
+    found = assert_traceable(_tex("m4_refuted_positive_claim.tex"), _ledger("m4_refuted_ledger.json"))
+    assert any(v["rule"] == "positive_claim_with_negative_verdict" for v in found)
+    flagged_sections = {v["location"]["section"] for v in found}
+    assert {"abstract", "conclusion"} <= flagged_sections
+
+
+def test_m4_clean_abstract_and_conclusion_numbers_are_traceable_with_percent_normalization():
+    """Ledger-backed numbers, including ``70.5%`` for a stored ``0.705``, produce no violations."""
+    found = assert_traceable(_tex("m4_clean.tex"), _ledger("m4_clean_ledger.json"))
+    assert found == []
+
+
+def test_m4_numbers_outside_abstract_and_conclusion_are_allowed():
+    """A number that only appears in the Method section does not produce a violation."""
+    found = assert_traceable(_tex("m4_method_only.tex"), _ledger("m4_clean_ledger.json"))
+    assert found == []
+
+
+def test_m4_schema_validation_reports_missing_required_field():
+    """Dropping ``verdict`` from the ledger is reported as a ``schema_error`` naming that field."""
+    ledger = _ledger("m4_clean_ledger.json")
+    del ledger["verdict"]
+
+    found = validate_evidence_ledger(ledger)
+    assert any(v["rule"] == "schema_error" and v["value"] == "verdict" for v in found)
+
+
+def test_m4_table_figure_and_seed_counts_are_not_unsourced_numbers():
+    """Table/figure numbers and a seed count in the text are not treated as unsourced numbers."""
+    found = assert_traceable(_tex("m4_whitelist_numbers.tex"), _ledger("m4_clean_ledger.json"))
+    assert found == []

From 9cfc1cbfcbb4b85bd2a9a05ca87a528c5a26b3d4 Mon Sep 17 00:00:00 2001
From: Protocol Zero <257158451+Protocol-zero-0@users.noreply.github.com>
Date: Thu, 4 Jun 2026 20:52:17 +0000
Subject: [PATCH 4/7] Add deterministic LaTeX sanity checks (#25)

---
 agents/paper_completeness.py        | 99 ++++++++++++++++++++++++++++-
 tests/fixtures/m2a_clean.tex        |  7 ++
 tests/fixtures/m2a_unresolved.tex   |  4 ++
 tests/fixtures/m2b_clean.tex        |  6 ++
 tests/fixtures/m2b_scaffold.tex     |  6 ++
 tests/fixtures/m2c_clean.tex        |  7 ++
 tests/fixtures/m2c_contaminated.tex |  7 ++
 tests/fixtures/m2d_clean.tex        |  6 ++
 tests/fixtures/m2d_repeat.tex       |  7 ++
 tests/test_latex_sanity_m2.py       | 71 +++++++++++++++++++++
 10 files changed, 219 insertions(+), 1 deletion(-)
 create mode 100644 tests/fixtures/m2a_clean.tex
 create mode 100644 tests/fixtures/m2a_unresolved.tex
 create mode 100644 tests/fixtures/m2b_clean.tex
 create mode 100644 tests/fixtures/m2b_scaffold.tex
 create mode 100644 tests/fixtures/m2c_clean.tex
 create mode 100644 tests/fixtures/m2c_contaminated.tex
 create mode 100644 tests/fixtures/m2d_clean.tex
 create mode 100644 tests/fixtures/m2d_repeat.tex
 create mode 100644 tests/test_latex_sanity_m2.py

diff --git a/agents/paper_completeness.py b/agents/paper_completeness.py
index 6e72905..4d3e4ab 100644
--- a/agents/paper_completeness.py
+++ b/agents/paper_completeness.py
@@ -36,6 +36,9 @@
     "outside the verified claim",
     "missing generated figure",
     "diagram placeholder",
+    "fixme",
+    "<scaffold",
+    "generate a figure that",
 )
 
 FORBIDDEN_LATEX_SYMBOLS = ("￾", "", "�")
@@ -87,6 +90,21 @@
     "graph-organized neural units",
 )
 
+# Capitalised tokens that the cross-run identity check in
+# _deterministic_latex_hits always accepts, in addition to the tokens
+# extracted from the run's own method name.
+METHOD_TOKEN_WHITELIST = {
+    # Benchmark and metric names.
+    "GSM8K", "MuSiQue", "MMLU", "BLEU",
+    # Generic acronyms and tool/format names.
+    "GPU", "CPU", "LLM",
+    "LaTeX", "JSON", "API",
+}
+
+# Minimum number of times one normalised sentence (8+ words) must occur
+# before it is reported under the "boilerplate_repetition" rule.
+BOILERPLATE_REPEAT_THRESHOLD = 3
+
 
 def _as_dict(value: Any) -> dict[str, Any]:
     if isinstance(value, Mapping):
@@ -1033,10 +1051,89 @@ def audit_evidence_completeness(state: dict[str, Any]) -> dict[str, Any]:
     }
 
 
-def latex_sanity_check(text: str) -> dict[str, Any]:
+def _line_hit(rule: str, value: str, line_no: int, line: str, *, kind: str = "deterministic") -> dict[str, Any]:
+    """Build one line-anchored hit record for the LaTeX sanity report."""
+    # Insertion order is kept as: kind, rule, value, location, snippet.
+    hit: dict[str, Any] = {"kind": kind, "rule": rule, "value": value}
+    hit["location"] = {"line": line_no}
+    # Surrounding whitespace is dropped from the offending line.
+    hit["snippet"] = line.strip()
+    return hit
+
+
+def _strip_latex_code_blocks(text: str) -> str:
+    """Blank out verbatim/lstlisting/minted blocks and ``` fences, keeping newlines so line numbers hold."""
+    def _keep_newlines(match: re.Match[str]) -> str:
+        return "\n" * match.group(0).count("\n")
+
+    env_re = r"\\begin\{(?:verbatim|lstlisting|minted)\}.*?\\end\{(?:verbatim|lstlisting|minted)\}"
+    stripped = re.sub(env_re, _keep_newlines, text or "", flags=re.DOTALL | re.IGNORECASE)
+    stripped = re.sub(r"```.*?```", _keep_newlines, stripped, flags=re.DOTALL)
+    return stripped
+
+
+def _deterministic_latex_hits(text: str, state: dict[str, Any] | None = None) -> list[dict[str, Any]]:
+    """Return regex-only LaTeX hits: unresolved refs, ``{{name}}`` scaffolds, repeated sentences and, when
+    ``state["method_name"]`` is set, foreign acronym/CamelCase tokens in the title, preamble and captions."""
+    state = _as_dict(state)
+    scan_text = _strip_latex_code_blocks(text or "")
+    hits: list[dict[str, Any]] = []
+    for line_no, line in enumerate(scan_text.splitlines(), start=1):
+        if "??" in line:  # any "??" (including "Table ??" / "Figure ??") is an unresolved reference
+            hits.append(_line_hit("unresolved_reference", "??", line_no, line))
+        for match in re.finditer(r"\\(?:ref|cite[a-zA-Z*]*)\{([^}]*)\}", line):
+            key = match.group(1).strip()
+            if not key or "?" in key or _lower(key) in {"todo", "tbd", "placeholder"}:
+                hits.append(_line_hit("unresolved_reference", match.group(0), line_no, line))
+        for match in re.finditer(r"\{\{[a-z_][a-z0-9_]*\}\}", line):
+            prefix = line[: match.start()]
+            # Doubled braces right after a command or a closing brace are LaTeX arguments, not scaffolds.
+            if re.search(r"\\[A-Za-z]+\s*$", prefix) or prefix.rstrip().endswith("}"):
+                continue
+            hits.append(_line_hit("template_placeholder", match.group(0), line_no, line))
+
+    method_name = _text(state.get("method_name"))
+    if method_name:
+        token_re = re.compile(r"\b(?:[A-Z]{2,}[A-Za-z0-9]*|[A-Z][a-z]+(?:[A-Z][A-Za-z0-9]*)+)\b")
+        allowed = set(METHOD_TOKEN_WHITELIST) | set(token_re.findall(method_name))
+        scoped_parts: list[tuple[str, int]] = []
+        for match in re.finditer(r"\\title\{([^}]*)\}", scan_text, re.DOTALL | re.IGNORECASE):
+            scoped_parts.append((match.group(1), scan_text.count("\n", 0, match.start(1)) + 1))
+        begin_doc = re.search(r"\\begin\{document\}", scan_text, re.IGNORECASE)
+        if begin_doc:
+            scoped_parts.append((scan_text[:begin_doc.start()], 1))
+        for match in re.finditer(r"\\caption\{([^}]*)\}", scan_text, re.DOTALL | re.IGNORECASE):
+            scoped_parts.append((match.group(1), scan_text.count("\n", 0, match.start(1)) + 1))
+        reported: set[tuple[str, int]] = set()
+        for snippet, base_line in scoped_parts:
+            snippet_lines = snippet.split("\n")
+            for token_match in token_re.finditer(snippet):
+                token = token_match.group(0)
+                hit_line = base_line + snippet.count("\n", 0, token_match.start())
+                # A preamble \title is scanned twice (title + preamble): report each token/line pair once.
+                if token not in allowed and (token, hit_line) not in reported:
+                    reported.add((token, hit_line))
+                    hits.append(_line_hit("cross_run_identity", token, hit_line, snippet_lines[hit_line - base_line]))
+
+    sentence_locations: dict[str, list[tuple[int, str]]] = {}
+    for line_no, line in enumerate(scan_text.splitlines(), start=1):
+        for sentence in re.split(r"(?<=[.!?])\s+", line.strip()):
+            normalized = re.sub(r"[^a-z0-9\s]", "", sentence.lower())
+            normalized = re.sub(r"\s+", " ", normalized).strip()
+            if len(normalized.split()) >= 8:
+                sentence_locations.setdefault(normalized, []).append((line_no, sentence))
+    for normalized, locations in sentence_locations.items():
+        if len(locations) >= BOILERPLATE_REPEAT_THRESHOLD:
+            line_no, sentence = locations[0]
+            hits.append(_line_hit("boilerplate_repetition", normalized, line_no, sentence))
+    return hits
+
+
+def latex_sanity_check(text: str, state: dict[str, Any] | None = None, ledger: dict[str, Any] | None = None) -> dict[str, Any]:
     lower = (text or "").lower()
     hits = [{"kind": "term", "value": term} for term in FORBIDDEN_LATEX_TERMS if term in lower]
     hits.extend({"kind": "symbol", "value": symbol} for symbol in FORBIDDEN_LATEX_SYMBOLS if symbol in (text or ""))
+    hits.extend(_deterministic_latex_hits(text or "", state=state))
     return {
         "schema_version": "latex_sanity_v1",
         "ok": not hits,
diff --git a/tests/fixtures/m2a_clean.tex b/tests/fixtures/m2a_clean.tex
new file mode 100644
index 0000000..68b02e7
--- /dev/null
+++ b/tests/fixtures/m2a_clean.tex
@@ -0,0 +1,7 @@
+\documentclass{article}
+\begin{document}
+Does the method help?
+\begin{verbatim}
+Table ??
+\end{verbatim}
+\end{document}
diff --git a/tests/fixtures/m2a_unresolved.tex b/tests/fixtures/m2a_unresolved.tex
new file mode 100644
index 0000000..c18c553
--- /dev/null
+++ b/tests/fixtures/m2a_unresolved.tex
@@ -0,0 +1,4 @@
+\documentclass{article}
+\begin{document}
+Table ?? reports the unresolved result.
+\end{document}
diff --git a/tests/fixtures/m2b_clean.tex b/tests/fixtures/m2b_clean.tex
new file mode 100644
index 0000000..01c4970
--- /dev/null
+++ b/tests/fixtures/m2b_clean.tex
@@ -0,0 +1,6 @@
+\documentclass{article}
+\begin{document}
+\[
+\frac{{a}}{{b}}
+\]
+\end{document}
diff --git a/tests/fixtures/m2b_scaffold.tex b/tests/fixtures/m2b_scaffold.tex
new file mode 100644
index 0000000..673f786
--- /dev/null
+++ b/tests/fixtures/m2b_scaffold.tex
@@ -0,0 +1,6 @@
+\documentclass{article}
+\begin{document}
+\begin{figure}
+\caption{Generated plot command {{plot_cmd}}.}
+\end{figure}
+\end{document}
diff --git a/tests/fixtures/m2c_clean.tex b/tests/fixtures/m2c_clean.tex
new file mode 100644
index 0000000..ae1dcb8
--- /dev/null
+++ b/tests/fixtures/m2c_clean.tex
@@ -0,0 +1,7 @@
+\documentclass{article}
+\title{CGGR Results on GSM8K}
+\begin{document}
+\begin{figure}
+\caption{CGGR trajectory on GSM8K with CPU evaluation.}
+\end{figure}
+\end{document}
diff --git a/tests/fixtures/m2c_contaminated.tex b/tests/fixtures/m2c_contaminated.tex
new file mode 100644
index 0000000..da9e89c
--- /dev/null
+++ b/tests/fixtures/m2c_contaminated.tex
@@ -0,0 +1,7 @@
+\documentclass{article}
+\title{OtherMethod Results}
+\begin{document}
+\begin{figure}
+\caption{OtherMethod trajectory.}
+\end{figure}
+\end{document}
diff --git a/tests/fixtures/m2d_clean.tex b/tests/fixtures/m2d_clean.tex
new file mode 100644
index 0000000..10a0ea4
--- /dev/null
+++ b/tests/fixtures/m2d_clean.tex
@@ -0,0 +1,6 @@
+\documentclass{article}
+\begin{document}
+This section introduces the benchmark setup.
+The result section then describes the measured effects.
+Finally, the discussion names limitations without repeating a template.
+\end{document}
diff --git a/tests/fixtures/m2d_repeat.tex b/tests/fixtures/m2d_repeat.tex
new file mode 100644
index 0000000..1729d07
--- /dev/null
+++ b/tests/fixtures/m2d_repeat.tex
@@ -0,0 +1,7 @@
+\documentclass{article}
+\begin{document}
+This repeated boilerplate sentence should not appear in every generated paragraph.
+This repeated boilerplate sentence should not appear in every generated paragraph.
+This repeated boilerplate sentence should not appear in every generated paragraph.
+This repeated boilerplate sentence should not appear in every generated paragraph.
+\end{document}
diff --git a/tests/test_latex_sanity_m2.py b/tests/test_latex_sanity_m2.py
new file mode 100644
index 0000000..1a00845
--- /dev/null
+++ b/tests/test_latex_sanity_m2.py
@@ -0,0 +1,71 @@
+from pathlib import Path
+
+from agents.paper_completeness import latex_sanity_check
+
+
+FIXTURES = Path(__file__).parent / "fixtures"
+
+
+def _tex(name: str) -> str:
+    """Read a LaTeX fixture as UTF-8 text."""
+    return FIXTURES.joinpath(name).read_text(encoding="utf-8")
+
+
+def _rules(report: dict) -> set[str]:
+    """Collect each hit's ``rule``, falling back to ``kind`` for hits that have none."""
+    return set(entry.get("rule") or entry.get("kind") for entry in report.get("hits", []))
+
+
+def test_m2a_unresolved_reference_fails_with_line_location():
+    """An unresolved ``Table ??`` fails the check and at least one hit carries a line number."""
+    outcome = latex_sanity_check(_tex("m2a_unresolved.tex"))
+    assert outcome["ok"] is False
+    assert "unresolved_reference" in _rules(outcome)
+    assert any(entry.get("location", {}).get("line") for entry in outcome["hits"])
+
+
+def test_m2a_single_question_and_verbatim_question_marks_pass():
+    """A single ``?`` in prose and a ``??`` inside ``verbatim`` do not produce hits."""
+    outcome = latex_sanity_check(_tex("m2a_clean.tex"))
+    assert outcome["ok"] is True
+
+
+def test_m2b_scaffold_placeholder_regex_fails_but_latex_braces_pass():
+    """``{{plot_cmd}}`` in a caption is a template placeholder; doubled math braces after a command are not."""
+    scaffolded = latex_sanity_check(_tex("m2b_scaffold.tex"))
+    plain = latex_sanity_check(_tex("m2b_clean.tex"))
+    assert scaffolded["ok"] is False
+    assert "template_placeholder" in _rules(scaffolded)
+    assert plain["ok"] is True
+
+
+def test_m2b_existing_placeholder_forbidden_term_still_blocks():
+    """The forbidden term ``placeholder`` is still reported alongside the new checks."""
+    outcome = latex_sanity_check("This caption contains placeholder text.")
+    assert outcome["ok"] is False
+    assert any(entry.get("value") == "placeholder" for entry in outcome["hits"])
+
+
+def test_m2c_cross_run_method_token_fails_by_set_membership():
+    """With ``method_name`` CGGR, a title/caption naming ``OtherMethod`` is a ``cross_run_identity`` hit."""
+    outcome = latex_sanity_check(_tex("m2c_contaminated.tex"), state={"method_name": "CGGR"})
+    assert outcome["ok"] is False
+    assert "cross_run_identity" in _rules(outcome)
+    assert any(entry.get("value") == "OtherMethod" for entry in outcome["hits"])
+
+
+def test_m2c_method_token_and_abbreviation_whitelist_pass():
+    """The method's own name and whitelisted acronyms (GSM8K, CPU) in title and caption pass."""
+    assert latex_sanity_check(_tex("m2c_clean.tex"), state={"method_name": "CGGR"})["ok"] is True
+
+
+def test_m2d_repeated_boilerplate_sentence_fails():
+    """A sentence that appears four times is reported as ``boilerplate_repetition``."""
+    outcome = latex_sanity_check(_tex("m2d_repeat.tex"))
+    assert outcome["ok"] is False
+    assert "boilerplate_repetition" in _rules(outcome)
+
+
+def test_m2d_clean_sentences_pass():
+    """Distinct sentences produce no hits."""
+    assert latex_sanity_check(_tex("m2d_clean.tex"))["ok"] is True

From 2c769f96046194f856c6372f4eb1e3f9e4f45835 Mon Sep 17 00:00:00 2001
From: Protocol Zero <257158451+Protocol-zero-0@users.noreply.github.com>
Date: Thu, 4 Jun 2026 20:56:46 +0000
Subject: [PATCH 5/7] Wire deterministic sanity checks into render gate (#28)

---
 agents/paper_orchestra_pipeline.py |   2 +-
 tests/test_vnext_manuscript.py     | 124 +++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+), 1 deletion(-)

diff --git a/agents/paper_orchestra_pipeline.py b/agents/paper_orchestra_pipeline.py
index 98f352b..11497bd 100644
--- a/agents/paper_orchestra_pipeline.py
+++ b/agents/paper_orchestra_pipeline.py
@@ -1840,7 +1840,7 @@ def _resolve_template_id(fmt: str) -> str:
                 "backend": "paper_orchestra",
             }
         page_budget_warning = None
-        latex_sanity_report = latex_sanity_check(main_tex)
+        latex_sanity_report = latex_sanity_check(main_tex, state=state)
         _write(bundle_dir / "main.tex", main_tex)
         _write(
             bundle_dir / "latex_sanity_report.json",
diff --git a/tests/test_vnext_manuscript.py b/tests/test_vnext_manuscript.py
index 53b9eb8..2636c3a 100644
--- a/tests/test_vnext_manuscript.py
+++ b/tests/test_vnext_manuscript.py
@@ -323,6 +323,43 @@ def _stub_orchestra(
             },
         }
 
+    def _generate_bundle_offline(self, run_full, side_effect):
+        """Build the conference bundle with the venue gate and page-budget steps patched to pass."""
+        # Venue gate stand-in: returns the LaTeX untouched together with a passing report.
+        def _venue_ok(main_tex, **_):
+            return main_tex, {"pass": True, "final": {"pass": True, "template_id": "iclr2026"}}
+
+        # Page-budget stand-in: returns the LaTeX untouched together with a passing page count.
+        def _budget_ok(main_tex, *_args, **_kwargs):
+            return main_tex, {"pass": True, "main_body_pages": 9, "total_pdf_pages": 10}
+
+        run_full.side_effect = side_effect
+        with mock.patch(
+            "agents.manuscript_submission_enrichment.apply_venue_gates_with_retry", side_effect=_venue_ok
+        ), mock.patch(
+            "agents.manuscript_page_budget.apply_exact_page_budget", side_effect=_budget_ok
+        ), mock.patch(
+            "agents.manuscript_page_budget.page_budget_blockers", return_value=[]
+        ):
+            return generate_submission_bundle(1, bundle_formats=["conference"])
+
+    def _orchestra_with_full_tex(self, full_tex):
+        """Return a ``_run_full_pipeline`` stand-in whose ``refinement_full_text`` is ``full_tex``."""
+        def _with_full_tex(state, literature_block, paper_ids, iterations, *, figures_dir, baseline, metric_name, template_id=None):
+            # Reuse the shared stub for everything else, then swap in the
+            # caller-supplied LaTeX as the refined full text.
+            keyword_args = {
+                "figures_dir": figures_dir,
+                "baseline": baseline,
+                "metric_name": metric_name,
+                "template_id": template_id,
+            }
+            result = self._stub_orchestra(state, literature_block, paper_ids, iterations, **keyword_args)
+            result["refinement_full_text"] = full_tex
+            return result
+
+        return _with_full_tex
+
     @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline")
     def test_generate_submission_bundle_creates_verified_bundle_files_and_db_rows(self, run_full):
         run_full.side_effect = self._stub_orchestra
@@ -370,6 +407,93 @@ def test_generate_submission_bundle_creates_verified_bundle_files_and_db_rows(se
         self.assertIn("fig_metric_trajectory.svg", main_tex)
         self.assertTrue((self.workspace_root / "idea_1" / "paper" / "current" / "main.tex").exists())
 
+    @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline")
+    def test_generate_submission_bundle_blocks_unresolved_reference_in_rendered_tex(self, run_full):
+        """An unresolved ``Table ??`` in the rendered LaTeX must fail the bundle with a ``??`` blocker."""
+        pieces = [
+            "\\documentclass{article}\\begin{document}",
+            "\\begin{abstract}Clean abstract.\\end{abstract}",
+            "\\section{Results}Table ?? reports the result.",
+            "\\section{Discussion}Clean discussion.",
+            "\\end{document}",
+        ]
+        outcome = self._generate_bundle_offline(run_full, self._orchestra_with_full_tex("".join(pieces)))
+        self.assertIn("error", outcome)
+        blocker_text = " ".join(outcome.get("submission_blockers") or []).lower()
+        self.assertIn("??", blocker_text)
+
+    @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline")
+    def test_generate_submission_bundle_blocks_repeated_boilerplate_in_rendered_tex(self, run_full):
+        """A sentence repeated four times in the rendered LaTeX must fail the bundle with a boilerplate blocker."""
+        sentence = "This repeated boilerplate sentence should not appear in every generated paragraph."
+        repeated = "\n".join(sentence for _ in range(4)) + "\n"
+        rendered = (
+            "\\documentclass{article}\\begin{document}"
+            "\\begin{abstract}Clean abstract.\\end{abstract}"
+            "\\section{Results}"
+            + repeated
+            + "\\section{Discussion}Clean discussion."
+            "\\end{document}"
+        )
+        outcome = self._generate_bundle_offline(run_full, self._orchestra_with_full_tex(rendered))
+        self.assertIn("error", outcome)
+        blocker_text = " ".join(outcome.get("submission_blockers") or []).lower()
+        self.assertIn("boilerplate", blocker_text)
+
+    @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline")
+    def test_generate_submission_bundle_preserves_existing_placeholder_gate(self, run_full):
+        """The forbidden term ``placeholder`` in the rendered LaTeX must still fail the bundle."""
+        pieces = [
+            "\\documentclass{article}\\begin{document}",
+            "\\begin{abstract}Clean abstract.\\end{abstract}",
+            "\\section{Results}This placeholder text must not ship.",
+            "\\section{Discussion}Clean discussion.",
+            "\\end{document}",
+        ]
+        outcome = self._generate_bundle_offline(run_full, self._orchestra_with_full_tex("".join(pieces)))
+        self.assertIn("error", outcome)
+        blocker_text = " ".join(outcome.get("submission_blockers") or []).lower()
+        self.assertIn("placeholder", blocker_text)
+
+    @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline")
+    def test_generate_submission_bundle_reports_all_latex_sanity_hits(self, run_full):
+        """When one line has both ``??`` and ``{{plot_cmd}}``, the blockers must mention both."""
+        pieces = [
+            "\\documentclass{article}\\begin{document}",
+            "\\begin{abstract}Clean abstract.\\end{abstract}",
+            "\\section{Results}Table ?? appears with {{plot_cmd}}.",
+            "\\section{Discussion}Clean discussion.",
+            "\\end{document}",
+        ]
+        outcome = self._generate_bundle_offline(run_full, self._orchestra_with_full_tex("".join(pieces)))
+        self.assertIn("error", outcome)
+        blocker_text = " ".join(outcome.get("submission_blockers") or []).lower()
+        self.assertIn("??", blocker_text)
+        self.assertIn("plot_cmd", blocker_text)
+
+    @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline")
+    def test_generate_submission_bundle_blocks_cross_run_identity_in_rendered_tex(self, run_full):
+        """A caption naming ``OtherMethod`` must fail the bundle with a blocker that mentions it."""
+        pieces = [
+            "\\documentclass{article}",
+            "\\title{Auto Manuscript Insight}",
+            "\\begin{document}",
+            "\\maketitle",
+            "\\begin{abstract}Clean abstract.\\end{abstract}",
+            "\\section{Introduction}Intro with \\cite{cite_a}.",
+            "\\begin{figure}[t]",
+            "\\caption{OtherMethod trajectory.}",
+            "\\end{figure}",
+            "\\section{Discussion}Clean discussion.",
+            "\\bibliographystyle{plain}",
+            "\\bibliography{references}",
+            "\\end{document}",
+        ]
+        outcome = self._generate_bundle_offline(run_full, self._orchestra_with_full_tex("".join(pieces)))
+        self.assertIn("error", outcome)
+        blocker_text = " ".join(outcome.get("submission_blockers") or []).lower()
+        self.assertIn("othermethod", blocker_text)
+
     def test_generate_submission_bundle_blocks_non_formal_run(self):
         database.execute(
             "UPDATE experiment_runs SET proxy_config=? WHERE id=1",

From 384664254e0960545678f37a1599ff2868de0008 Mon Sep 17 00:00:00 2001
From: Protocol Zero <257158451+Protocol-zero-0@users.noreply.github.com>
Date: Thu, 4 Jun 2026 20:59:55 +0000
Subject: [PATCH 6/7] Add CPU presentation boundary tests (#26)

---
 tests/test_presentation_cpu_boundary_m5.py | 126 +++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 tests/test_presentation_cpu_boundary_m5.py

diff --git a/tests/test_presentation_cpu_boundary_m5.py b/tests/test_presentation_cpu_boundary_m5.py
new file mode 100644
index 0000000..7f9b117
--- /dev/null
+++ b/tests/test_presentation_cpu_boundary_m5.py
@@ -0,0 +1,126 @@
+import json
+import subprocess
+import sys
+
+from agents.benchmark_artifacts import materialize_deep_benchmark_artifacts
+from agents.paper_orchestra_pipeline import assemble_main_tex
+
+
+FORBIDDEN_MODULES = {
+    # GPU / model-serving stacks.
+    "torch", "transformers", "vllm",
+    # Experiment generation and execution agents.
+    "agents.experiment_forge",
+    "agents.experiment_executor",
+}
+
+
+def test_m5_importing_presentation_modules_does_not_load_gpu_or_execution_modules():
+    """Import the presentation modules in a fresh interpreter; fail if a forbidden module was loaded."""
+    from pathlib import Path
+
+    script = (
+        "import sys\n"
+        "import agents.paper_orchestra_pipeline\n"
+        "import agents.paper_completeness\n"
+        f"forbidden = {sorted(FORBIDDEN_MODULES)!r}\n"
+        "loaded = [name for name in forbidden if name in sys.modules]\n"
+        "print('\\n'.join(loaded))\n"
+        "raise SystemExit(1 if loaded else 0)\n"
+    )
+    # Run from the repository root (parent of tests/) so ``agents`` imports no matter where pytest was started.
+    repo_root = Path(__file__).resolve().parents[1]
+    result = subprocess.run(
+        [sys.executable, "-c", script], cwd=repo_root, text=True, capture_output=True, check=False
+    )
+
+    assert result.returncode == 0, result.stdout + result.stderr
+    assert result.stdout.strip() == ""
+
+
+def test_m5_assemble_main_tex_renders_offline_without_llm_or_gpu_modules():
+    """Render ``main.tex`` from in-memory snapshots and check that no forbidden module ends up loaded."""
+    awareness = {
+        "central_question": "Can rendering run without benchmark execution?",
+        "motivation": "Presentation gates must stay CPU-only.",
+        "method_answer": "Use already materialized evidence.",
+        "result_claim": "The renderer emits LaTeX from snapshots.",
+    }
+    per_method = {
+        "Candidate": {"exact_match": 0.61, "n": 20},
+        "Baseline": {"exact_match": 0.50, "n": 20},
+    }
+    state = {
+        "title": "CPU Boundary Paper",
+        "baseline_metric_name": "exact_match",
+        "baseline_metric_value": 0.5,
+        "best_metric_value": 0.61,
+        "effect_pct": 22.0,
+        "verdict": "confirmed",
+        "problem_statement": "Can a deterministic renderer run offline?",
+        "method_summary": "A presentation-only method summary.",
+        "problem_awareness": awareness,
+        "contributions": ["A CPU-only rendering path."],
+        "benchmark_summary": {"primary_metric": "exact_match", "per_method": per_method},
+    }
+    refined = {
+        "abstract": "Offline abstract.",
+        "introduction": "Offline introduction.",
+        "method": "Offline method.",
+        "experiments": "Offline experiments.",
+        "discussion": "Offline discussion.",
+    }
+    orchestrated = {"refined": refined, "plotting": {"plotting_executor": {"assets": []}}}
+
+    # No stubs or mocks here: assemble_main_tex is called directly on the snapshots above.
+    rendered = assemble_main_tex(state, orchestrated, "conference")
+
+    for expected in ("\\documentclass", "CPU Boundary Paper", "\\section{Experiments}"):
+        assert expected in rendered
+    # NOTE(review): this inspects the current pytest process, so a forbidden module imported
+    # earlier by another test module would fail this assertion -- confirm test isolation.
+    assert not (FORBIDDEN_MODULES & set(sys.modules))
+
+
+def test_m5_materialize_raw_predictions_to_cpu_artifacts(tmp_path):
+    """Materialize benchmark tables from a synthetic ``raw_predictions.jsonl`` and check the summary."""
+    results_dir = tmp_path / "results"
+    results_dir.mkdir()
+    # Every row of a method carries the same score, so the per-method and per-seed
+    # values asserted below are expected to equal that score.
+    score_by_method = {"CPG": 0.72, "Baseline": 0.61, "CPG/no_guard": 0.66}
+    datasets = ["GSM8K", "StrategyQA"]
+    # 3 methods x 2 datasets x 3 seeds x 7 repeats = 126 rows, i.e. 42 rows per method.
+    rows = [
+        {"method": method, "dataset": dataset, "seed": seed, "exact_match": score}
+        for method, score in score_by_method.items()
+        for dataset in datasets
+        for seed in range(3)
+        for _ in range(7)
+    ]
+    payload = "".join(json.dumps(row) + "\n" for row in rows)
+    (results_dir / "raw_predictions.jsonl").write_text(payload, encoding="utf-8")
+
+    report = materialize_deep_benchmark_artifacts(
+        results_dir,
+        publication_contract={"required_ablations": ["CPG/no_guard"]},
+        metric_name="exact_match",
+        min_lines=100,
+    )
+
+    assert report["ok"] is True
+    expected_files = (
+        "benchmark_summary.json",
+        "main_results_table.json",
+        "seed_variance_table.json",
+        "per_dataset_results.json",
+        "ablation_table.json",
+    )
+    assert all((results_dir / name).exists() for name in expected_files)
+    summary = json.loads((results_dir / "benchmark_summary.json").read_text(encoding="utf-8"))
+    assert summary["primary_metric"] == "exact_match"
+    assert summary["per_method"]["CPG"]["exact_match"] == 0.72
+    assert summary["per_method"]["CPG"]["n"] == 42
+    assert summary["seed_variance"]["CPG"]["n_seeds"] == 3
+    assert summary["seed_variance"]["CPG"]["per_seed"] == {"0": 0.72, "1": 0.72, "2": 0.72}
+    assert summary["ablations"]["CPG/no_guard"]["executed"] is True

From df7881592f5b55f294ece29bb653067c13e688a8 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 6 Jun 2026 13:19:21 +0000
Subject: [PATCH 7/7] Add missing CLA signer for Protocol-zero-0

---
 cla-signers.json | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cla-signers.json b/cla-signers.json
index d24e8e5..054cb75 100644
--- a/cla-signers.json
+++ b/cla-signers.json
@@ -6,6 +6,9 @@
     },
     {
       "github": "hitome0123"
+    },
+    {
+      "github": "Protocol-zero-0"
     }
   ]
 }