From 60fcc096683d886ce57f41588900b64bf802fc55 Mon Sep 17 00:00:00 2001
From: Protocol Zero <257158451+Protocol-zero-0@users.noreply.github.com>
Date: Thu, 4 Jun 2026 20:40:14 +0000
Subject: [PATCH 1/7] Stabilize vnext manuscript regression fixture
---
tests/test_vnext_manuscript.py | 95 +++++++++++++++++++++++++++++++---
1 file changed, 89 insertions(+), 6 deletions(-)
diff --git a/tests/test_vnext_manuscript.py b/tests/test_vnext_manuscript.py
index 88c429f..53b9eb8 100644
--- a/tests/test_vnext_manuscript.py
+++ b/tests/test_vnext_manuscript.py
@@ -225,7 +225,18 @@ def _write_complete_benchmark_packet(self):
(results_dir / "ablation_table.json").write_text(json.dumps(benchmark_summary["ablation_table"]), encoding="utf-8")
(results_dir / "latency_tokens_table.json").write_text(json.dumps(benchmark_summary["latency_tokens_table"]), encoding="utf-8")
- def _stub_orchestra(self, state, literature_block, paper_ids, iterations, *, figures_dir, baseline, metric_name):
+ def _stub_orchestra(
+ self,
+ state,
+ literature_block,
+ paper_ids,
+ iterations,
+ *,
+ figures_dir,
+ baseline,
+ metric_name,
+ template_id=None,
+ ):
figures_dir.mkdir(parents=True, exist_ok=True)
(figures_dir / "fig_metric_trajectory.svg").write_text(
'',
@@ -256,10 +267,39 @@ def _stub_orchestra(self, state, literature_block, paper_ids, iterations, *, fig
"abstract": "Abstract text.",
"introduction": "Introduction text with \\cite{cite_a}.",
"method": "Method text.",
- "experiments": "Experiments text.",
+ "experiments": (
+ "Experiments text.\\n"
+ "\\begin{figure}[t]\\n"
+ "\\centering\\n"
+ "\\includegraphics[width=0.9\\linewidth]{figures/fig_metric_trajectory.svg}\\n"
+ "\\caption{Metric trajectory.}\\n"
+ "\\label{fig:metric_trajectory}\\n"
+ "\\end{figure}"
+ ),
"discussion": "Discussion text.",
},
- "refinement_full_text": "",
+ "refinement_full_text": (
+ "\\documentclass{article}\n"
+ "\\usepackage{graphicx}\n"
+ "\\title{Auto Manuscript Insight}\n"
+ "\\begin{document}\n"
+ "\\maketitle\n"
+ "\\begin{abstract}Abstract text.\\end{abstract}\n"
+ "\\section{Introduction}Intro with \\cite{cite_a}.\n"
+ "\\section{Related Work}Related work with \\cite{cite_a}.\n"
+ "\\section{Method}Method text.\n"
+ "\\section{Experiments}Experiments text.\n"
+ "\\begin{figure}[t]\n"
+ "\\centering\n"
+ "\\includegraphics[width=0.9\\linewidth]{figures/fig_metric_trajectory.svg}\n"
+ "\\caption{Metric trajectory.}\n"
+ "\\label{fig:metric_trajectory}\n"
+ "\\end{figure}\n"
+ "\\section{Discussion}Discussion text.\n"
+ "\\bibliographystyle{plain}\n"
+ "\\bibliography{references}\n"
+ "\\end{document}"
+ ),
"agentreview_worklog": [],
"bibtex": "@misc{cite_a,\n title = {Verified Paper},\n author = {Author One},\n year = {2024}\n}\n",
"bib_keys": ["cite_a"],
@@ -286,7 +326,23 @@ def _stub_orchestra(self, state, literature_block, paper_ids, iterations, *, fig
@mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline")
def test_generate_submission_bundle_creates_verified_bundle_files_and_db_rows(self, run_full):
run_full.side_effect = self._stub_orchestra
- result = generate_submission_bundle(1, bundle_formats=["conference"])
+ with mock.patch(
+ "agents.manuscript_submission_enrichment.apply_venue_gates_with_retry",
+ side_effect=lambda main_tex, **_: (
+ main_tex,
+ {"pass": True, "final": {"pass": True, "template_id": "iclr2026"}},
+ ),
+ ), mock.patch(
+ "agents.manuscript_page_budget.apply_exact_page_budget",
+ side_effect=lambda main_tex, *_args, **_kwargs: (
+ main_tex,
+ {"pass": True, "main_body_pages": 9, "total_pdf_pages": 10},
+ ),
+ ), mock.patch(
+ "agents.manuscript_page_budget.page_budget_blockers",
+ return_value=[],
+ ):
+ result = generate_submission_bundle(1, bundle_formats=["conference"])
self.assertIn("manuscript_run_id", result)
self.assertEqual(result["backend"], "paper_orchestra")
bundle = database.fetchone("SELECT * FROM submission_bundles WHERE manuscript_run_id=?", (result["manuscript_run_id"],))
@@ -371,7 +427,17 @@ def test_generate_submission_bundle_blocks_benchmark_plan_without_artifact_manif
@mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline")
def test_generate_submission_bundle_blocks_placeholder_figure_assets(self, run_full):
- def _stub_with_placeholder(state, literature_block, paper_ids, iterations, *, figures_dir, baseline, metric_name):
+ def _stub_with_placeholder(
+ state,
+ literature_block,
+ paper_ids,
+ iterations,
+ *,
+ figures_dir,
+ baseline,
+ metric_name,
+ template_id=None,
+ ):
out = self._stub_orchestra(
state,
literature_block,
@@ -380,6 +446,7 @@ def _stub_with_placeholder(state, literature_block, paper_ids, iterations, *, fi
figures_dir=figures_dir,
baseline=baseline,
metric_name=metric_name,
+ template_id=template_id,
)
(figures_dir / "fig_metric_trajectory.svg").write_text(
'',
@@ -389,7 +456,23 @@ def _stub_with_placeholder(state, literature_block, paper_ids, iterations, *, fi
run_full.side_effect = _stub_with_placeholder
- result = generate_submission_bundle(1, bundle_formats=["conference"])
+ with mock.patch(
+ "agents.manuscript_submission_enrichment.apply_venue_gates_with_retry",
+ side_effect=lambda main_tex, **_: (
+ main_tex,
+ {"pass": True, "final": {"pass": True, "template_id": "iclr2026"}},
+ ),
+ ), mock.patch(
+ "agents.manuscript_page_budget.apply_exact_page_budget",
+ side_effect=lambda main_tex, *_args, **_kwargs: (
+ main_tex,
+ {"pass": True, "main_body_pages": 9, "total_pdf_pages": 10},
+ ),
+ ), mock.patch(
+ "agents.manuscript_page_budget.page_budget_blockers",
+ return_value=[],
+ ):
+ result = generate_submission_bundle(1, bundle_formats=["conference"])
self.assertIn("error", result)
self.assertIn("placeholder", " ".join(result.get("submission_blockers") or []).lower())
From 062cf5523d84e4461e0c3046733e5b4e3eefb378 Mon Sep 17 00:00:00 2001
From: Protocol Zero <257158451+Protocol-zero-0@users.noreply.github.com>
Date: Thu, 4 Jun 2026 20:44:34 +0000
Subject: [PATCH 2/7] Fix significance evidence ledger gate (#24)
---
agents/paper_completeness.py | 71 ++++++++++++++++--
tests/fixtures/m1_p_eq_one.json | 38 ++++++++++
tests/fixtures/m1_refuted.json | 38 ++++++++++
tests/fixtures/m1_significant.json | 40 ++++++++++
tests/test_paper_completeness_m1.py | 110 ++++++++++++++++++++++++++++
5 files changed, 292 insertions(+), 5 deletions(-)
create mode 100644 tests/fixtures/m1_p_eq_one.json
create mode 100644 tests/fixtures/m1_refuted.json
create mode 100644 tests/fixtures/m1_significant.json
create mode 100644 tests/test_paper_completeness_m1.py
diff --git a/agents/paper_completeness.py b/agents/paper_completeness.py
index f3b8d0e..42061d7 100644
--- a/agents/paper_completeness.py
+++ b/agents/paper_completeness.py
@@ -145,6 +145,62 @@ def _numeric(value: Any) -> float | None:
return None
+def _significance_alpha() -> float:
+    """Return the significance level from DEEPGRAPH_SIGNIFICANCE_ALPHA, falling back to 0.05."""
+    alpha = _numeric(os.environ.get("DEEPGRAPH_SIGNIFICANCE_ALPHA"))
+    if alpha is None or not 0.0 < alpha < 1.0:  # unset, unparsable, NaN, or outside the open interval (0, 1)
+        return 0.05
+    return alpha
+
+
+def build_evidence_ledger(
+    packet: dict,
+    benchmark_summary: dict,
+    *,
+    alpha: float,
+    provenance: dict | None = None,
+) -> dict:
+    """Build the evidence ledger (single source of truth for presentation evidence) from `packet` and `benchmark_summary`."""
+    packet = _as_dict(packet)
+    summary = _as_dict(benchmark_summary)
+    p_value = _numeric(packet.get("p_value"))
+    effect_size = _numeric(_first_present(packet.get("effect_size"), packet.get("effect_pct")))  # NOTE(review): effect_pct is used as a fallback without rescaling - confirm units
+    metric = _text(_first_present(packet.get("metric_name"), summary.get("primary_metric"), summary.get("metric_name")))
+    seeds = _as_list(_first_present(summary.get("seeds"), summary.get("seed_values")))
+    if not seeds:  # nothing usable under "seeds"/"seed_values": ask _seed_list for the summary's seeds
+        seeds = _seed_list(summary)
+    confidence = round(1.0 - p_value, 10) if p_value is not None else None  # 1 - p, rounded to 10 places; None without a p-value
+    return {
+        "schema_version": "1.0",
+        "run_id": _text(packet.get("run_id")),
+        "claim_id": _text(packet.get("claim_id")),
+        "metric": metric,
+        "alpha": alpha,  # stored exactly as passed by the caller
+        "verdict": _text(packet.get("verdict")),
+        "p_value": p_value,
+        "effect_size": effect_size,
+        "confidence": confidence,
+        "repro_ci": _as_list(_first_present(packet.get("repro_ci"), summary.get("repro_ci"))),  # packet value is offered before the summary's
+        "kept_ci": _as_list(_first_present(packet.get("kept_ci"), summary.get("kept_ci"))),
+        "per_method": _as_dict(summary.get("per_method")),
+        "seed_variance": _as_dict(summary.get("seed_variance")),
+        "seeds": seeds,
+        "provenance": _as_dict(provenance),
+    }
+
+
+def _ledger_supports_significance(ledger: dict[str, Any]) -> bool:  # True only when p < alpha AND the verdict is positive
+    p_value = _numeric(ledger.get("p_value"))
+    alpha = _numeric(ledger.get("alpha"))
+    verdict = _lower(ledger.get("verdict"))
+    return bool(
+        p_value is not None
+        and alpha is not None
+        and p_value < alpha  # strict comparison: p == alpha is not significant
+        and verdict in {"confirmed", "reproduced"}  # any other verdict fails, however small p is
+    )
+
+
def _infer_model_size(name: str) -> str:
match = re.search(r"(\d+(?:\.\d+)?\s*[bB])", name or "")
return match.group(1).replace(" ", "") if match else ""
@@ -586,9 +642,9 @@ def build_claim_evidence_matrix(state: dict[str, Any], manifest: dict[str, Any])
packet, summary, _artifact_manifest, _contract = _packet_parts(state)
per_method = _as_dict(summary.get("per_method"))
seeds = _as_list(manifest.get("seeds"))
- p_text = _lower(manifest.get("statistical_tests"))
has_multi_seed = len(seeds) >= 3
- has_significance = "p=" in p_text or "bootstrap" in p_text or "permutation" in p_text
+ ledger = build_evidence_ledger(packet, summary, alpha=_significance_alpha())
+ has_significance = _ledger_supports_significance(ledger)
rows = [
{
"claim": "Improves utility",
@@ -634,8 +690,8 @@ def build_claim_evidence_matrix(state: dict[str, Any], manifest: dict[str, Any])
"claim": _text(packet.get("claim_text"))[:280],
"required_evidence": "mapped quantitative artifact in result_packet or benchmark_summary",
"current_evidence": "present" if len(per_method) >= 2 else "missing benchmark comparison",
- "can_appear_in_abstract": bool(len(per_method) >= 2 and has_multi_seed),
- "allowed_sections": ["Abstract", "Introduction", "Conclusion"] if len(per_method) >= 2 and has_multi_seed else ["Motivation", "Limitations"],
+ "can_appear_in_abstract": bool(len(per_method) >= 2 and has_multi_seed and has_significance),
+ "allowed_sections": ["Abstract", "Introduction", "Conclusion"] if len(per_method) >= 2 and has_multi_seed and has_significance else ["Motivation", "Limitations"],
},
)
return rows
@@ -742,6 +798,7 @@ def build_reviewer_report(
blockers: list[str],
) -> dict[str, Any]:
packet, summary, _artifact_manifest, _contract = _packet_parts(state)
+ ledger = build_evidence_ledger(packet, summary, alpha=_significance_alpha())
answers: list[dict[str, Any]] = []
def add(question: str, yes: bool, evidence: str) -> None:
@@ -750,7 +807,11 @@ def add(question: str, yes: bool, evidence: str) -> None:
add("What is the exact dataset?", bool(_as_list(manifest.get("datasets")) and not any("dataset" in b.lower() for b in blockers[:3])), str(manifest.get("datasets") or "missing"))
add("What is the exact model?", bool(_as_list(manifest.get("models")) and not any("model" in b.lower() for b in blockers)), str(manifest.get("models") or "missing"))
add("What is the exact baseline?", len(_as_list(manifest.get("baselines"))) >= 2, ", ".join(_as_list(manifest.get("baselines"))))
- add("Is the improvement statistically significant?", bool(_text(manifest.get("statistical_tests")) and not any("statistical" in b.lower() for b in blockers)), _text(manifest.get("statistical_tests")))
+ add(
+ "Is the improvement statistically significant?",
+ bool(_ledger_supports_significance(ledger) and not any("statistical" in b.lower() for b in blockers)),
+ f"p={ledger.get('p_value')}; alpha={ledger.get('alpha')}; verdict={ledger.get('verdict')}",
+ )
add("Is there more than one benchmark?", len(_as_list(manifest.get("datasets"))) > 1, str(len(_as_list(manifest.get("datasets")))))
add("Are compute savings actually measured?", bool(manifest.get("latency") or manifest.get("token_cost")), "latency/token payload present" if (manifest.get("latency") or manifest.get("token_cost")) else "missing")
add("Is the proposed gate better than confidence/disagreement routing?", any("Beats confidence" in row.get("claim", "") and row.get("can_appear_in_abstract") for row in claim_matrix), "claim-evidence matrix")
diff --git a/tests/fixtures/m1_p_eq_one.json b/tests/fixtures/m1_p_eq_one.json
new file mode 100644
index 0000000..099a315
--- /dev/null
+++ b/tests/fixtures/m1_p_eq_one.json
@@ -0,0 +1,38 @@
+{
+ "state": {
+ "result_packet": {
+ "run_id": "run-m1-p-one",
+ "claim_id": "claim-m1-p-one",
+ "claim_text": "The method improves utility.",
+ "metric_name": "exact_match",
+ "verdict": "inconclusive",
+ "p_value": 1.0,
+ "effect_size": 0.0,
+ "benchmark_summary": {
+ "primary_metric": "exact_match",
+ "per_method": {
+ "Candidate": {"exact_match": 0.705, "n": 200},
+ "Baseline": {"exact_match": 0.655, "n": 200}
+ },
+ "seed_variance": {
+ "Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}}
+ },
+ "seeds": [0, 1, 2],
+ "statistical_tests": "p=1.0000"
+ }
+ }
+ },
+ "manifest": {
+ "datasets": [{"name": "GSM8K", "split": "test", "num_test": 200}],
+ "models": [{"name": "Qwen", "prompt_template": "qa", "decoding": "greedy"}],
+ "baselines": ["Baseline", "Candidate"],
+ "metrics": ["exact_match"],
+ "seeds": [0, 1, 2],
+ "hardware": "CPU test fixture",
+ "statistical_tests": "p=1.0000",
+ "latency": {"per_method": {"Candidate": 1.0, "Baseline": 0.8}},
+ "token_cost": {"per_method": {"Candidate": 100, "Baseline": 80}},
+ "ablation": [{"name": "no_guard"}],
+ "artifacts": {"benchmark_summary": "benchmark_summary.json"}
+ }
+}
diff --git a/tests/fixtures/m1_refuted.json b/tests/fixtures/m1_refuted.json
new file mode 100644
index 0000000..a0743e6
--- /dev/null
+++ b/tests/fixtures/m1_refuted.json
@@ -0,0 +1,38 @@
+{
+ "state": {
+ "result_packet": {
+ "run_id": "run-m1-refuted",
+ "claim_id": "claim-m1-refuted",
+ "claim_text": "The method improves utility.",
+ "metric_name": "exact_match",
+ "verdict": "refuted",
+ "p_value": 0.0123,
+ "effect_size": -0.01,
+ "benchmark_summary": {
+ "primary_metric": "exact_match",
+ "per_method": {
+ "Candidate": {"exact_match": 0.605, "n": 200},
+ "Baseline": {"exact_match": 0.655, "n": 200}
+ },
+ "seed_variance": {
+ "Candidate": {"mean": 0.605, "std": 0.012, "per_seed": {"0": 0.61, "1": 0.60, "2": 0.605}}
+ },
+ "seeds": [0, 1, 2],
+ "statistical_tests": "paired permutation p=0.0123"
+ }
+ }
+ },
+ "manifest": {
+ "datasets": [{"name": "GSM8K", "split": "test", "num_test": 200}],
+ "models": [{"name": "Qwen", "prompt_template": "qa", "decoding": "greedy"}],
+ "baselines": ["Baseline", "Candidate"],
+ "metrics": ["exact_match"],
+ "seeds": [0, 1, 2],
+ "hardware": "CPU test fixture",
+ "statistical_tests": "paired permutation p=0.0123",
+ "latency": {"per_method": {"Candidate": 1.0, "Baseline": 0.8}},
+ "token_cost": {"per_method": {"Candidate": 100, "Baseline": 80}},
+ "ablation": [{"name": "no_guard"}],
+ "artifacts": {"benchmark_summary": "benchmark_summary.json"}
+ }
+}
diff --git a/tests/fixtures/m1_significant.json b/tests/fixtures/m1_significant.json
new file mode 100644
index 0000000..3da4701
--- /dev/null
+++ b/tests/fixtures/m1_significant.json
@@ -0,0 +1,40 @@
+{
+ "state": {
+ "result_packet": {
+ "run_id": "run-m1-significant",
+ "claim_id": "claim-m1-significant",
+ "claim_text": "The method improves utility.",
+ "metric_name": "exact_match",
+ "verdict": "confirmed",
+ "p_value": 0.0123,
+ "effect_size": 0.045,
+ "benchmark_summary": {
+ "primary_metric": "exact_match",
+ "per_method": {
+ "Candidate": {"exact_match": 0.705, "n": 200},
+ "Baseline": {"exact_match": 0.655, "n": 200}
+ },
+ "seed_variance": {
+ "Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}}
+ },
+ "seeds": [0, 1, 2],
+ "repro_ci": [0.01, 0.08],
+ "kept_ci": [0.0, 0.07],
+ "statistical_tests": "paired permutation p=0.0123"
+ }
+ }
+ },
+ "manifest": {
+ "datasets": [{"name": "GSM8K", "split": "test", "num_test": 200}],
+ "models": [{"name": "Qwen", "prompt_template": "qa", "decoding": "greedy"}],
+ "baselines": ["Baseline", "Candidate"],
+ "metrics": ["exact_match"],
+ "seeds": [0, 1, 2],
+ "hardware": "CPU test fixture",
+ "statistical_tests": "paired permutation p=0.0123",
+ "latency": {"per_method": {"Candidate": 1.0, "Baseline": 0.8}},
+ "token_cost": {"per_method": {"Candidate": 100, "Baseline": 80}},
+ "ablation": [{"name": "no_guard"}],
+ "artifacts": {"benchmark_summary": "benchmark_summary.json"}
+ }
+}
diff --git a/tests/test_paper_completeness_m1.py b/tests/test_paper_completeness_m1.py
new file mode 100644
index 0000000..170de8f
--- /dev/null
+++ b/tests/test_paper_completeness_m1.py
@@ -0,0 +1,110 @@
+import json
+from pathlib import Path
+
+from agents.paper_completeness import (
+ audit_evidence_completeness,
+ build_claim_evidence_matrix,
+ build_evidence_ledger,
+ build_reviewer_report,
+)
+
+
+FIXTURES = Path(__file__).parent / "fixtures"
+
+
+def _load_fixture(name: str) -> dict:
+ return json.loads((FIXTURES / name).read_text(encoding="utf-8"))
+
+
+def _improves_row(matrix: list[dict]) -> dict:
+ return next(row for row in matrix if row["claim"] == "Improves utility")
+
+
+def _claim_row(matrix: list[dict], claim: str) -> dict:
+ return next(row for row in matrix if row["claim"].startswith(claim))
+
+
+def _significance_answer(report: dict) -> dict:
+ return next(
+ row
+ for row in report["checklist"]
+ if row["question"] == "Is the improvement statistically significant?"
+ )
+
+
+def test_m1_p_eq_one_is_not_significant_even_with_p_text():
+ payload = _load_fixture("m1_p_eq_one.json")
+ matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"])
+
+ assert _improves_row(matrix)["can_appear_in_abstract"] is False
+ assert _claim_row(matrix, "The method improves utility")["can_appear_in_abstract"] is False
+
+
+def test_m1_confirmed_p_below_default_alpha_can_appear():
+ payload = _load_fixture("m1_significant.json")
+ matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"])
+
+ assert _improves_row(matrix)["can_appear_in_abstract"] is True
+
+
+def test_m1_refuted_verdict_blocks_abstract_claim_even_with_low_p():
+ payload = _load_fixture("m1_refuted.json")
+ matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"])
+
+ assert _improves_row(matrix)["can_appear_in_abstract"] is False
+ assert _claim_row(matrix, "The method improves utility")["can_appear_in_abstract"] is False
+
+
+def test_m1_alpha_env_override_controls_significance(monkeypatch):
+ payload = _load_fixture("m1_significant.json")
+ payload["state"]["result_packet"]["p_value"] = 0.049
+ payload["manifest"]["statistical_tests"] = "p=0.049"
+ monkeypatch.setenv("DEEPGRAPH_SIGNIFICANCE_ALPHA", "0.01")
+
+ matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"])
+
+ assert _improves_row(matrix)["can_appear_in_abstract"] is False
+
+
+def test_m1_missing_p_value_is_not_significant_and_does_not_crash():
+ payload = _load_fixture("m1_significant.json")
+ payload["state"]["result_packet"]["p_value"] = None
+ payload["manifest"]["statistical_tests"] = ""
+
+ matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"])
+
+ assert _improves_row(matrix)["can_appear_in_abstract"] is False
+
+
+def test_m1_reviewer_significance_answer_uses_numeric_p_but_preserves_existence_gate():
+ payload = _load_fixture("m1_p_eq_one.json")
+ matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"])
+ report = build_reviewer_report(payload["state"], payload["manifest"], matrix, blockers=[])
+
+ assert _significance_answer(report)["answer"] == "No"
+ audit = audit_evidence_completeness(payload["state"])
+ assert not any("Statistical test or confidence interval" in blocker for blocker in audit["blockers"])
+
+
+def test_m1_build_evidence_ledger_minimal_schema():
+ payload = _load_fixture("m1_significant.json")
+ packet = payload["state"]["result_packet"]
+ summary = packet["benchmark_summary"]
+
+ ledger = build_evidence_ledger(
+ packet,
+ summary,
+ alpha=0.01,
+ provenance={"command": "pytest tests/test_paper_completeness_m1.py"},
+ )
+
+ assert ledger["schema_version"] == "1.0"
+ assert ledger["alpha"] == 0.01
+ assert ledger["verdict"] == "confirmed"
+ assert ledger["p_value"] == 0.0123
+ assert ledger["effect_size"] == 0.045
+ assert ledger["confidence"] == 0.9877
+ assert ledger["per_method"]["Candidate"]["exact_match"] == 0.705
+ assert ledger["seed_variance"]["Candidate"]["per_seed"] == {"0": 0.71, "1": 0.7, "2": 0.705}
+ assert ledger["seeds"] == [0, 1, 2]
+ assert ledger["provenance"]["command"].startswith("pytest")
From 28b2c6355f25bb2c24b81c437039f75fb6629111 Mon Sep 17 00:00:00 2001
From: Protocol Zero <257158451+Protocol-zero-0@users.noreply.github.com>
Date: Thu, 4 Jun 2026 20:47:53 +0000
Subject: [PATCH 3/7] Add EvidenceLedger traceability checks (#27)
---
agents/paper_completeness.py | 146 +++++++++++++++++++
tests/fixtures/m4_clean.tex | 8 +
tests/fixtures/m4_clean_ledger.json | 22 +++
tests/fixtures/m4_method_only.tex | 10 ++
tests/fixtures/m4_missing_p_ledger.json | 21 +++
tests/fixtures/m4_refuted_ledger.json | 22 +++
tests/fixtures/m4_refuted_positive_claim.tex | 8 +
tests/fixtures/m4_unsourced_number.tex | 8 +
tests/fixtures/m4_whitelist_numbers.tex | 8 +
tests/test_paper_completeness_m4.py | 58 ++++++++
10 files changed, 311 insertions(+)
create mode 100644 tests/fixtures/m4_clean.tex
create mode 100644 tests/fixtures/m4_clean_ledger.json
create mode 100644 tests/fixtures/m4_method_only.tex
create mode 100644 tests/fixtures/m4_missing_p_ledger.json
create mode 100644 tests/fixtures/m4_refuted_ledger.json
create mode 100644 tests/fixtures/m4_refuted_positive_claim.tex
create mode 100644 tests/fixtures/m4_unsourced_number.tex
create mode 100644 tests/fixtures/m4_whitelist_numbers.tex
create mode 100644 tests/test_paper_completeness_m4.py
diff --git a/agents/paper_completeness.py b/agents/paper_completeness.py
index 42061d7..6e72905 100644
--- a/agents/paper_completeness.py
+++ b/agents/paper_completeness.py
@@ -201,6 +201,152 @@ def _ledger_supports_significance(ledger: dict[str, Any]) -> bool:
)
+EVIDENCE_LEDGER_REQUIRED_FIELDS = (
+ "schema_version",
+ "run_id",
+ "claim_id",
+ "metric",
+ "alpha",
+ "verdict",
+ "p_value",
+ "effect_size",
+ "confidence",
+ "repro_ci",
+ "kept_ci",
+ "per_method",
+ "seed_variance",
+ "seeds",
+ "provenance",
+)
+
+TRACEABILITY_POSITIVE_TERMS = (
+ "improves",
+ "improve",
+ "outperforms",
+ "outperform",
+ "significant",
+ "significantly",
+ "superior",
+ "beats",
+ "surpasses",
+)
+
+TRACEABILITY_NUMBER_RE = re.compile(r"(?<![\w.])-?\d+\.?\d*%?")  # lookbehind skips digits glued to identifiers (GSM8K, smith2024) and range hyphens (0.01-0.08)
+
+
+def validate_evidence_ledger(ledger: dict) -> list[dict]:
+    """Return one "schema_error" violation per EVIDENCE_LEDGER_REQUIRED_FIELDS key that is absent from `ledger`."""
+    ledger = _as_dict(ledger)
+    violations: list[dict] = []
+    for field in EVIDENCE_LEDGER_REQUIRED_FIELDS:
+        if field not in ledger:  # presence check only: a key that is present but holds None still passes
+            violations.append(
+                {
+                    "rule": "schema_error",
+                    "location": {"section": "ledger", "line": 0},  # ledger-level issue, so the line is fixed at 0
+                    "snippet": f"missing required field: {field}",
+                    "value": field,
+                }
+            )
+    return violations
+
+
+def _traceable_sections(main_tex: str) -> list[tuple[str, str, int]]:
+    """Return (section name, body text, body start offset) for the abstract and each Conclusion(s)/Discussion section."""
+    text = main_tex or ""
+    sections: list[tuple[str, str, int]] = []
+    abstract = re.search(r"\\begin\{abstract\}(.*?)\\end\{abstract\}", text, re.DOTALL | re.IGNORECASE)
+    if abstract:
+        sections.append(("abstract", abstract.group(1), abstract.start(1)))
+    matches = list(re.finditer(r"\\section\*?\{([^}]*)\}", text, re.IGNORECASE))  # \*? also accepts unnumbered \section*{...}
+    for idx, match in enumerate(matches):
+        name = _lower(match.group(1))
+        if name not in {"conclusion", "conclusions", "discussion"}:
+            continue
+        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)  # body runs to the next \section or end of text
+        sections.append(("discussion" if name == "discussion" else "conclusion", text[match.end():end], match.end()))
+    return sections
+
+
+def _ledger_numeric_sources(ledger: dict[str, Any]) -> list[float]:  # ledger values a manuscript number may be matched against
+    sources: list[float] = []
+
+    def add(value: Any) -> None:  # keep only values _numeric can turn into a number
+        numeric = _numeric(value)
+        if numeric is not None:
+            sources.append(numeric)
+
+    add(ledger.get("p_value"))
+    add(ledger.get("effect_size"))
+    for value in _as_list(ledger.get("repro_ci")) + _as_list(ledger.get("kept_ci")):
+        add(value)
+    metric = _text(ledger.get("metric"))
+    for payload in _as_dict(ledger.get("per_method")).values():
+        row = _as_dict(payload)
+        if metric:
+            add(row.get(metric))  # only the ledger's own metric key is read; other keys in the row are ignored
+    for payload in _as_dict(ledger.get("seed_variance")).values():
+        add(_as_dict(payload).get("mean"))  # only "mean" is collected from each seed_variance entry
+    return sources  # alpha, confidence and seeds are never added here
+
+
+def _traceable_number(token: str, sources: list[float]) -> bool:  # does `token` equal some value in `sources`?
+    is_percent = token.endswith("%")
+    numeric = _numeric(token[:-1] if is_percent else token)
+    if numeric is None:
+        return True  # _numeric produced no value: the token is let through rather than reported
+    if is_percent:
+        numeric /= 100.0  # "70.5%" is compared as 0.705
+    return any(abs(numeric - source) <= 1e-6 for source in sources)  # absolute tolerance of 1e-6
+
+
+def _skip_traceability_number(line: str, start: int, end: int) -> bool:
+    """Return True when line[start:end] is a Table/Figure/Section label number or a seed count ("5 seeds")."""
+    # "~" is LaTeX's non-breaking tie, the usual separator in "Table~2" / "Figure~3".
+    if re.search(r"(?:Table|Figure|Section)[\s~]*$", line[:start], re.IGNORECASE):
+        return True
+    if re.match(r"\s*seeds?\b", line[end:], re.IGNORECASE):
+        return True
+    return False
+
+
+def assert_traceable(main_tex: str, ledger: dict) -> list[dict]:
+    """Return ledger schema violations plus traceability violations found in the Abstract, Conclusion(s) and Discussion."""
+    ledger = _as_dict(ledger)
+    violations = validate_evidence_ledger(ledger)
+    sources = _ledger_numeric_sources(ledger)
+    negative_verdict = _lower(ledger.get("verdict")) in {"refuted", "inconclusive"}
+    positive_re = re.compile(r"\b(" + "|".join(re.escape(term) for term in TRACEABILITY_POSITIVE_TERMS) + r")\b", re.IGNORECASE)
+    text = main_tex or ""
+
+    for section, body, offset in _traceable_sections(text):
+        for rel_line, line in enumerate(body.splitlines(), start=1):
+            absolute_line = text.count("\n", 0, offset) + rel_line  # 1-based line in main_tex: newlines before the body + index within it
+            if negative_verdict and positive_re.search(line):  # at most one violation per line, however many terms match
+                violations.append(
+                    {
+                        "rule": "positive_claim_with_negative_verdict",
+                        "location": {"section": section, "line": absolute_line},
+                        "snippet": line.strip(),
+                        "value": ledger.get("verdict"),
+                    }
+                )
+            for match in TRACEABILITY_NUMBER_RE.finditer(line):
+                if _skip_traceability_number(line, match.start(), match.end()):
+                    continue
+                token = match.group(0)
+                if not _traceable_number(token, sources):
+                    violations.append(
+                        {
+                            "rule": "unsourced_number",
+                            "location": {"section": section, "line": absolute_line},
+                            "snippet": line.strip(),
+                            "value": token[:-1] if token.endswith("%") else token,  # reported without the trailing "%"
+                        }
+                    )
+    return violations
+
+
def _infer_model_size(name: str) -> str:
match = re.search(r"(\d+(?:\.\d+)?\s*[bB])", name or "")
return match.group(1).replace(" ", "") if match else ""
diff --git a/tests/fixtures/m4_clean.tex b/tests/fixtures/m4_clean.tex
new file mode 100644
index 0000000..df2b41e
--- /dev/null
+++ b/tests/fixtures/m4_clean.tex
@@ -0,0 +1,8 @@
+\documentclass{article}
+\begin{document}
+\begin{abstract}
+Candidate reaches 70.5% exact match with p=0.0123 and effect 0.045.
+\end{abstract}
+\section{Conclusion}
+The reproduced mean is 70.5% with interval endpoints 0.01 and 0.08.
+\end{document}
diff --git a/tests/fixtures/m4_clean_ledger.json b/tests/fixtures/m4_clean_ledger.json
new file mode 100644
index 0000000..875c817
--- /dev/null
+++ b/tests/fixtures/m4_clean_ledger.json
@@ -0,0 +1,22 @@
+{
+ "schema_version": "1.0",
+ "run_id": "run-m4-clean",
+ "claim_id": "claim-m4-clean",
+ "metric": "exact_match",
+ "alpha": 0.05,
+ "verdict": "confirmed",
+ "p_value": 0.0123,
+ "effect_size": 0.045,
+ "confidence": 0.9877,
+ "repro_ci": [0.01, 0.08],
+ "kept_ci": [0.0, 0.07],
+ "per_method": {
+ "Candidate": {"exact_match": 0.705, "n": 200},
+ "Baseline": {"exact_match": 0.655, "n": 200}
+ },
+ "seed_variance": {
+ "Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}}
+ },
+ "seeds": [0, 1, 2],
+ "provenance": {"code_commit": "abc123", "dataset": "GSM8K", "command": "pytest"}
+}
diff --git a/tests/fixtures/m4_method_only.tex b/tests/fixtures/m4_method_only.tex
new file mode 100644
index 0000000..924f5e2
--- /dev/null
+++ b/tests/fixtures/m4_method_only.tex
@@ -0,0 +1,10 @@
+\documentclass{article}
+\begin{document}
+\begin{abstract}
+This paper describes the experiment.
+\end{abstract}
+\section{Method}
+The method section mentions a calibration constant 0.777 that is not in the ledger.
+\section{Conclusion}
+The conclusion contains no numeric claim.
+\end{document}
diff --git a/tests/fixtures/m4_missing_p_ledger.json b/tests/fixtures/m4_missing_p_ledger.json
new file mode 100644
index 0000000..269c894
--- /dev/null
+++ b/tests/fixtures/m4_missing_p_ledger.json
@@ -0,0 +1,21 @@
+{
+ "schema_version": "1.0",
+ "run_id": "run-m4-missing-p",
+ "claim_id": "claim-m4-missing-p",
+ "metric": "exact_match",
+ "alpha": 0.05,
+ "verdict": "confirmed",
+ "effect_size": 0.045,
+ "confidence": 0.99,
+ "repro_ci": [0.02, 0.08],
+ "kept_ci": [0.0, 0.07],
+ "per_method": {
+ "Candidate": {"exact_match": 0.705, "n": 200},
+ "Baseline": {"exact_match": 0.655, "n": 200}
+ },
+ "seed_variance": {
+ "Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}}
+ },
+ "seeds": [0, 1, 2],
+ "provenance": {"code_commit": "abc123", "dataset": "GSM8K", "command": "pytest"}
+}
diff --git a/tests/fixtures/m4_refuted_ledger.json b/tests/fixtures/m4_refuted_ledger.json
new file mode 100644
index 0000000..69ccf56
--- /dev/null
+++ b/tests/fixtures/m4_refuted_ledger.json
@@ -0,0 +1,22 @@
+{
+ "schema_version": "1.0",
+ "run_id": "run-m4-refuted",
+ "claim_id": "claim-m4-refuted",
+ "metric": "exact_match",
+ "alpha": 0.05,
+ "verdict": "refuted",
+ "p_value": 0.0123,
+ "effect_size": -0.01,
+ "confidence": 0.9877,
+ "repro_ci": [-0.08, -0.01],
+ "kept_ci": [-0.07, 0.0],
+ "per_method": {
+ "Candidate": {"exact_match": 0.605, "n": 200},
+ "Baseline": {"exact_match": 0.655, "n": 200}
+ },
+ "seed_variance": {
+ "Candidate": {"mean": 0.605, "std": 0.012, "per_seed": {"0": 0.61, "1": 0.60, "2": 0.605}}
+ },
+ "seeds": [0, 1, 2],
+ "provenance": {"code_commit": "abc123", "dataset": "GSM8K", "command": "pytest"}
+}
diff --git a/tests/fixtures/m4_refuted_positive_claim.tex b/tests/fixtures/m4_refuted_positive_claim.tex
new file mode 100644
index 0000000..9d7bced
--- /dev/null
+++ b/tests/fixtures/m4_refuted_positive_claim.tex
@@ -0,0 +1,8 @@
+\documentclass{article}
+\begin{document}
+\begin{abstract}
+Candidate significantly improves exact match over the baseline.
+\end{abstract}
+\section{Conclusion}
+The method outperforms the baseline.
+\end{document}
diff --git a/tests/fixtures/m4_unsourced_number.tex b/tests/fixtures/m4_unsourced_number.tex
new file mode 100644
index 0000000..9683f9a
--- /dev/null
+++ b/tests/fixtures/m4_unsourced_number.tex
@@ -0,0 +1,8 @@
+\documentclass{article}
+\begin{document}
+\begin{abstract}
+Candidate significantly improves exact match with p=0.01.
+\end{abstract}
+\section{Conclusion}
+The conclusion repeats p=0.01.
+\end{document}
diff --git a/tests/fixtures/m4_whitelist_numbers.tex b/tests/fixtures/m4_whitelist_numbers.tex
new file mode 100644
index 0000000..8c00fa8
--- /dev/null
+++ b/tests/fixtures/m4_whitelist_numbers.tex
@@ -0,0 +1,8 @@
+\documentclass{article}
+\begin{document}
+\begin{abstract}
+Table 2 summarizes the result over 5 seeds.
+\end{abstract}
+\section{Discussion}
+Figure 3 shows the setup.
+\end{document}
diff --git a/tests/test_paper_completeness_m4.py b/tests/test_paper_completeness_m4.py
new file mode 100644
index 0000000..5722f84
--- /dev/null
+++ b/tests/test_paper_completeness_m4.py
@@ -0,0 +1,58 @@
+import json
+from pathlib import Path
+
+from agents.paper_completeness import assert_traceable, validate_evidence_ledger
+
+
+# Directory (next to this test module) that the _tex/_ledger helpers below read fixture files from.
+FIXTURES = Path(__file__).parent / "fixtures"
+
+
+def _tex(name: str) -> str:
+    """Return the UTF-8 decoded contents of the fixture file called ``name``."""
+    fixture_path = FIXTURES / name
+    return fixture_path.read_text(encoding="utf-8")
+
+
+def _ledger(name: str) -> dict:
+    """Read the fixture file called ``name`` as UTF-8 and return the decoded JSON value."""
+    raw_json = (FIXTURES / name).read_text(encoding="utf-8")
+    return json.loads(raw_json)
+
+
+def test_m4_unsourced_abstract_number_reports_violation_when_ledger_lacks_value():
+    """Expect an ``unsourced_number`` violation carrying the value ``0.01`` from the fixture text."""
+    tex = _tex("m4_unsourced_number.tex")
+    ledger = _ledger("m4_missing_p_ledger.json")
+
+    found = assert_traceable(tex, ledger)
+
+    assert any(item["rule"] == "unsourced_number" and item["value"] == "0.01" for item in found)
+
+
+def test_m4_refuted_verdict_blocks_positive_abstract_and_conclusion_claims():
+    """Expect a positive-claim violation and violations located in both abstract and conclusion."""
+    tex = _tex("m4_refuted_positive_claim.tex")
+    ledger = _ledger("m4_refuted_ledger.json")
+
+    found = assert_traceable(tex, ledger)
+
+    assert any(item["rule"] == "positive_claim_with_negative_verdict" for item in found)
+    sections = {item["location"]["section"] for item in found}
+    assert {"abstract", "conclusion"}.issubset(sections)
+
+
+def test_m4_clean_abstract_and_conclusion_numbers_are_traceable_with_percent_normalization():
+    """The clean fixture checked against the clean ledger must yield an empty violation list."""
+    tex = _tex("m4_clean.tex")
+    ledger = _ledger("m4_clean_ledger.json")
+
+    assert assert_traceable(tex, ledger) == []
+
+
+def test_m4_numbers_outside_abstract_and_conclusion_are_allowed():
+    """The method-only fixture checked against the clean ledger must yield an empty violation list."""
+    tex = _tex("m4_method_only.tex")
+    ledger = _ledger("m4_clean_ledger.json")
+
+    assert assert_traceable(tex, ledger) == []
+
+
+def test_m4_schema_validation_reports_missing_required_field():
+    """Dropping ``verdict`` from the clean ledger must surface a ``schema_error`` naming that field."""
+    incomplete = _ledger("m4_clean_ledger.json")
+    del incomplete["verdict"]
+
+    found = validate_evidence_ledger(incomplete)
+
+    assert any(item["rule"] == "schema_error" and item["value"] == "verdict" for item in found)
+
+
+def test_m4_table_figure_and_seed_counts_are_not_unsourced_numbers():
+    """The whitelist fixture ("Table 2", "5 seeds", "Figure 3") must yield an empty violation list."""
+    tex = _tex("m4_whitelist_numbers.tex")
+    ledger = _ledger("m4_clean_ledger.json")
+
+    assert assert_traceable(tex, ledger) == []
From 9cfc1cbfcbb4b85bd2a9a05ca87a528c5a26b3d4 Mon Sep 17 00:00:00 2001
From: Protocol Zero <257158451+Protocol-zero-0@users.noreply.github.com>
Date: Thu, 4 Jun 2026 20:52:17 +0000
Subject: [PATCH 4/7] Add deterministic LaTeX sanity checks (#25)
---
agents/paper_completeness.py | 99 ++++++++++++++++++++++++++++-
tests/fixtures/m2a_clean.tex | 7 ++
tests/fixtures/m2a_unresolved.tex | 4 ++
tests/fixtures/m2b_clean.tex | 6 ++
tests/fixtures/m2b_scaffold.tex | 6 ++
tests/fixtures/m2c_clean.tex | 7 ++
tests/fixtures/m2c_contaminated.tex | 7 ++
tests/fixtures/m2d_clean.tex | 6 ++
tests/fixtures/m2d_repeat.tex | 7 ++
tests/test_latex_sanity_m2.py | 71 +++++++++++++++++++++
10 files changed, 219 insertions(+), 1 deletion(-)
create mode 100644 tests/fixtures/m2a_clean.tex
create mode 100644 tests/fixtures/m2a_unresolved.tex
create mode 100644 tests/fixtures/m2b_clean.tex
create mode 100644 tests/fixtures/m2b_scaffold.tex
create mode 100644 tests/fixtures/m2c_clean.tex
create mode 100644 tests/fixtures/m2c_contaminated.tex
create mode 100644 tests/fixtures/m2d_clean.tex
create mode 100644 tests/fixtures/m2d_repeat.tex
create mode 100644 tests/test_latex_sanity_m2.py
diff --git a/agents/paper_completeness.py b/agents/paper_completeness.py
index 6e72905..4d3e4ab 100644
--- a/agents/paper_completeness.py
+++ b/agents/paper_completeness.py
@@ -36,6 +36,9 @@
"outside the verified claim",
"missing generated figure",
"diagram placeholder",
+ "fixme",
+ " dict[str, Any]:
if isinstance(value, Mapping):
@@ -1033,10 +1051,89 @@ def audit_evidence_completeness(state: dict[str, Any]) -> dict[str, Any]:
}
-def latex_sanity_check(text: str) -> dict[str, Any]:
+def _line_hit(rule: str, value: str, line_no: int, line: str, *, kind: str = "deterministic") -> dict[str, Any]:
+    """Build one sanity-check hit record.
+
+    The record holds ``kind``, ``rule``, ``value``, a ``location`` mapping with
+    the 1-based ``line`` number as passed in, and ``snippet`` (``line`` with
+    surrounding whitespace stripped).
+    """
+    location = {"line": line_no}
+    snippet = line.strip()
+    return dict(kind=kind, rule=rule, value=value, location=location, snippet=snippet)
+
+
+def _strip_latex_code_blocks(text: str) -> str:
+    """Blank out code-like regions so later checks do not flag their contents.
+
+    Two kinds of region are removed: ``verbatim``/``lstlisting``/``minted``
+    environments (matched case-insensitively) and triple-backtick fenced blocks.
+    Each removed region is replaced by exactly as many newlines as it contained,
+    so line numbers computed on the result still match the input text.
+
+    Args:
+        text: LaTeX source; ``None`` or an empty string is treated as ``""``.
+
+    Returns:
+        The text with code regions blanked and the total line count unchanged.
+    """
+
+    def _keep_newlines(match: "re.Match[str]") -> str:
+        # Dropping the newlines would shift every later line number upward.
+        return "\n" * match.group(0).count("\n")
+
+    # The backreference makes \end{...} close the same environment that was opened,
+    # so a literal "\end{lstlisting}" inside a verbatim block does not end the strip early.
+    stripped = re.sub(
+        r"\\begin\{(verbatim|lstlisting|minted)\}.*?\\end\{\1\}",
+        _keep_newlines,
+        text or "",
+        flags=re.DOTALL | re.IGNORECASE,
+    )
+    return re.sub(r"```.*?```", _keep_newlines, stripped, flags=re.DOTALL)
+
+
+def _deterministic_latex_hits(text: str, state: dict[str, Any] | None = None) -> list[dict[str, Any]]:
+    """Run the regex-based LaTeX sanity rules and return one hit record per finding.
+
+    Code regions are removed with ``_strip_latex_code_blocks`` before scanning.
+    Hits are emitted in this order:
+
+    * ``unresolved_reference``: a line containing ``??``, or a ``ref``/``cite*``
+      command whose key is empty, contains ``?``, or is todo/tbd/placeholder.
+    * ``template_placeholder``: a ``{{name}}`` token that does not directly
+      follow a LaTeX command name or a closing brace.
+    * ``cross_run_identity``: only when ``state`` provides ``method_name``.
+      Acronym/CamelCase tokens found in titles, the preamble and captions that
+      are neither in ``METHOD_TOKEN_WHITELIST`` nor part of the method name.
+    * ``boilerplate_repetition``: a normalised sentence of at least eight words
+      occurring ``BOILERPLATE_REPEAT_THRESHOLD`` times or more.
+
+    Args:
+        text: LaTeX source to scan; falsy values are treated as ``""``.
+        state: Optional run state, normalised through ``_as_dict``; only
+            ``method_name`` is read here.
+
+    Returns:
+        A list of hit dicts built by ``_line_hit``.
+    """
+    state = _as_dict(state)
+    scan_text = _strip_latex_code_blocks(text or "")
+    scan_lines = scan_text.splitlines()
+    hits: list[dict[str, Any]] = []
+    for line_no, line in enumerate(scan_lines, start=1):
+        # "Table ??" and "Figure ??" both contain "??", so one substring test covers them.
+        if "??" in line:
+            hits.append(_line_hit("unresolved_reference", "??", line_no, line))
+        for match in re.finditer(r"\\(?:ref|cite[a-zA-Z*]*)\{([^}]*)\}", line):
+            key = match.group(1).strip()
+            if not key or "?" in key or _lower(key) in {"todo", "tbd", "placeholder"}:
+                hits.append(_line_hit("unresolved_reference", match.group(0), line_no, line))
+        for match in re.finditer(r"\{\{[a-z_][a-z0-9_]*\}\}", line):
+            prefix = line[: match.start()]
+            # Doubled braces right after a command or a closing brace are ordinary LaTeX
+            # grouping (e.g. \frac{{a}}{{b}}), not an unfilled template slot.
+            if re.search(r"\\[A-Za-z]+\s*$", prefix) or prefix.rstrip().endswith("}"):
+                continue
+            hits.append(_line_hit("template_placeholder", match.group(0), line_no, line))
+
+    method_name = _text(state.get("method_name"))
+    if method_name:
+        # One compiled pattern serves both the allow-list and the scan.
+        token_re = re.compile(r"\b(?:[A-Z]{2,}[A-Za-z0-9]*|[A-Z][a-z]+(?:[A-Z][A-Za-z0-9]*)+)\b")
+        allowed = set(METHOD_TOKEN_WHITELIST)
+        allowed.update(token_re.findall(method_name))
+        scoped_parts: list[tuple[str, int]] = []
+        for match in re.finditer(r"\\title\{([^}]*)\}", scan_text, re.DOTALL | re.IGNORECASE):
+            scoped_parts.append((match.group(1), scan_text.count("\n", 0, match.start(1)) + 1))
+        begin_doc = re.search(r"\\begin\{document\}", scan_text, re.IGNORECASE)
+        if begin_doc:
+            scoped_parts.append((scan_text[:begin_doc.start()], 1))
+        for match in re.finditer(r"\\caption\{([^}]*)\}", scan_text, re.DOTALL | re.IGNORECASE):
+            scoped_parts.append((match.group(1), scan_text.count("\n", 0, match.start(1)) + 1))
+        # A title inside the preamble is covered by two scoped parts; report each
+        # (token, line) pair once.
+        reported: set[tuple[str, int]] = set()
+        for snippet, base_line in scoped_parts:
+            snippet_lines = snippet.split("\n")
+            for token_match in token_re.finditer(snippet):
+                token = token_match.group(0)
+                if token in allowed:
+                    continue
+                offset = snippet.count("\n", 0, token_match.start())
+                line_no = base_line + offset
+                if (token, line_no) in reported:
+                    continue
+                reported.add((token, line_no))
+                # Show the line that holds the token, not the first line of the scoped part.
+                hits.append(_line_hit("cross_run_identity", token, line_no, snippet_lines[offset]))
+
+    sentence_locations: dict[str, list[tuple[int, str]]] = {}
+    for line_no, line in enumerate(scan_lines, start=1):
+        for sentence in re.split(r"(?<=[.!?])\s+", line.strip()):
+            normalized = re.sub(r"[^a-z0-9\s]", "", sentence.lower())
+            normalized = re.sub(r"\s+", " ", normalized).strip()
+            # Short sentences are skipped so brief phrases do not count as boilerplate.
+            if len(normalized.split()) >= 8:
+                sentence_locations.setdefault(normalized, []).append((line_no, sentence))
+    for normalized, locations in sentence_locations.items():
+        if len(locations) >= BOILERPLATE_REPEAT_THRESHOLD:
+            # Report the first occurrence only.
+            line_no, sentence = locations[0]
+            hits.append(_line_hit("boilerplate_repetition", normalized, line_no, sentence))
+    return hits
+
+
+def latex_sanity_check(text: str, state: dict[str, Any] | None = None, ledger: dict[str, Any] | None = None) -> dict[str, Any]:
lower = (text or "").lower()
hits = [{"kind": "term", "value": term} for term in FORBIDDEN_LATEX_TERMS if term in lower]
hits.extend({"kind": "symbol", "value": symbol} for symbol in FORBIDDEN_LATEX_SYMBOLS if symbol in (text or ""))
+ hits.extend(_deterministic_latex_hits(text or "", state=state))
return {
"schema_version": "latex_sanity_v1",
"ok": not hits,
diff --git a/tests/fixtures/m2a_clean.tex b/tests/fixtures/m2a_clean.tex
new file mode 100644
index 0000000..68b02e7
--- /dev/null
+++ b/tests/fixtures/m2a_clean.tex
@@ -0,0 +1,7 @@
+\documentclass{article}
+\begin{document}
+Does the method help?
+\begin{verbatim}
+Table ??
+\end{verbatim}
+\end{document}
diff --git a/tests/fixtures/m2a_unresolved.tex b/tests/fixtures/m2a_unresolved.tex
new file mode 100644
index 0000000..c18c553
--- /dev/null
+++ b/tests/fixtures/m2a_unresolved.tex
@@ -0,0 +1,4 @@
+\documentclass{article}
+\begin{document}
+Table ?? reports the unresolved result.
+\end{document}
diff --git a/tests/fixtures/m2b_clean.tex b/tests/fixtures/m2b_clean.tex
new file mode 100644
index 0000000..01c4970
--- /dev/null
+++ b/tests/fixtures/m2b_clean.tex
@@ -0,0 +1,6 @@
+\documentclass{article}
+\begin{document}
+\[
+\frac{{a}}{{b}}
+\]
+\end{document}
diff --git a/tests/fixtures/m2b_scaffold.tex b/tests/fixtures/m2b_scaffold.tex
new file mode 100644
index 0000000..673f786
--- /dev/null
+++ b/tests/fixtures/m2b_scaffold.tex
@@ -0,0 +1,6 @@
+\documentclass{article}
+\begin{document}
+\begin{figure}
+\caption{Generated plot command {{plot_cmd}}.}
+\end{figure}
+\end{document}
diff --git a/tests/fixtures/m2c_clean.tex b/tests/fixtures/m2c_clean.tex
new file mode 100644
index 0000000..ae1dcb8
--- /dev/null
+++ b/tests/fixtures/m2c_clean.tex
@@ -0,0 +1,7 @@
+\documentclass{article}
+\title{CGGR Results on GSM8K}
+\begin{document}
+\begin{figure}
+\caption{CGGR trajectory on GSM8K with CPU evaluation.}
+\end{figure}
+\end{document}
diff --git a/tests/fixtures/m2c_contaminated.tex b/tests/fixtures/m2c_contaminated.tex
new file mode 100644
index 0000000..da9e89c
--- /dev/null
+++ b/tests/fixtures/m2c_contaminated.tex
@@ -0,0 +1,7 @@
+\documentclass{article}
+\title{OtherMethod Results}
+\begin{document}
+\begin{figure}
+\caption{OtherMethod trajectory.}
+\end{figure}
+\end{document}
diff --git a/tests/fixtures/m2d_clean.tex b/tests/fixtures/m2d_clean.tex
new file mode 100644
index 0000000..10a0ea4
--- /dev/null
+++ b/tests/fixtures/m2d_clean.tex
@@ -0,0 +1,6 @@
+\documentclass{article}
+\begin{document}
+This section introduces the benchmark setup.
+The result section then describes the measured effects.
+Finally, the discussion names limitations without repeating a template.
+\end{document}
diff --git a/tests/fixtures/m2d_repeat.tex b/tests/fixtures/m2d_repeat.tex
new file mode 100644
index 0000000..1729d07
--- /dev/null
+++ b/tests/fixtures/m2d_repeat.tex
@@ -0,0 +1,7 @@
+\documentclass{article}
+\begin{document}
+This repeated boilerplate sentence should not appear in every generated paragraph.
+This repeated boilerplate sentence should not appear in every generated paragraph.
+This repeated boilerplate sentence should not appear in every generated paragraph.
+This repeated boilerplate sentence should not appear in every generated paragraph.
+\end{document}
diff --git a/tests/test_latex_sanity_m2.py b/tests/test_latex_sanity_m2.py
new file mode 100644
index 0000000..1a00845
--- /dev/null
+++ b/tests/test_latex_sanity_m2.py
@@ -0,0 +1,71 @@
+from pathlib import Path
+
+from agents.paper_completeness import latex_sanity_check
+
+
+# Directory (next to this test module) that the _tex helper below reads fixture files from.
+FIXTURES = Path(__file__).parent / "fixtures"
+
+
+def _tex(name: str) -> str:
+    """Return the UTF-8 decoded contents of the fixture file called ``name``."""
+    fixture_path = FIXTURES / name
+    return fixture_path.read_text(encoding="utf-8")
+
+
+def _rules(report: dict) -> set[str]:
+    """Collect one label per hit: its ``rule`` if truthy, otherwise its ``kind``."""
+    labels = set()
+    for hit in report.get("hits", []):
+        labels.add(hit.get("rule") or hit.get("kind"))
+    return labels
+
+
+def test_m2a_unresolved_reference_fails_with_line_location():
+    """A ``Table ??`` in the body must fail the check and at least one hit must carry a line number."""
+    tex = _tex("m2a_unresolved.tex")
+
+    report = latex_sanity_check(tex)
+
+    assert report["ok"] is False
+    assert "unresolved_reference" in _rules(report)
+    assert any(hit.get("location", {}).get("line") for hit in report["hits"])
+
+
+def test_m2a_single_question_and_verbatim_question_marks_pass():
+    """A single ``?`` in prose and a ``??`` inside ``verbatim`` must both be accepted."""
+    tex = _tex("m2a_clean.tex")
+
+    assert latex_sanity_check(tex)["ok"] is True
+
+
+def test_m2b_scaffold_placeholder_regex_fails_but_latex_braces_pass():
+    """``{{plot_cmd}}`` in a caption must be flagged while ``\\frac{{a}}{{b}}`` must not."""
+    scaffold_report = latex_sanity_check(_tex("m2b_scaffold.tex"))
+    braces_report = latex_sanity_check(_tex("m2b_clean.tex"))
+
+    assert scaffold_report["ok"] is False
+    assert "template_placeholder" in _rules(scaffold_report)
+    assert braces_report["ok"] is True
+
+
+def test_m2b_existing_placeholder_forbidden_term_still_blocks():
+    """Text containing the word ``placeholder`` must fail with a hit whose value is that word."""
+    report = latex_sanity_check("This caption contains placeholder text.")
+
+    assert report["ok"] is False
+    hits = report["hits"]
+    assert any(hit.get("value") == "placeholder" for hit in hits)
+
+
+def test_m2c_cross_run_method_token_fails_by_set_membership():
+    """With ``method_name`` set to CGGR, the foreign token ``OtherMethod`` must be reported."""
+    tex = _tex("m2c_contaminated.tex")
+    run_state = {"method_name": "CGGR"}
+
+    report = latex_sanity_check(tex, state=run_state)
+
+    assert report["ok"] is False
+    assert "cross_run_identity" in _rules(report)
+    assert any(hit.get("value") == "OtherMethod" for hit in report["hits"])
+
+
+def test_m2c_method_token_and_abbreviation_whitelist_pass():
+    """With ``method_name`` set to CGGR, the clean title/caption fixture must pass."""
+    tex = _tex("m2c_clean.tex")
+    run_state = {"method_name": "CGGR"}
+
+    assert latex_sanity_check(tex, state=run_state)["ok"] is True
+
+
+def test_m2d_repeated_boilerplate_sentence_fails():
+    """A fixture repeating the same sentence on four lines must raise ``boilerplate_repetition``."""
+    tex = _tex("m2d_repeat.tex")
+
+    report = latex_sanity_check(tex)
+
+    assert report["ok"] is False
+    assert "boilerplate_repetition" in _rules(report)
+
+
+def test_m2d_clean_sentences_pass():
+    """A fixture made of three distinct sentences must pass the check."""
+    tex = _tex("m2d_clean.tex")
+
+    assert latex_sanity_check(tex)["ok"] is True
From 2c769f96046194f856c6372f4eb1e3f9e4f45835 Mon Sep 17 00:00:00 2001
From: Protocol Zero <257158451+Protocol-zero-0@users.noreply.github.com>
Date: Thu, 4 Jun 2026 20:56:46 +0000
Subject: [PATCH 5/7] Wire deterministic sanity checks into render gate (#28)
---
agents/paper_orchestra_pipeline.py | 2 +-
tests/test_vnext_manuscript.py | 124 +++++++++++++++++++++++++++++
2 files changed, 125 insertions(+), 1 deletion(-)
diff --git a/agents/paper_orchestra_pipeline.py b/agents/paper_orchestra_pipeline.py
index 98f352b..11497bd 100644
--- a/agents/paper_orchestra_pipeline.py
+++ b/agents/paper_orchestra_pipeline.py
@@ -1840,7 +1840,7 @@ def _resolve_template_id(fmt: str) -> str:
"backend": "paper_orchestra",
}
page_budget_warning = None
- latex_sanity_report = latex_sanity_check(main_tex)
+ latex_sanity_report = latex_sanity_check(main_tex, state=state)
_write(bundle_dir / "main.tex", main_tex)
_write(
bundle_dir / "latex_sanity_report.json",
diff --git a/tests/test_vnext_manuscript.py b/tests/test_vnext_manuscript.py
index 53b9eb8..2636c3a 100644
--- a/tests/test_vnext_manuscript.py
+++ b/tests/test_vnext_manuscript.py
@@ -323,6 +323,43 @@ def _stub_orchestra(
},
}
+    def _generate_bundle_offline(self, run_full, side_effect):
+        """Generate the conference bundle for idea 1 with venue and page-budget steps stubbed out.
+
+        ``run_full`` is the mocked pipeline entry point; ``side_effect`` is
+        installed on it before the bundle is generated. The venue gate and the
+        exact page budget return the TeX unchanged with a passing report, and
+        the page-budget blocker list is forced to be empty.
+        """
+
+        def _venue_gate_ok(main_tex, **_):
+            return main_tex, {"pass": True, "final": {"pass": True, "template_id": "iclr2026"}}
+
+        def _page_budget_ok(main_tex, *_args, **_kwargs):
+            return main_tex, {"pass": True, "main_body_pages": 9, "total_pdf_pages": 10}
+
+        run_full.side_effect = side_effect
+        venue_patch = mock.patch(
+            "agents.manuscript_submission_enrichment.apply_venue_gates_with_retry",
+            side_effect=_venue_gate_ok,
+        )
+        budget_patch = mock.patch(
+            "agents.manuscript_page_budget.apply_exact_page_budget",
+            side_effect=_page_budget_ok,
+        )
+        blockers_patch = mock.patch(
+            "agents.manuscript_page_budget.page_budget_blockers",
+            return_value=[],
+        )
+        with venue_patch, budget_patch, blockers_patch:
+            return generate_submission_bundle(1, bundle_formats=["conference"])
+
+    def _orchestra_with_full_tex(self, full_tex):
+        """Return a pipeline stub that delegates to ``_stub_orchestra`` and then overrides the full TeX.
+
+        The returned callable takes the same arguments as ``_stub_orchestra``
+        and sets ``refinement_full_text`` in its result to ``full_tex``.
+        """
+
+        def _stub(state, literature_block, paper_ids, iterations, *, figures_dir, baseline, metric_name, template_id=None):
+            keyword_args = {
+                "figures_dir": figures_dir,
+                "baseline": baseline,
+                "metric_name": metric_name,
+                "template_id": template_id,
+            }
+            result = self._stub_orchestra(state, literature_block, paper_ids, iterations, **keyword_args)
+            result["refinement_full_text"] = full_tex
+            return result
+
+        return _stub
+
@mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline")
def test_generate_submission_bundle_creates_verified_bundle_files_and_db_rows(self, run_full):
run_full.side_effect = self._stub_orchestra
@@ -370,6 +407,93 @@ def test_generate_submission_bundle_creates_verified_bundle_files_and_db_rows(se
self.assertIn("fig_metric_trajectory.svg", main_tex)
self.assertTrue((self.workspace_root / "idea_1" / "paper" / "current" / "main.tex").exists())
+    @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline")
+    def test_generate_submission_bundle_blocks_unresolved_reference_in_rendered_tex(self, run_full):
+        """A rendered ``Table ??`` must produce an error result whose blockers mention ``??``."""
+        tex_parts = [
+            "\\documentclass{article}\\begin{document}",
+            "\\begin{abstract}Clean abstract.\\end{abstract}",
+            "\\section{Results}Table ?? reports the result.",
+            "\\section{Discussion}Clean discussion.",
+            "\\end{document}",
+        ]
+        full_tex = "".join(tex_parts)
+
+        result = self._generate_bundle_offline(run_full, self._orchestra_with_full_tex(full_tex))
+
+        self.assertIn("error", result)
+        blocker_list = result.get("submission_blockers") or []
+        blockers = " ".join(blocker_list).lower()
+        self.assertIn("??", blockers)
+
+    @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline")
+    def test_generate_submission_bundle_blocks_repeated_boilerplate_in_rendered_tex(self, run_full):
+        """Four copies of one sentence must produce an error result whose blockers mention boilerplate."""
+        sentence = "This repeated boilerplate sentence should not appear in every generated paragraph."
+        tex_parts = [
+            "\\documentclass{article}\\begin{document}",
+            "\\begin{abstract}Clean abstract.\\end{abstract}",
+            "\\section{Results}",
+            "\n".join([sentence] * 4),
+            "\n",
+            "\\section{Discussion}Clean discussion.",
+            "\\end{document}",
+        ]
+        full_tex = "".join(tex_parts)
+
+        result = self._generate_bundle_offline(run_full, self._orchestra_with_full_tex(full_tex))
+
+        self.assertIn("error", result)
+        blocker_list = result.get("submission_blockers") or []
+        blockers = " ".join(blocker_list).lower()
+        self.assertIn("boilerplate", blockers)
+
+    @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline")
+    def test_generate_submission_bundle_preserves_existing_placeholder_gate(self, run_full):
+        """The word ``placeholder`` in the body must still produce an error result that names it."""
+        tex_parts = [
+            "\\documentclass{article}\\begin{document}",
+            "\\begin{abstract}Clean abstract.\\end{abstract}",
+            "\\section{Results}This placeholder text must not ship.",
+            "\\section{Discussion}Clean discussion.",
+            "\\end{document}",
+        ]
+        full_tex = "".join(tex_parts)
+
+        result = self._generate_bundle_offline(run_full, self._orchestra_with_full_tex(full_tex))
+
+        self.assertIn("error", result)
+        blocker_list = result.get("submission_blockers") or []
+        blockers = " ".join(blocker_list).lower()
+        self.assertIn("placeholder", blockers)
+
+    @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline")
+    def test_generate_submission_bundle_reports_all_latex_sanity_hits(self, run_full):
+        """When ``??`` and ``{{plot_cmd}}`` occur together, the blockers must mention both."""
+        tex_parts = [
+            "\\documentclass{article}\\begin{document}",
+            "\\begin{abstract}Clean abstract.\\end{abstract}",
+            "\\section{Results}Table ?? appears with {{plot_cmd}}.",
+            "\\section{Discussion}Clean discussion.",
+            "\\end{document}",
+        ]
+        full_tex = "".join(tex_parts)
+
+        result = self._generate_bundle_offline(run_full, self._orchestra_with_full_tex(full_tex))
+
+        self.assertIn("error", result)
+        blocker_list = result.get("submission_blockers") or []
+        blockers = " ".join(blocker_list).lower()
+        for expected in ("??", "plot_cmd"):
+            self.assertIn(expected, blockers)
+
+    @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline")
+    def test_generate_submission_bundle_blocks_cross_run_identity_in_rendered_tex(self, run_full):
+        """A caption naming ``OtherMethod`` must produce an error result whose blockers mention it."""
+        tex_parts = [
+            "\\documentclass{article}",
+            "\\title{Auto Manuscript Insight}",
+            "\\begin{document}",
+            "\\maketitle",
+            "\\begin{abstract}Clean abstract.\\end{abstract}",
+            "\\section{Introduction}Intro with \\cite{cite_a}.",
+            "\\begin{figure}[t]",
+            "\\caption{OtherMethod trajectory.}",
+            "\\end{figure}",
+            "\\section{Discussion}Clean discussion.",
+            "\\bibliographystyle{plain}",
+            "\\bibliography{references}",
+            "\\end{document}",
+        ]
+        full_tex = "".join(tex_parts)
+
+        result = self._generate_bundle_offline(run_full, self._orchestra_with_full_tex(full_tex))
+
+        self.assertIn("error", result)
+        blocker_list = result.get("submission_blockers") or []
+        blockers = " ".join(blocker_list).lower()
+        self.assertIn("othermethod", blockers)
+
def test_generate_submission_bundle_blocks_non_formal_run(self):
database.execute(
"UPDATE experiment_runs SET proxy_config=? WHERE id=1",
From 384664254e0960545678f37a1599ff2868de0008 Mon Sep 17 00:00:00 2001
From: Protocol Zero <257158451+Protocol-zero-0@users.noreply.github.com>
Date: Thu, 4 Jun 2026 20:59:55 +0000
Subject: [PATCH 6/7] Add CPU presentation boundary tests (#26)
---
tests/test_presentation_cpu_boundary_m5.py | 126 +++++++++++++++++++++
1 file changed, 126 insertions(+)
create mode 100644 tests/test_presentation_cpu_boundary_m5.py
diff --git a/tests/test_presentation_cpu_boundary_m5.py b/tests/test_presentation_cpu_boundary_m5.py
new file mode 100644
index 0000000..7f9b117
--- /dev/null
+++ b/tests/test_presentation_cpu_boundary_m5.py
@@ -0,0 +1,126 @@
+import json
+import subprocess
+import sys
+
+from agents.benchmark_artifacts import materialize_deep_benchmark_artifacts
+from agents.paper_orchestra_pipeline import assemble_main_tex
+
+
+# Module names that must not appear in sys.modules after the presentation modules are
+# imported or used; the tests below check sys.modules against this set.
+FORBIDDEN_MODULES = {
+ "torch",
+ "transformers",
+ "vllm",
+ "agents.experiment_forge",
+ "agents.experiment_executor",
+}
+
+
+def test_m5_importing_presentation_modules_does_not_load_gpu_or_execution_modules():
+    """Import the presentation modules in a fresh interpreter and check no forbidden module loaded.
+
+    The child process prints every name from ``FORBIDDEN_MODULES`` found in its
+    ``sys.modules`` and exits with status 1 if there is any, so a failure
+    message lists the offending modules.
+    """
+    # Local import keeps this test self-contained within the module's existing imports.
+    from pathlib import Path
+
+    # With ``python -c`` the child resolves ``agents`` relative to its working
+    # directory, so run it from the repository root (the parent of ``tests/``)
+    # instead of whatever directory pytest happened to be started in.
+    repo_root = Path(__file__).resolve().parents[1]
+    script = (
+        "import sys\n"
+        "import agents.paper_orchestra_pipeline\n"
+        "import agents.paper_completeness\n"
+        f"forbidden = {sorted(FORBIDDEN_MODULES)!r}\n"
+        "loaded = [name for name in forbidden if name in sys.modules]\n"
+        "print('\\n'.join(loaded))\n"
+        "raise SystemExit(1 if loaded else 0)\n"
+    )
+
+    result = subprocess.run(
+        [sys.executable, "-c", script],
+        cwd=repo_root,
+        text=True,
+        capture_output=True,
+        check=False,
+    )
+
+    assert result.returncode == 0, result.stdout + result.stderr
+    assert result.stdout.strip() == ""
+
+
+def test_m5_assemble_main_tex_renders_offline_without_llm_or_gpu_modules():
+    """Render a conference ``main.tex`` from in-memory inputs and check no forbidden module is loaded."""
+    problem_awareness = {
+        "central_question": "Can rendering run without benchmark execution?",
+        "motivation": "Presentation gates must stay CPU-only.",
+        "method_answer": "Use already materialized evidence.",
+        "result_claim": "The renderer emits LaTeX from snapshots.",
+    }
+    benchmark_summary = {
+        "primary_metric": "exact_match",
+        "per_method": {
+            "Candidate": {"exact_match": 0.61, "n": 20},
+            "Baseline": {"exact_match": 0.50, "n": 20},
+        },
+    }
+    state = {
+        "title": "CPU Boundary Paper",
+        "baseline_metric_name": "exact_match",
+        "baseline_metric_value": 0.5,
+        "best_metric_value": 0.61,
+        "effect_pct": 22.0,
+        "verdict": "confirmed",
+        "problem_statement": "Can a deterministic renderer run offline?",
+        "method_summary": "A presentation-only method summary.",
+        "problem_awareness": problem_awareness,
+        "contributions": ["A CPU-only rendering path."],
+        "benchmark_summary": benchmark_summary,
+    }
+    section_names = ("abstract", "introduction", "method", "experiments", "discussion")
+    orchestrated = {
+        "refined": {section: f"Offline {section}." for section in section_names},
+        "plotting": {"plotting_executor": {"assets": []}},
+    }
+
+    main_tex = assemble_main_tex(state, orchestrated, "conference")
+
+    for fragment in ("\\documentclass", "CPU Boundary Paper", "\\section{Experiments}"):
+        assert fragment in main_tex
+    assert FORBIDDEN_MODULES.isdisjoint(sys.modules)
+
+
+def test_m5_materialize_raw_predictions_to_cpu_artifacts(tmp_path):
+    """Write synthetic raw predictions, materialize the benchmark artifacts and check the summary.
+
+    The input has 3 methods x 2 datasets x 3 seeds x 7 rows, each method with a
+    fixed ``exact_match`` score.
+    """
+    results_dir = tmp_path / "results"
+    results_dir.mkdir()
+    score_by_method = {"CPG": 0.72, "Baseline": 0.61, "CPG/no_guard": 0.66}
+    datasets = ["GSM8K", "StrategyQA"]
+    rows = [
+        {"method": method, "dataset": dataset, "seed": seed, "exact_match": score}
+        for method, score in score_by_method.items()
+        for dataset in datasets
+        for seed in range(3)
+        for _ in range(7)
+    ]
+    with (results_dir / "raw_predictions.jsonl").open("w", encoding="utf-8") as handle:
+        handle.writelines(json.dumps(row) + "\n" for row in rows)
+
+    report = materialize_deep_benchmark_artifacts(
+        results_dir,
+        publication_contract={"required_ablations": ["CPG/no_guard"]},
+        metric_name="exact_match",
+        min_lines=100,
+    )
+
+    assert report["ok"] is True
+    expected_files = (
+        "benchmark_summary.json",
+        "main_results_table.json",
+        "seed_variance_table.json",
+        "per_dataset_results.json",
+        "ablation_table.json",
+    )
+    for file_name in expected_files:
+        assert (results_dir / file_name).exists()
+    summary_text = (results_dir / "benchmark_summary.json").read_text(encoding="utf-8")
+    summary = json.loads(summary_text)
+    cpg_result = summary["per_method"]["CPG"]
+    cpg_variance = summary["seed_variance"]["CPG"]
+    assert summary["primary_metric"] == "exact_match"
+    assert cpg_result["exact_match"] == 0.72
+    assert cpg_result["n"] == 42
+    assert cpg_variance["n_seeds"] == 3
+    assert cpg_variance["per_seed"] == {"0": 0.72, "1": 0.72, "2": 0.72}
+    assert summary["ablations"]["CPG/no_guard"]["executed"] is True
From df7881592f5b55f294ece29bb653067c13e688a8 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 6 Jun 2026 13:19:21 +0000
Subject: [PATCH 7/7] Add missing CLA signer for Protocol-zero-0
---
cla-signers.json | 3 +++
1 file changed, 3 insertions(+)
diff --git a/cla-signers.json b/cla-signers.json
index d24e8e5..054cb75 100644
--- a/cla-signers.json
+++ b/cla-signers.json
@@ -6,6 +6,9 @@
},
{
"github": "hitome0123"
+ },
+ {
+ "github": "Protocol-zero-0"
}
]
}