def _significance_alpha() -> float:
    """Return the significance threshold used by the evidence gates.

    The value is read from the ``DEEPGRAPH_SIGNIFICANCE_ALPHA`` environment
    variable and parsed with ``_numeric``.

    Returns:
        The parsed threshold when it lies strictly between 0 and 1,
        otherwise the default of 0.05 (variable unset, unparseable,
        non-positive, NaN, or >= 1).
    """
    default_alpha = 0.05
    alpha = _numeric(os.environ.get("DEEPGRAPH_SIGNIFICANCE_ALPHA"))
    # The chained comparison is False for NaN, so NaN falls back as well.
    # Rejecting alpha >= 1 matters: a value such as "5" (typed for 5%) would
    # otherwise make every p-value compare as significant.
    if alpha is None or not (0.0 < alpha < 1.0):
        return default_alpha
    return alpha
def validate_evidence_ledger(ledger: dict) -> list[dict]:
    """Report every required ledger key that is absent.

    The input is first normalised with ``_as_dict``. Only key presence is
    checked; the stored values are not inspected.

    Returns:
        One ``schema_error`` violation per name in
        ``EVIDENCE_LEDGER_REQUIRED_FIELDS`` that is not a key of the ledger,
        in the order the names are declared. Empty when nothing is missing.
    """
    present = _as_dict(ledger)
    return [
        {
            "rule": "schema_error",
            "location": {"section": "ledger", "line": 0},
            "snippet": f"missing required field: {name}",
            "value": name,
        }
        for name in EVIDENCE_LEDGER_REQUIRED_FIELDS
        if name not in present
    ]
"conclusion" + sections.append((section_name, (main_tex or "")[match.end():end], match.end())) + return sections + + +def _ledger_numeric_sources(ledger: dict[str, Any]) -> list[float]: + sources: list[float] = [] + + def add(value: Any) -> None: + numeric = _numeric(value) + if numeric is not None: + sources.append(numeric) + + add(ledger.get("p_value")) + add(ledger.get("effect_size")) + for value in _as_list(ledger.get("repro_ci")) + _as_list(ledger.get("kept_ci")): + add(value) + metric = _text(ledger.get("metric")) + for payload in _as_dict(ledger.get("per_method")).values(): + row = _as_dict(payload) + if metric: + add(row.get(metric)) + for payload in _as_dict(ledger.get("seed_variance")).values(): + add(_as_dict(payload).get("mean")) + return sources + + +def _traceable_number(token: str, sources: list[float]) -> bool: + is_percent = token.endswith("%") + numeric = _numeric(token[:-1] if is_percent else token) + if numeric is None: + return True + if is_percent: + numeric /= 100.0 + return any(abs(numeric - source) <= 1e-6 for source in sources) + + +def _skip_traceability_number(line: str, start: int, end: int) -> bool: + prefix = line[:start] + suffix = line[end:] + if re.search(r"(?:Table|Figure|Section)\s*$", prefix, re.IGNORECASE): + return True + if re.match(r"\s*seeds?\b", suffix, re.IGNORECASE): + return True + return False + + +def assert_traceable(main_tex: str, ledger: dict) -> list[dict]: + """Return evidence traceability violations for Abstract / Conclusion only.""" + ledger = _as_dict(ledger) + violations = validate_evidence_ledger(ledger) + sources = _ledger_numeric_sources(ledger) + negative_verdict = _lower(ledger.get("verdict")) in {"refuted", "inconclusive"} + positive_re = re.compile(r"\b(" + "|".join(re.escape(term) for term in TRACEABILITY_POSITIVE_TERMS) + r")\b", re.IGNORECASE) + text = main_tex or "" + + for section, body, offset in _traceable_sections(text): + for rel_line, line in enumerate(body.splitlines(), start=1): + 
absolute_line = text.count("\n", 0, offset) + rel_line + if negative_verdict and positive_re.search(line): + violations.append( + { + "rule": "positive_claim_with_negative_verdict", + "location": {"section": section, "line": absolute_line}, + "snippet": line.strip(), + "value": ledger.get("verdict"), + } + ) + for match in TRACEABILITY_NUMBER_RE.finditer(line): + if _skip_traceability_number(line, match.start(), match.end()): + continue + token = match.group(0) + if not _traceable_number(token, sources): + violations.append( + { + "rule": "unsourced_number", + "location": {"section": section, "line": absolute_line}, + "snippet": line.strip(), + "value": token[:-1] if token.endswith("%") else token, + } + ) + return violations + + def _infer_model_size(name: str) -> str: match = re.search(r"(\d+(?:\.\d+)?\s*[bB])", name or "") return match.group(1).replace(" ", "") if match else "" @@ -586,9 +806,9 @@ def build_claim_evidence_matrix(state: dict[str, Any], manifest: dict[str, Any]) packet, summary, _artifact_manifest, _contract = _packet_parts(state) per_method = _as_dict(summary.get("per_method")) seeds = _as_list(manifest.get("seeds")) - p_text = _lower(manifest.get("statistical_tests")) has_multi_seed = len(seeds) >= 3 - has_significance = "p=" in p_text or "bootstrap" in p_text or "permutation" in p_text + ledger = build_evidence_ledger(packet, summary, alpha=_significance_alpha()) + has_significance = _ledger_supports_significance(ledger) rows = [ { "claim": "Improves utility", @@ -634,8 +854,8 @@ def build_claim_evidence_matrix(state: dict[str, Any], manifest: dict[str, Any]) "claim": _text(packet.get("claim_text"))[:280], "required_evidence": "mapped quantitative artifact in result_packet or benchmark_summary", "current_evidence": "present" if len(per_method) >= 2 else "missing benchmark comparison", - "can_appear_in_abstract": bool(len(per_method) >= 2 and has_multi_seed), - "allowed_sections": ["Abstract", "Introduction", "Conclusion"] if len(per_method) >= 
2 and has_multi_seed else ["Motivation", "Limitations"], + "can_appear_in_abstract": bool(len(per_method) >= 2 and has_multi_seed and has_significance), + "allowed_sections": ["Abstract", "Introduction", "Conclusion"] if len(per_method) >= 2 and has_multi_seed and has_significance else ["Motivation", "Limitations"], }, ) return rows @@ -742,6 +962,7 @@ def build_reviewer_report( blockers: list[str], ) -> dict[str, Any]: packet, summary, _artifact_manifest, _contract = _packet_parts(state) + ledger = build_evidence_ledger(packet, summary, alpha=_significance_alpha()) answers: list[dict[str, Any]] = [] def add(question: str, yes: bool, evidence: str) -> None: @@ -750,7 +971,11 @@ def add(question: str, yes: bool, evidence: str) -> None: add("What is the exact dataset?", bool(_as_list(manifest.get("datasets")) and not any("dataset" in b.lower() for b in blockers[:3])), str(manifest.get("datasets") or "missing")) add("What is the exact model?", bool(_as_list(manifest.get("models")) and not any("model" in b.lower() for b in blockers)), str(manifest.get("models") or "missing")) add("What is the exact baseline?", len(_as_list(manifest.get("baselines"))) >= 2, ", ".join(_as_list(manifest.get("baselines")))) - add("Is the improvement statistically significant?", bool(_text(manifest.get("statistical_tests")) and not any("statistical" in b.lower() for b in blockers)), _text(manifest.get("statistical_tests"))) + add( + "Is the improvement statistically significant?", + bool(_ledger_supports_significance(ledger) and not any("statistical" in b.lower() for b in blockers)), + f"p={ledger.get('p_value')}; alpha={ledger.get('alpha')}; verdict={ledger.get('verdict')}", + ) add("Is there more than one benchmark?", len(_as_list(manifest.get("datasets"))) > 1, str(len(_as_list(manifest.get("datasets"))))) add("Are compute savings actually measured?", bool(manifest.get("latency") or manifest.get("token_cost")), "latency/token payload present" if (manifest.get("latency") or 
manifest.get("token_cost")) else "missing") add("Is the proposed gate better than confidence/disagreement routing?", any("Beats confidence" in row.get("claim", "") and row.get("can_appear_in_abstract") for row in claim_matrix), "claim-evidence matrix") @@ -826,10 +1051,89 @@ def audit_evidence_completeness(state: dict[str, Any]) -> dict[str, Any]: } -def latex_sanity_check(text: str) -> dict[str, Any]: +def _line_hit(rule: str, value: str, line_no: int, line: str, *, kind: str = "deterministic") -> dict[str, Any]: + return { + "kind": kind, + "rule": rule, + "value": value, + "location": {"line": line_no}, + "snippet": line.strip(), + } + + +def _strip_latex_code_blocks(text: str) -> str: + stripped = re.sub( + r"\\begin\{(?:verbatim|lstlisting|minted)\}.*?\\end\{(?:verbatim|lstlisting|minted)\}", + "", + text or "", + flags=re.DOTALL | re.IGNORECASE, + ) + stripped = re.sub(r"```.*?```", "", stripped, flags=re.DOTALL) + return stripped + + +def _deterministic_latex_hits(text: str, state: dict[str, Any] | None = None) -> list[dict[str, Any]]: + state = _as_dict(state) + scan_text = _strip_latex_code_blocks(text or "") + hits: list[dict[str, Any]] = [] + for line_no, line in enumerate(scan_text.splitlines(), start=1): + if re.search(r"\?\?|Table\s+\?\?|Figure\s+\?\?", line): + hits.append(_line_hit("unresolved_reference", "??", line_no, line)) + for match in re.finditer(r"\\(?:ref|cite[a-zA-Z*]*)\{([^}]*)\}", line): + key = match.group(1).strip() + if not key or "?" 
in key or _lower(key) in {"todo", "tbd", "placeholder"}: + hits.append(_line_hit("unresolved_reference", match.group(0), line_no, line)) + for match in re.finditer(r"\{\{[a-z_][a-z0-9_]*\}\}", line): + prefix = line[: match.start()] + if re.search(r"\\[A-Za-z]+\s*$", prefix) or prefix.rstrip().endswith("}"): + continue + hits.append(_line_hit("template_placeholder", match.group(0), line_no, line)) + + method_name = _text(state.get("method_name")) + if method_name: + allowed = set(METHOD_TOKEN_WHITELIST) + allowed.update(re.findall(r"\b(?:[A-Z]{2,}[A-Za-z0-9]*|[A-Z][a-z]+(?:[A-Z][A-Za-z0-9]*)+)\b", method_name)) + scoped_parts: list[tuple[str, int]] = [] + for match in re.finditer(r"\\title\{([^}]*)\}", scan_text, re.DOTALL | re.IGNORECASE): + scoped_parts.append((match.group(1), scan_text.count("\n", 0, match.start(1)) + 1)) + begin_doc = re.search(r"\\begin\{document\}", scan_text, re.IGNORECASE) + if begin_doc: + scoped_parts.append((scan_text[:begin_doc.start()], 1)) + for match in re.finditer(r"\\caption\{([^}]*)\}", scan_text, re.DOTALL | re.IGNORECASE): + scoped_parts.append((match.group(1), scan_text.count("\n", 0, match.start(1)) + 1)) + token_re = re.compile(r"\b(?:[A-Z]{2,}[A-Za-z0-9]*|[A-Z][a-z]+(?:[A-Z][A-Za-z0-9]*)+)\b") + for snippet, base_line in scoped_parts: + for token_match in token_re.finditer(snippet): + token = token_match.group(0) + if token not in allowed: + hits.append( + _line_hit( + "cross_run_identity", + token, + base_line + snippet.count("\n", 0, token_match.start()), + snippet.splitlines()[0] if snippet.splitlines() else snippet, + ) + ) + + sentence_locations: dict[str, list[tuple[int, str]]] = {} + for line_no, line in enumerate(scan_text.splitlines(), start=1): + for sentence in re.split(r"(?<=[.!?])\s+", line.strip()): + normalized = re.sub(r"[^a-z0-9\s]", "", sentence.lower()) + normalized = re.sub(r"\s+", " ", normalized).strip() + if len(normalized.split()) >= 8: + sentence_locations.setdefault(normalized, []).append((line_no, 
sentence)) + for normalized, locations in sentence_locations.items(): + if len(locations) >= BOILERPLATE_REPEAT_THRESHOLD: + line_no, sentence = locations[0] + hits.append(_line_hit("boilerplate_repetition", normalized, line_no, sentence)) + return hits + + +def latex_sanity_check(text: str, state: dict[str, Any] | None = None, ledger: dict[str, Any] | None = None) -> dict[str, Any]: lower = (text or "").lower() hits = [{"kind": "term", "value": term} for term in FORBIDDEN_LATEX_TERMS if term in lower] hits.extend({"kind": "symbol", "value": symbol} for symbol in FORBIDDEN_LATEX_SYMBOLS if symbol in (text or "")) + hits.extend(_deterministic_latex_hits(text or "", state=state)) return { "schema_version": "latex_sanity_v1", "ok": not hits, diff --git a/agents/paper_orchestra_pipeline.py b/agents/paper_orchestra_pipeline.py index 98f352b..11497bd 100644 --- a/agents/paper_orchestra_pipeline.py +++ b/agents/paper_orchestra_pipeline.py @@ -1840,7 +1840,7 @@ def _resolve_template_id(fmt: str) -> str: "backend": "paper_orchestra", } page_budget_warning = None - latex_sanity_report = latex_sanity_check(main_tex) + latex_sanity_report = latex_sanity_check(main_tex, state=state) _write(bundle_dir / "main.tex", main_tex) _write( bundle_dir / "latex_sanity_report.json", diff --git a/cla-signers.json b/cla-signers.json index d24e8e5..054cb75 100644 --- a/cla-signers.json +++ b/cla-signers.json @@ -6,6 +6,9 @@ }, { "github": "hitome0123" + }, + { + "github": "Protocol-zero-0" } ] } diff --git a/tests/fixtures/m1_p_eq_one.json b/tests/fixtures/m1_p_eq_one.json new file mode 100644 index 0000000..099a315 --- /dev/null +++ b/tests/fixtures/m1_p_eq_one.json @@ -0,0 +1,38 @@ +{ + "state": { + "result_packet": { + "run_id": "run-m1-p-one", + "claim_id": "claim-m1-p-one", + "claim_text": "The method improves utility.", + "metric_name": "exact_match", + "verdict": "inconclusive", + "p_value": 1.0, + "effect_size": 0.0, + "benchmark_summary": { + "primary_metric": "exact_match", + 
"per_method": { + "Candidate": {"exact_match": 0.705, "n": 200}, + "Baseline": {"exact_match": 0.655, "n": 200} + }, + "seed_variance": { + "Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}} + }, + "seeds": [0, 1, 2], + "statistical_tests": "p=1.0000" + } + } + }, + "manifest": { + "datasets": [{"name": "GSM8K", "split": "test", "num_test": 200}], + "models": [{"name": "Qwen", "prompt_template": "qa", "decoding": "greedy"}], + "baselines": ["Baseline", "Candidate"], + "metrics": ["exact_match"], + "seeds": [0, 1, 2], + "hardware": "CPU test fixture", + "statistical_tests": "p=1.0000", + "latency": {"per_method": {"Candidate": 1.0, "Baseline": 0.8}}, + "token_cost": {"per_method": {"Candidate": 100, "Baseline": 80}}, + "ablation": [{"name": "no_guard"}], + "artifacts": {"benchmark_summary": "benchmark_summary.json"} + } +} diff --git a/tests/fixtures/m1_refuted.json b/tests/fixtures/m1_refuted.json new file mode 100644 index 0000000..a0743e6 --- /dev/null +++ b/tests/fixtures/m1_refuted.json @@ -0,0 +1,38 @@ +{ + "state": { + "result_packet": { + "run_id": "run-m1-refuted", + "claim_id": "claim-m1-refuted", + "claim_text": "The method improves utility.", + "metric_name": "exact_match", + "verdict": "refuted", + "p_value": 0.0123, + "effect_size": -0.01, + "benchmark_summary": { + "primary_metric": "exact_match", + "per_method": { + "Candidate": {"exact_match": 0.605, "n": 200}, + "Baseline": {"exact_match": 0.655, "n": 200} + }, + "seed_variance": { + "Candidate": {"mean": 0.605, "std": 0.012, "per_seed": {"0": 0.61, "1": 0.60, "2": 0.605}} + }, + "seeds": [0, 1, 2], + "statistical_tests": "paired permutation p=0.0123" + } + } + }, + "manifest": { + "datasets": [{"name": "GSM8K", "split": "test", "num_test": 200}], + "models": [{"name": "Qwen", "prompt_template": "qa", "decoding": "greedy"}], + "baselines": ["Baseline", "Candidate"], + "metrics": ["exact_match"], + "seeds": [0, 1, 2], + "hardware": "CPU test fixture", + 
"statistical_tests": "paired permutation p=0.0123", + "latency": {"per_method": {"Candidate": 1.0, "Baseline": 0.8}}, + "token_cost": {"per_method": {"Candidate": 100, "Baseline": 80}}, + "ablation": [{"name": "no_guard"}], + "artifacts": {"benchmark_summary": "benchmark_summary.json"} + } +} diff --git a/tests/fixtures/m1_significant.json b/tests/fixtures/m1_significant.json new file mode 100644 index 0000000..3da4701 --- /dev/null +++ b/tests/fixtures/m1_significant.json @@ -0,0 +1,40 @@ +{ + "state": { + "result_packet": { + "run_id": "run-m1-significant", + "claim_id": "claim-m1-significant", + "claim_text": "The method improves utility.", + "metric_name": "exact_match", + "verdict": "confirmed", + "p_value": 0.0123, + "effect_size": 0.045, + "benchmark_summary": { + "primary_metric": "exact_match", + "per_method": { + "Candidate": {"exact_match": 0.705, "n": 200}, + "Baseline": {"exact_match": 0.655, "n": 200} + }, + "seed_variance": { + "Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}} + }, + "seeds": [0, 1, 2], + "repro_ci": [0.01, 0.08], + "kept_ci": [0.0, 0.07], + "statistical_tests": "paired permutation p=0.0123" + } + } + }, + "manifest": { + "datasets": [{"name": "GSM8K", "split": "test", "num_test": 200}], + "models": [{"name": "Qwen", "prompt_template": "qa", "decoding": "greedy"}], + "baselines": ["Baseline", "Candidate"], + "metrics": ["exact_match"], + "seeds": [0, 1, 2], + "hardware": "CPU test fixture", + "statistical_tests": "paired permutation p=0.0123", + "latency": {"per_method": {"Candidate": 1.0, "Baseline": 0.8}}, + "token_cost": {"per_method": {"Candidate": 100, "Baseline": 80}}, + "ablation": [{"name": "no_guard"}], + "artifacts": {"benchmark_summary": "benchmark_summary.json"} + } +} diff --git a/tests/fixtures/m2a_clean.tex b/tests/fixtures/m2a_clean.tex new file mode 100644 index 0000000..68b02e7 --- /dev/null +++ b/tests/fixtures/m2a_clean.tex @@ -0,0 +1,7 @@ +\documentclass{article} 
+\begin{document} +Does the method help? +\begin{verbatim} +Table ?? +\end{verbatim} +\end{document} diff --git a/tests/fixtures/m2a_unresolved.tex b/tests/fixtures/m2a_unresolved.tex new file mode 100644 index 0000000..c18c553 --- /dev/null +++ b/tests/fixtures/m2a_unresolved.tex @@ -0,0 +1,4 @@ +\documentclass{article} +\begin{document} +Table ?? reports the unresolved result. +\end{document} diff --git a/tests/fixtures/m2b_clean.tex b/tests/fixtures/m2b_clean.tex new file mode 100644 index 0000000..01c4970 --- /dev/null +++ b/tests/fixtures/m2b_clean.tex @@ -0,0 +1,6 @@ +\documentclass{article} +\begin{document} +\[ +\frac{{a}}{{b}} +\] +\end{document} diff --git a/tests/fixtures/m2b_scaffold.tex b/tests/fixtures/m2b_scaffold.tex new file mode 100644 index 0000000..673f786 --- /dev/null +++ b/tests/fixtures/m2b_scaffold.tex @@ -0,0 +1,6 @@ +\documentclass{article} +\begin{document} +\begin{figure} +\caption{Generated plot command {{plot_cmd}}.} +\end{figure} +\end{document} diff --git a/tests/fixtures/m2c_clean.tex b/tests/fixtures/m2c_clean.tex new file mode 100644 index 0000000..ae1dcb8 --- /dev/null +++ b/tests/fixtures/m2c_clean.tex @@ -0,0 +1,7 @@ +\documentclass{article} +\title{CGGR Results on GSM8K} +\begin{document} +\begin{figure} +\caption{CGGR trajectory on GSM8K with CPU evaluation.} +\end{figure} +\end{document} diff --git a/tests/fixtures/m2c_contaminated.tex b/tests/fixtures/m2c_contaminated.tex new file mode 100644 index 0000000..da9e89c --- /dev/null +++ b/tests/fixtures/m2c_contaminated.tex @@ -0,0 +1,7 @@ +\documentclass{article} +\title{OtherMethod Results} +\begin{document} +\begin{figure} +\caption{OtherMethod trajectory.} +\end{figure} +\end{document} diff --git a/tests/fixtures/m2d_clean.tex b/tests/fixtures/m2d_clean.tex new file mode 100644 index 0000000..10a0ea4 --- /dev/null +++ b/tests/fixtures/m2d_clean.tex @@ -0,0 +1,6 @@ +\documentclass{article} +\begin{document} +This section introduces the benchmark setup. 
+The result section then describes the measured effects. +Finally, the discussion names limitations without repeating a template. +\end{document} diff --git a/tests/fixtures/m2d_repeat.tex b/tests/fixtures/m2d_repeat.tex new file mode 100644 index 0000000..1729d07 --- /dev/null +++ b/tests/fixtures/m2d_repeat.tex @@ -0,0 +1,7 @@ +\documentclass{article} +\begin{document} +This repeated boilerplate sentence should not appear in every generated paragraph. +This repeated boilerplate sentence should not appear in every generated paragraph. +This repeated boilerplate sentence should not appear in every generated paragraph. +This repeated boilerplate sentence should not appear in every generated paragraph. +\end{document} diff --git a/tests/fixtures/m4_clean.tex b/tests/fixtures/m4_clean.tex new file mode 100644 index 0000000..df2b41e --- /dev/null +++ b/tests/fixtures/m4_clean.tex @@ -0,0 +1,8 @@ +\documentclass{article} +\begin{document} +\begin{abstract} +Candidate reaches 70.5% exact match with p=0.0123 and effect 0.045. +\end{abstract} +\section{Conclusion} +The reproduced mean is 70.5% with interval endpoints 0.01 and 0.08. 
+\end{document} diff --git a/tests/fixtures/m4_clean_ledger.json b/tests/fixtures/m4_clean_ledger.json new file mode 100644 index 0000000..875c817 --- /dev/null +++ b/tests/fixtures/m4_clean_ledger.json @@ -0,0 +1,22 @@ +{ + "schema_version": "1.0", + "run_id": "run-m4-clean", + "claim_id": "claim-m4-clean", + "metric": "exact_match", + "alpha": 0.05, + "verdict": "confirmed", + "p_value": 0.0123, + "effect_size": 0.045, + "confidence": 0.9877, + "repro_ci": [0.01, 0.08], + "kept_ci": [0.0, 0.07], + "per_method": { + "Candidate": {"exact_match": 0.705, "n": 200}, + "Baseline": {"exact_match": 0.655, "n": 200} + }, + "seed_variance": { + "Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}} + }, + "seeds": [0, 1, 2], + "provenance": {"code_commit": "abc123", "dataset": "GSM8K", "command": "pytest"} +} diff --git a/tests/fixtures/m4_method_only.tex b/tests/fixtures/m4_method_only.tex new file mode 100644 index 0000000..924f5e2 --- /dev/null +++ b/tests/fixtures/m4_method_only.tex @@ -0,0 +1,10 @@ +\documentclass{article} +\begin{document} +\begin{abstract} +This paper describes the experiment. +\end{abstract} +\section{Method} +The method section mentions a calibration constant 0.777 that is not in the ledger. +\section{Conclusion} +The conclusion contains no numeric claim. 
+\end{document} diff --git a/tests/fixtures/m4_missing_p_ledger.json b/tests/fixtures/m4_missing_p_ledger.json new file mode 100644 index 0000000..269c894 --- /dev/null +++ b/tests/fixtures/m4_missing_p_ledger.json @@ -0,0 +1,21 @@ +{ + "schema_version": "1.0", + "run_id": "run-m4-missing-p", + "claim_id": "claim-m4-missing-p", + "metric": "exact_match", + "alpha": 0.05, + "verdict": "confirmed", + "effect_size": 0.045, + "confidence": 0.99, + "repro_ci": [0.02, 0.08], + "kept_ci": [0.0, 0.07], + "per_method": { + "Candidate": {"exact_match": 0.705, "n": 200}, + "Baseline": {"exact_match": 0.655, "n": 200} + }, + "seed_variance": { + "Candidate": {"mean": 0.705, "std": 0.012, "per_seed": {"0": 0.71, "1": 0.70, "2": 0.705}} + }, + "seeds": [0, 1, 2], + "provenance": {"code_commit": "abc123", "dataset": "GSM8K", "command": "pytest"} +} diff --git a/tests/fixtures/m4_refuted_ledger.json b/tests/fixtures/m4_refuted_ledger.json new file mode 100644 index 0000000..69ccf56 --- /dev/null +++ b/tests/fixtures/m4_refuted_ledger.json @@ -0,0 +1,22 @@ +{ + "schema_version": "1.0", + "run_id": "run-m4-refuted", + "claim_id": "claim-m4-refuted", + "metric": "exact_match", + "alpha": 0.05, + "verdict": "refuted", + "p_value": 0.0123, + "effect_size": -0.01, + "confidence": 0.9877, + "repro_ci": [-0.08, -0.01], + "kept_ci": [-0.07, 0.0], + "per_method": { + "Candidate": {"exact_match": 0.605, "n": 200}, + "Baseline": {"exact_match": 0.655, "n": 200} + }, + "seed_variance": { + "Candidate": {"mean": 0.605, "std": 0.012, "per_seed": {"0": 0.61, "1": 0.60, "2": 0.605}} + }, + "seeds": [0, 1, 2], + "provenance": {"code_commit": "abc123", "dataset": "GSM8K", "command": "pytest"} +} diff --git a/tests/fixtures/m4_refuted_positive_claim.tex b/tests/fixtures/m4_refuted_positive_claim.tex new file mode 100644 index 0000000..9d7bced --- /dev/null +++ b/tests/fixtures/m4_refuted_positive_claim.tex @@ -0,0 +1,8 @@ +\documentclass{article} +\begin{document} +\begin{abstract} +Candidate 
significantly improves exact match over the baseline. +\end{abstract} +\section{Conclusion} +The method outperforms the baseline. +\end{document} diff --git a/tests/fixtures/m4_unsourced_number.tex b/tests/fixtures/m4_unsourced_number.tex new file mode 100644 index 0000000..9683f9a --- /dev/null +++ b/tests/fixtures/m4_unsourced_number.tex @@ -0,0 +1,8 @@ +\documentclass{article} +\begin{document} +\begin{abstract} +Candidate significantly improves exact match with p=0.01. +\end{abstract} +\section{Conclusion} +The conclusion repeats p=0.01. +\end{document} diff --git a/tests/fixtures/m4_whitelist_numbers.tex b/tests/fixtures/m4_whitelist_numbers.tex new file mode 100644 index 0000000..8c00fa8 --- /dev/null +++ b/tests/fixtures/m4_whitelist_numbers.tex @@ -0,0 +1,8 @@ +\documentclass{article} +\begin{document} +\begin{abstract} +Table 2 summarizes the result over 5 seeds. +\end{abstract} +\section{Discussion} +Figure 3 shows the setup. +\end{document} diff --git a/tests/test_latex_sanity_m2.py b/tests/test_latex_sanity_m2.py new file mode 100644 index 0000000..1a00845 --- /dev/null +++ b/tests/test_latex_sanity_m2.py @@ -0,0 +1,71 @@ +from pathlib import Path + +from agents.paper_completeness import latex_sanity_check + + +FIXTURES = Path(__file__).parent / "fixtures" + + +def _tex(name: str) -> str: + return (FIXTURES / name).read_text(encoding="utf-8") + + +def _rules(report: dict) -> set[str]: + return {hit.get("rule") or hit.get("kind") for hit in report.get("hits", [])} + + +def test_m2a_unresolved_reference_fails_with_line_location(): + report = latex_sanity_check(_tex("m2a_unresolved.tex")) + + assert report["ok"] is False + assert "unresolved_reference" in _rules(report) + assert any(hit.get("location", {}).get("line") for hit in report["hits"]) + + +def test_m2a_single_question_and_verbatim_question_marks_pass(): + report = latex_sanity_check(_tex("m2a_clean.tex")) + + assert report["ok"] is True + + +def 
test_m2b_scaffold_placeholder_regex_fails_but_latex_braces_pass(): + bad = latex_sanity_check(_tex("m2b_scaffold.tex")) + clean = latex_sanity_check(_tex("m2b_clean.tex")) + + assert bad["ok"] is False + assert "template_placeholder" in _rules(bad) + assert clean["ok"] is True + + +def test_m2b_existing_placeholder_forbidden_term_still_blocks(): + report = latex_sanity_check("This caption contains placeholder text.") + + assert report["ok"] is False + assert any(hit.get("value") == "placeholder" for hit in report["hits"]) + + +def test_m2c_cross_run_method_token_fails_by_set_membership(): + report = latex_sanity_check(_tex("m2c_contaminated.tex"), state={"method_name": "CGGR"}) + + assert report["ok"] is False + assert "cross_run_identity" in _rules(report) + assert any(hit.get("value") == "OtherMethod" for hit in report["hits"]) + + +def test_m2c_method_token_and_abbreviation_whitelist_pass(): + report = latex_sanity_check(_tex("m2c_clean.tex"), state={"method_name": "CGGR"}) + + assert report["ok"] is True + + +def test_m2d_repeated_boilerplate_sentence_fails(): + report = latex_sanity_check(_tex("m2d_repeat.tex")) + + assert report["ok"] is False + assert "boilerplate_repetition" in _rules(report) + + +def test_m2d_clean_sentences_pass(): + report = latex_sanity_check(_tex("m2d_clean.tex")) + + assert report["ok"] is True diff --git a/tests/test_paper_completeness_m1.py b/tests/test_paper_completeness_m1.py new file mode 100644 index 0000000..170de8f --- /dev/null +++ b/tests/test_paper_completeness_m1.py @@ -0,0 +1,110 @@ +import json +from pathlib import Path + +from agents.paper_completeness import ( + audit_evidence_completeness, + build_claim_evidence_matrix, + build_evidence_ledger, + build_reviewer_report, +) + + +FIXTURES = Path(__file__).parent / "fixtures" + + +def _load_fixture(name: str) -> dict: + return json.loads((FIXTURES / name).read_text(encoding="utf-8")) + + +def _improves_row(matrix: list[dict]) -> dict: + return next(row for row in matrix 
if row["claim"] == "Improves utility") + + +def _claim_row(matrix: list[dict], claim: str) -> dict: + return next(row for row in matrix if row["claim"].startswith(claim)) + + +def _significance_answer(report: dict) -> dict: + return next( + row + for row in report["checklist"] + if row["question"] == "Is the improvement statistically significant?" + ) + + +def test_m1_p_eq_one_is_not_significant_even_with_p_text(): + payload = _load_fixture("m1_p_eq_one.json") + matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"]) + + assert _improves_row(matrix)["can_appear_in_abstract"] is False + assert _claim_row(matrix, "The method improves utility")["can_appear_in_abstract"] is False + + +def test_m1_confirmed_p_below_default_alpha_can_appear(): + payload = _load_fixture("m1_significant.json") + matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"]) + + assert _improves_row(matrix)["can_appear_in_abstract"] is True + + +def test_m1_refuted_verdict_blocks_abstract_claim_even_with_low_p(): + payload = _load_fixture("m1_refuted.json") + matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"]) + + assert _improves_row(matrix)["can_appear_in_abstract"] is False + assert _claim_row(matrix, "The method improves utility")["can_appear_in_abstract"] is False + + +def test_m1_alpha_env_override_controls_significance(monkeypatch): + payload = _load_fixture("m1_significant.json") + payload["state"]["result_packet"]["p_value"] = 0.049 + payload["manifest"]["statistical_tests"] = "p=0.049" + monkeypatch.setenv("DEEPGRAPH_SIGNIFICANCE_ALPHA", "0.01") + + matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"]) + + assert _improves_row(matrix)["can_appear_in_abstract"] is False + + +def test_m1_missing_p_value_is_not_significant_and_does_not_crash(): + payload = _load_fixture("m1_significant.json") + payload["state"]["result_packet"]["p_value"] = None + payload["manifest"]["statistical_tests"] = "" + + matrix = 
build_claim_evidence_matrix(payload["state"], payload["manifest"]) + + assert _improves_row(matrix)["can_appear_in_abstract"] is False + + +def test_m1_reviewer_significance_answer_uses_numeric_p_but_preserves_existence_gate(): + payload = _load_fixture("m1_p_eq_one.json") + matrix = build_claim_evidence_matrix(payload["state"], payload["manifest"]) + report = build_reviewer_report(payload["state"], payload["manifest"], matrix, blockers=[]) + + assert _significance_answer(report)["answer"] == "No" + audit = audit_evidence_completeness(payload["state"]) + assert not any("Statistical test or confidence interval" in blocker for blocker in audit["blockers"]) + + +def test_m1_build_evidence_ledger_minimal_schema(): + payload = _load_fixture("m1_significant.json") + packet = payload["state"]["result_packet"] + summary = packet["benchmark_summary"] + + ledger = build_evidence_ledger( + packet, + summary, + alpha=0.01, + provenance={"command": "pytest tests/test_paper_completeness_m1.py"}, + ) + + assert ledger["schema_version"] == "1.0" + assert ledger["alpha"] == 0.01 + assert ledger["verdict"] == "confirmed" + assert ledger["p_value"] == 0.0123 + assert ledger["effect_size"] == 0.045 + assert ledger["confidence"] == 0.9877 + assert ledger["per_method"]["Candidate"]["exact_match"] == 0.705 + assert ledger["seed_variance"]["Candidate"]["per_seed"] == {"0": 0.71, "1": 0.7, "2": 0.705} + assert ledger["seeds"] == [0, 1, 2] + assert ledger["provenance"]["command"].startswith("pytest") diff --git a/tests/test_paper_completeness_m4.py b/tests/test_paper_completeness_m4.py new file mode 100644 index 0000000..5722f84 --- /dev/null +++ b/tests/test_paper_completeness_m4.py @@ -0,0 +1,58 @@ +import json +from pathlib import Path + +from agents.paper_completeness import assert_traceable, validate_evidence_ledger + + +FIXTURES = Path(__file__).parent / "fixtures" + + +def _tex(name: str) -> str: + return (FIXTURES / name).read_text(encoding="utf-8") + + +def _ledger(name: str) -> 
dict: + return json.loads((FIXTURES / name).read_text(encoding="utf-8")) + + +def test_m4_unsourced_abstract_number_reports_violation_when_ledger_lacks_value(): + violations = assert_traceable(_tex("m4_unsourced_number.tex"), _ledger("m4_missing_p_ledger.json")) + + assert any(v["rule"] == "unsourced_number" and v["value"] == "0.01" for v in violations) + + +def test_m4_refuted_verdict_blocks_positive_abstract_and_conclusion_claims(): + violations = assert_traceable( + _tex("m4_refuted_positive_claim.tex"), + _ledger("m4_refuted_ledger.json"), + ) + + assert any(v["rule"] == "positive_claim_with_negative_verdict" for v in violations) + assert {v["location"]["section"] for v in violations} >= {"abstract", "conclusion"} + + +def test_m4_clean_abstract_and_conclusion_numbers_are_traceable_with_percent_normalization(): + violations = assert_traceable(_tex("m4_clean.tex"), _ledger("m4_clean_ledger.json")) + + assert violations == [] + + +def test_m4_numbers_outside_abstract_and_conclusion_are_allowed(): + violations = assert_traceable(_tex("m4_method_only.tex"), _ledger("m4_clean_ledger.json")) + + assert violations == [] + + +def test_m4_schema_validation_reports_missing_required_field(): + ledger = _ledger("m4_clean_ledger.json") + ledger.pop("verdict") + + violations = validate_evidence_ledger(ledger) + + assert any(v["rule"] == "schema_error" and v["value"] == "verdict" for v in violations) + + +def test_m4_table_figure_and_seed_counts_are_not_unsourced_numbers(): + violations = assert_traceable(_tex("m4_whitelist_numbers.tex"), _ledger("m4_clean_ledger.json")) + + assert violations == [] diff --git a/tests/test_presentation_cpu_boundary_m5.py b/tests/test_presentation_cpu_boundary_m5.py new file mode 100644 index 0000000..7f9b117 --- /dev/null +++ b/tests/test_presentation_cpu_boundary_m5.py @@ -0,0 +1,126 @@ +import json +import subprocess +import sys + +from agents.benchmark_artifacts import materialize_deep_benchmark_artifacts +from 
agents.paper_orchestra_pipeline import assemble_main_tex + + +FORBIDDEN_MODULES = { + "torch", + "transformers", + "vllm", + "agents.experiment_forge", + "agents.experiment_executor", +} + + +def test_m5_importing_presentation_modules_does_not_load_gpu_or_execution_modules(): + script = ( + "import sys\n" + "import agents.paper_orchestra_pipeline\n" + "import agents.paper_completeness\n" + f"forbidden = {sorted(FORBIDDEN_MODULES)!r}\n" + "loaded = [name for name in forbidden if name in sys.modules]\n" + "print('\\n'.join(loaded))\n" + "raise SystemExit(1 if loaded else 0)\n" + ) + + result = subprocess.run( + [sys.executable, "-c", script], + cwd=".", + text=True, + capture_output=True, + check=False, + ) + + assert result.returncode == 0, result.stdout + result.stderr + assert result.stdout.strip() == "" + + +def test_m5_assemble_main_tex_renders_offline_without_llm_or_gpu_modules(): + state = { + "title": "CPU Boundary Paper", + "baseline_metric_name": "exact_match", + "baseline_metric_value": 0.5, + "best_metric_value": 0.61, + "effect_pct": 22.0, + "verdict": "confirmed", + "problem_statement": "Can a deterministic renderer run offline?", + "method_summary": "A presentation-only method summary.", + "problem_awareness": { + "central_question": "Can rendering run without benchmark execution?", + "motivation": "Presentation gates must stay CPU-only.", + "method_answer": "Use already materialized evidence.", + "result_claim": "The renderer emits LaTeX from snapshots.", + }, + "contributions": ["A CPU-only rendering path."], + "benchmark_summary": { + "primary_metric": "exact_match", + "per_method": { + "Candidate": {"exact_match": 0.61, "n": 20}, + "Baseline": {"exact_match": 0.50, "n": 20}, + }, + }, + } + orchestrated = { + "refined": { + "abstract": "Offline abstract.", + "introduction": "Offline introduction.", + "method": "Offline method.", + "experiments": "Offline experiments.", + "discussion": "Offline discussion.", + }, + "plotting": {"plotting_executor": 
{"assets": []}}, + } + + main_tex = assemble_main_tex(state, orchestrated, "conference") + + assert "\\documentclass" in main_tex + assert "CPU Boundary Paper" in main_tex + assert "\\section{Experiments}" in main_tex + assert not (FORBIDDEN_MODULES & set(sys.modules)) + + +def test_m5_materialize_raw_predictions_to_cpu_artifacts(tmp_path): + results_dir = tmp_path / "results" + results_dir.mkdir() + methods = ["CPG", "Baseline", "CPG/no_guard"] + datasets = ["GSM8K", "StrategyQA"] + with (results_dir / "raw_predictions.jsonl").open("w", encoding="utf-8") as handle: + for method in methods: + for dataset in datasets: + for seed in range(3): + for _ in range(7): + score = 0.72 if method == "CPG" else 0.61 if method == "Baseline" else 0.66 + row = { + "method": method, + "dataset": dataset, + "seed": seed, + "exact_match": score, + } + handle.write(json.dumps(row) + "\n") + + report = materialize_deep_benchmark_artifacts( + results_dir, + publication_contract={"required_ablations": ["CPG/no_guard"]}, + metric_name="exact_match", + min_lines=100, + ) + + assert report["ok"] is True + for name in ( + "benchmark_summary.json", + "main_results_table.json", + "seed_variance_table.json", + "per_dataset_results.json", + "ablation_table.json", + ): + assert (results_dir / name).exists() + summary = json.loads((results_dir / "benchmark_summary.json").read_text(encoding="utf-8")) + assert summary["primary_metric"] == "exact_match" + assert summary["per_method"]["CPG"]["exact_match"] == 0.72 + assert summary["per_method"]["CPG"]["n"] == 42 + assert summary["seed_variance"]["CPG"]["n_seeds"] == 3 + assert summary["seed_variance"]["CPG"]["per_seed"] == {"0": 0.72, "1": 0.72, "2": 0.72} + assert summary["ablations"]["CPG/no_guard"]["executed"] is True diff --git a/tests/test_vnext_manuscript.py b/tests/test_vnext_manuscript.py index 88c429f..2636c3a 100644 --- a/tests/test_vnext_manuscript.py +++ b/tests/test_vnext_manuscript.py @@ -225,7 +225,18 @@ def 
_write_complete_benchmark_packet(self): (results_dir / "ablation_table.json").write_text(json.dumps(benchmark_summary["ablation_table"]), encoding="utf-8") (results_dir / "latency_tokens_table.json").write_text(json.dumps(benchmark_summary["latency_tokens_table"]), encoding="utf-8") - def _stub_orchestra(self, state, literature_block, paper_ids, iterations, *, figures_dir, baseline, metric_name): + def _stub_orchestra( + self, + state, + literature_block, + paper_ids, + iterations, + *, + figures_dir, + baseline, + metric_name, + template_id=None, + ): figures_dir.mkdir(parents=True, exist_ok=True) (figures_dir / "fig_metric_trajectory.svg").write_text( 'metric', @@ -256,10 +267,39 @@ def _stub_orchestra(self, state, literature_block, paper_ids, iterations, *, fig "abstract": "Abstract text.", "introduction": "Introduction text with \\cite{cite_a}.", "method": "Method text.", - "experiments": "Experiments text.", + "experiments": ( + "Experiments text.\\n" + "\\begin{figure}[t]\\n" + "\\centering\\n" + "\\includegraphics[width=0.9\\linewidth]{figures/fig_metric_trajectory.svg}\\n" + "\\caption{Metric trajectory.}\\n" + "\\label{fig:metric_trajectory}\\n" + "\\end{figure}" + ), "discussion": "Discussion text.", }, - "refinement_full_text": "", + "refinement_full_text": ( + "\\documentclass{article}\n" + "\\usepackage{graphicx}\n" + "\\title{Auto Manuscript Insight}\n" + "\\begin{document}\n" + "\\maketitle\n" + "\\begin{abstract}Abstract text.\\end{abstract}\n" + "\\section{Introduction}Intro with \\cite{cite_a}.\n" + "\\section{Related Work}Related work with \\cite{cite_a}.\n" + "\\section{Method}Method text.\n" + "\\section{Experiments}Experiments text.\n" + "\\begin{figure}[t]\n" + "\\centering\n" + "\\includegraphics[width=0.9\\linewidth]{figures/fig_metric_trajectory.svg}\n" + "\\caption{Metric trajectory.}\n" + "\\label{fig:metric_trajectory}\n" + "\\end{figure}\n" + "\\section{Discussion}Discussion text.\n" + "\\bibliographystyle{plain}\n" + 
"\\bibliography{references}\n" + "\\end{document}" + ), "agentreview_worklog": [], "bibtex": "@misc{cite_a,\n title = {Verified Paper},\n author = {Author One},\n year = {2024}\n}\n", "bib_keys": ["cite_a"], @@ -283,10 +323,63 @@ def _stub_orchestra(self, state, literature_block, paper_ids, iterations, *, fig }, } + def _generate_bundle_offline(self, run_full, side_effect): + run_full.side_effect = side_effect + with mock.patch( + "agents.manuscript_submission_enrichment.apply_venue_gates_with_retry", + side_effect=lambda main_tex, **_: ( + main_tex, + {"pass": True, "final": {"pass": True, "template_id": "iclr2026"}}, + ), + ), mock.patch( + "agents.manuscript_page_budget.apply_exact_page_budget", + side_effect=lambda main_tex, *_args, **_kwargs: ( + main_tex, + {"pass": True, "main_body_pages": 9, "total_pdf_pages": 10}, + ), + ), mock.patch( + "agents.manuscript_page_budget.page_budget_blockers", + return_value=[], + ): + return generate_submission_bundle(1, bundle_formats=["conference"]) + + def _orchestra_with_full_tex(self, full_tex): + def _stub(state, literature_block, paper_ids, iterations, *, figures_dir, baseline, metric_name, template_id=None): + out = self._stub_orchestra( + state, + literature_block, + paper_ids, + iterations, + figures_dir=figures_dir, + baseline=baseline, + metric_name=metric_name, + template_id=template_id, + ) + out["refinement_full_text"] = full_tex + return out + + return _stub + @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline") def test_generate_submission_bundle_creates_verified_bundle_files_and_db_rows(self, run_full): run_full.side_effect = self._stub_orchestra - result = generate_submission_bundle(1, bundle_formats=["conference"]) + with mock.patch( + "agents.manuscript_submission_enrichment.apply_venue_gates_with_retry", + side_effect=lambda main_tex, **_: ( + main_tex, + {"pass": True, "final": {"pass": True, "template_id": "iclr2026"}}, + ), + ), mock.patch( + 
"agents.manuscript_page_budget.apply_exact_page_budget", + side_effect=lambda main_tex, *_args, **_kwargs: ( + main_tex, + {"pass": True, "main_body_pages": 9, "total_pdf_pages": 10}, + ), + ), mock.patch( + "agents.manuscript_page_budget.page_budget_blockers", + return_value=[], + ): + result = generate_submission_bundle(1, bundle_formats=["conference"]) self.assertIn("manuscript_run_id", result) self.assertEqual(result["backend"], "paper_orchestra") bundle = database.fetchone("SELECT * FROM submission_bundles WHERE manuscript_run_id=?", (result["manuscript_run_id"],)) @@ -314,6 +407,93 @@ def test_generate_submission_bundle_creates_verified_bundle_files_and_db_rows(se self.assertIn("fig_metric_trajectory.svg", main_tex) self.assertTrue((self.workspace_root / "idea_1" / "paper" / "current" / "main.tex").exists()) + @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline") + def test_generate_submission_bundle_blocks_unresolved_reference_in_rendered_tex(self, run_full): + full_tex = ( + "\\documentclass{article}\\begin{document}" + "\\begin{abstract}Clean abstract.\\end{abstract}" + "\\section{Results}Table ?? reports the result." + "\\section{Discussion}Clean discussion." + "\\end{document}" + ) + result = self._generate_bundle_offline(run_full, self._orchestra_with_full_tex(full_tex)) + + self.assertIn("error", result) + blockers = " ".join(result.get("submission_blockers") or []).lower() + self.assertIn("??", blockers) + + @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline") + def test_generate_submission_bundle_blocks_repeated_boilerplate_in_rendered_tex(self, run_full): + sentence = "This repeated boilerplate sentence should not appear in every generated paragraph." + full_tex = ( + "\\documentclass{article}\\begin{document}" + "\\begin{abstract}Clean abstract.\\end{abstract}" + "\\section{Results}" + + "\n".join([sentence] * 4) + + "\n" + + "\\section{Discussion}Clean discussion." 
+ "\\end{document}" + ) + result = self._generate_bundle_offline(run_full, self._orchestra_with_full_tex(full_tex)) + + self.assertIn("error", result) + blockers = " ".join(result.get("submission_blockers") or []).lower() + self.assertIn("boilerplate", blockers) + + @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline") + def test_generate_submission_bundle_preserves_existing_placeholder_gate(self, run_full): + full_tex = ( + "\\documentclass{article}\\begin{document}" + "\\begin{abstract}Clean abstract.\\end{abstract}" + "\\section{Results}This placeholder text must not ship." + "\\section{Discussion}Clean discussion." + "\\end{document}" + ) + result = self._generate_bundle_offline(run_full, self._orchestra_with_full_tex(full_tex)) + + self.assertIn("error", result) + blockers = " ".join(result.get("submission_blockers") or []).lower() + self.assertIn("placeholder", blockers) + + @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline") + def test_generate_submission_bundle_reports_all_latex_sanity_hits(self, run_full): + full_tex = ( + "\\documentclass{article}\\begin{document}" + "\\begin{abstract}Clean abstract.\\end{abstract}" + "\\section{Results}Table ?? appears with {{plot_cmd}}." + "\\section{Discussion}Clean discussion." + "\\end{document}" + ) + result = self._generate_bundle_offline(run_full, self._orchestra_with_full_tex(full_tex)) + + self.assertIn("error", result) + blockers = " ".join(result.get("submission_blockers") or []).lower() + self.assertIn("??", blockers) + self.assertIn("plot_cmd", blockers) + + @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline") + def test_generate_submission_bundle_blocks_cross_run_identity_in_rendered_tex(self, run_full): + full_tex = ( + "\\documentclass{article}" + "\\title{Auto Manuscript Insight}" + "\\begin{document}" + "\\maketitle" + "\\begin{abstract}Clean abstract.\\end{abstract}" + "\\section{Introduction}Intro with \\cite{cite_a}." 
+ "\\begin{figure}[t]" + "\\caption{OtherMethod trajectory.}" + "\\end{figure}" + "\\section{Discussion}Clean discussion." + "\\bibliographystyle{plain}" + "\\bibliography{references}" + "\\end{document}" + ) + result = self._generate_bundle_offline(run_full, self._orchestra_with_full_tex(full_tex)) + + self.assertIn("error", result) + blockers = " ".join(result.get("submission_blockers") or []).lower() + self.assertIn("othermethod", blockers) + def test_generate_submission_bundle_blocks_non_formal_run(self): database.execute( "UPDATE experiment_runs SET proxy_config=? WHERE id=1", @@ -371,7 +551,17 @@ def test_generate_submission_bundle_blocks_benchmark_plan_without_artifact_manif @mock.patch("agents.paper_orchestra_pipeline._run_full_pipeline") def test_generate_submission_bundle_blocks_placeholder_figure_assets(self, run_full): - def _stub_with_placeholder(state, literature_block, paper_ids, iterations, *, figures_dir, baseline, metric_name): + def _stub_with_placeholder( + state, + literature_block, + paper_ids, + iterations, + *, + figures_dir, + baseline, + metric_name, + template_id=None, + ): out = self._stub_orchestra( state, literature_block, @@ -380,6 +570,7 @@ def _stub_with_placeholder(state, literature_block, paper_ids, iterations, *, fi figures_dir=figures_dir, baseline=baseline, metric_name=metric_name, + template_id=template_id, ) (figures_dir / "fig_metric_trajectory.svg").write_text( 'Diagram placeholder: failed API figure.', @@ -389,7 +580,23 @@ def _stub_with_placeholder(state, literature_block, paper_ids, iterations, *, fi run_full.side_effect = _stub_with_placeholder - result = generate_submission_bundle(1, bundle_formats=["conference"]) + with mock.patch( + "agents.manuscript_submission_enrichment.apply_venue_gates_with_retry", + side_effect=lambda main_tex, **_: ( + main_tex, + {"pass": True, "final": {"pass": True, "template_id": "iclr2026"}}, + ), + ), mock.patch( + "agents.manuscript_page_budget.apply_exact_page_budget", + 
side_effect=lambda main_tex, *_args, **_kwargs: ( + main_tex, + {"pass": True, "main_body_pages": 9, "total_pdf_pages": 10}, + ), + ), mock.patch( + "agents.manuscript_page_budget.page_budget_blockers", + return_value=[], + ): + result = generate_submission_bundle(1, bundle_formats=["conference"]) self.assertIn("error", result) self.assertIn("placeholder", " ".join(result.get("submission_blockers") or []).lower())