diff --git a/.gitignore b/.gitignore
index 33d91b6..c8aac99 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,11 +9,19 @@ build/
 .venv/
 venv/
 .env
+.pytest_cache/
 
 # Generated eval datasets (local, not shared)
 datasets/**/*.jsonl
 datasets/**/*.json
 !datasets/.gitkeep
+!datasets/skills/dogfood/baidu-homepage/train.jsonl
+!datasets/skills/dogfood/baidu-homepage/val.jsonl
+!datasets/skills/dogfood/baidu-homepage/holdout.jsonl
+
+# Generated run artifacts
+output/
+dogfood-output/
 
 # Evolution snapshots
 snapshots/
diff --git a/README.md b/README.md
index c59f9a6..9de5629 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,9 @@ GEPA reads execution traces to understand *why* things fail (not just that they
 # Install
 git clone https://github.com/NousResearch/hermes-agent-self-evolution.git
 cd hermes-agent-self-evolution
-pip install -e ".[dev]"
+uv venv --python 3.11 .venv
+source .venv/bin/activate
+uv pip install -e '.[dev]'
 
 # Point at your hermes-agent repo
 export HERMES_AGENT_REPO=~/.hermes/hermes-agent
@@ -49,6 +51,57 @@ python -m evolution.skills.evolve_skill \
   --eval-source sessiondb
 ```
 
+## Advanced Usage
+
+### Evaluate through a real Hermes runtime
+
+```bash
+python -m evolution.skills.evolve_skill \
+  --skill dogfood \
+  --eval-source golden \
+  --dataset-path datasets/skills/dogfood/baidu-homepage \
+  --eval-backend hermes
+```
+
+### Add an optional TBLite regression gate
+
+```bash
+python -m evolution.skills.evolve_skill \
+  --skill github-code-review \
+  --eval-source synthetic \
+  --run-tblite \
+  --tblite-mode fast
+```
+
+### Generate git / PR handoff artifacts or execute them directly
+
+```bash
+python -m evolution.skills.evolve_skill \
+  --skill github-code-review \
+  --eval-source synthetic \
+  --execute-git-apply
+
+python -m evolution.skills.evolve_skill \
+  --skill github-code-review \
+  --eval-source synthetic \
+  --execute-git-apply \
+  --execute-push \
+  --execute-pr
+```
+
+Notes:
+- By default the tool loads credentials from `~/.hermes/.env` and local `.env` when present, without overwriting already exported values.
+- When the local Hermes runtime uses `model.provider: custom`, default self-evolution model settings are aligned to the active Hermes config automatically.
+- `--execute-pr` requires `--execute-push`.
+
+## Golden Dataset Sample
+
+A browser-heavy golden sample for the `dogfood` skill is included at:
+
+- `datasets/skills/dogfood/baidu-homepage/`
+
+It captures both positive paths and blockers from a real Baidu homepage dogfood run, which makes it useful for evaluating browser QA skills more realistically than purely synthetic examples.
+
 ## What It Optimizes
 
 | Phase | Target | Engine | Status |
diff --git a/datasets/skills/dogfood/baidu-homepage/README.md b/datasets/skills/dogfood/baidu-homepage/README.md
new file mode 100644
index 0000000..732ff2e
--- /dev/null
+++ b/datasets/skills/dogfood/baidu-homepage/README.md
@@ -0,0 +1,34 @@
+# Dogfood Golden Sample: Baidu Homepage
+
+This dataset incorporates the real dogfood run against https://www.baidu.com/ on 2026-04-15 into the self-evolution sample set for the `dogfood` skill.
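+
+Every committed record carries the same five keys (`task_input`, `expected_behavior`, `difficulty`, `category`, `source`). As a quick orientation, the splits can be inspected with a sketch like the one below (the helper is illustrative only, not part of the pipeline):
+
+```python
+import json
+from pathlib import Path
+
+# Print a compact view of every golden case in one split.
+split = Path("datasets/skills/dogfood/baidu-homepage/train.jsonl")
+for line in split.read_text(encoding="utf-8").splitlines():
+    record = json.loads(line)
+    # Keys mirror the committed JSONL records.
+    print(f"[{record['difficulty']}/{record['category']}] {record['task_input'][:48]}")
+```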
+ +## Source Artifacts + +- Source report: `datasets/skills/dogfood/baidu-homepage/source_report.md` +- Dataset directory: `datasets/skills/dogfood/baidu-homepage` + +## What This Sample Covers + +- Homepage load health and console cleanliness +- Search submission flow +- Search suggestion relevance +- Wenxin assistant entry and back-navigation chain +- Top-nav News entry health + +## Why It Matters + +This sample gives `dogfood` a real browser-heavy golden set with both: + +- **positive paths**: homepage load, Wenxin single-turn QA, News page load +- **negative/blocking paths**: search flow interrupted by Baidu security verification, unrelated suggestions, unstable back-navigation chain + +That makes it more useful than a purely synthetic sample when evaluating whether the evolved skill: + +1. tests the intended user flows, +2. distinguishes blockers from non-blockers, +3. captures evidence correctly, and +4. writes a balanced QA report with both working and broken paths. + +## Notes + +- The source report is stored in-repo as text only; screenshot binaries from the original run are intentionally not committed. diff --git a/datasets/skills/dogfood/baidu-homepage/holdout.jsonl b/datasets/skills/dogfood/baidu-homepage/holdout.jsonl new file mode 100644 index 0000000..dbf0821 --- /dev/null +++ b/datasets/skills/dogfood/baidu-homepage/holdout.jsonl @@ -0,0 +1 @@ +{"task_input": "用 dogfood 测这个网站:https://www.baidu.com/,重点看顶部导航中的“新闻”入口,并总结哪些路径正常、哪些路径被阻断。", "expected_behavior": "应验证“新闻”入口能否正常打开百度新闻页,并区分成功路径与失败路径:例如新闻页应被记录为正常打开、无明显布局异常或 console 错误;同时如果搜索主流程被安全验证打断,也应在总结中列为 blocker,而不是把所有路径都误判为失败。", "difficulty": "hard", "category": "navigation-health", "source": "golden"} diff --git a/datasets/skills/dogfood/baidu-homepage/source_report.md b/datasets/skills/dogfood/baidu-homepage/source_report.md new file mode 100644 index 0000000..8dea480 --- /dev/null +++ b/datasets/skills/dogfood/baidu-homepage/source_report.md @@ -0,0 +1,165 @@ +# Dogfood QA Report + +**Target:** https://www.baidu.com/ +**Date:** 2026-04-15 +**Scope:** 百度首页桌面站小样本探索式测试:首页加载、顶部导航、搜索输入与提交主流程、文心助手入口、百度新闻入口。 +**Tester:** Hermes Agent (automated exploratory QA) + +--- + +## Executive Summary + +| Severity | Count | +|----------|-------| +| 🔴 Critical | 0 | +| 🟠 High | 1 | +| 🟡 Medium | 2 | +| 🔵 Low | 0 | +| **Total** | **3** | + +**Overall Assessment:** 百度首页与主要入口整体可用,但搜索主流程触发安全验证拦截,且搜索联想与返回链路存在可用性问题,影响无登录/自动化场景下的连续使用体验。 + +--- + +## Issues + +### Issue #1: 首页搜索主流程被安全验证拦截,无法直接进入结果页 + +| Field | Value | +|-------|-------| +| **Severity** | High | +| **Category** | Functional | +| **URL** | https://www.baidu.com/ | + +**Description:** +在首页输入测试关键词并提交后,未直接进入正常搜索结果页,而是被“百度安全验证”拦截。页面要求用户完成“拖动左侧滑块使图片为正”的图片旋转验证,导致标准搜索主流程中断。对于自动化代理、辅助技术用户或希望快速搜索的用户来说,这属于明显阻断。 + +**Steps to Reproduce:** +1. 打开 https://www.baidu.com/ +2. 在首页搜索框输入“Hermes Agent dogfood 测试” +3. 按 Enter 提交搜索 + +**Expected Behavior:** +直接进入对应关键词的搜索结果页,用户可以继续浏览结果。 + +**Actual Behavior:** +页面跳转到“百度安全验证”,要求完成滑块旋转图片验证后才能继续,正常搜索结果未展示。 + +**Screenshot:** +Original screenshot captured during the source run (binary not committed in this repo). + +**Console Errors** (if applicable): +```text +None observed. 
+``` + +--- + +### Issue #2: 搜索联想词与已输入查询明显不相关 + +| Field | Value | +|-------|-------| +| **Severity** | Medium | +| **Category** | UX | +| **URL** | https://www.baidu.com/ | + +**Description:** +在首页搜索框输入“Hermes Agent dogfood 测试”后,下拉联想建议并未围绕完整查询或“dogfood 测试”意图展开,而是出现大量泛化的英文品牌/词条,如 “hermes tracking”、“hermes track”、“hermes trismegistus”等。这种联想结果与当前查询意图偏差较大,容易误导用户点击到无关搜索方向。 + +**Steps to Reproduce:** +1. 打开 https://www.baidu.com/ +2. 在首页搜索框输入“Hermes Agent dogfood 测试” +3. 观察联想词下拉列表 + +**Expected Behavior:** +联想词应尽量贴近当前完整查询,或至少与“Agent / dogfood / 测试”意图相关。 + +**Actual Behavior:** +联想词主要围绕泛化的“Hermes”品牌/英文词条展开,与完整查询相关性较弱。 + +**Screenshot:** +Original screenshot captured during the source run (binary not committed in this repo). + +**Console Errors** (if applicable): +```text +None observed. +``` + +--- + +### Issue #3: 从文心助手页使用返回操作未能回到百度首页,历史链路表现不稳定 + +| Field | Value | +|-------|-------| +| **Severity** | Medium | +| **Category** | Functional | +| **URL** | https://chat.baidu.com/?enter_type=home_operate | + +**Description:** +从百度首页点击“复杂问题就找文心助手”进入文心助手页后,使用浏览器后退操作时,并未顺利回到百度首页,而是停留在文心相关页面。对用户来说,这会造成页面链路理解困难;对自动化工作流来说,也会增加 flow 恢复成本。 + +**Steps to Reproduce:** +1. 打开 https://www.baidu.com/ +2. 点击“复杂问题就找文心助手,深入思考回答更优”入口 +3. 在文心助手页面执行浏览器后退 + +**Expected Behavior:** +后退应返回原始百度首页。 + +**Actual Behavior:** +后退后仍停留在文心相关页面,未恢复到首页,需要重新导航到百度首页。 + +**Screenshot:** +Original screenshot captured during the source run (binary not committed in this repo). + +**Console Errors** (if applicable): +```text +None observed. +``` + +--- + +## Issues Summary Table + +| # | Title | Severity | Category | URL | +|---|-------|----------|----------|-----| +| 1 | 首页搜索主流程被安全验证拦截,无法直接进入结果页 | High | Functional | https://www.baidu.com/ | +| 2 | 搜索联想词与已输入查询明显不相关 | Medium | UX | https://www.baidu.com/ | +| 3 | 从文心助手页使用返回操作未能回到百度首页,历史链路表现不稳定 | Medium | Functional | https://chat.baidu.com/?enter_type=home_operate | + +## Testing Coverage + +### Pages Tested +- 百度首页(https://www.baidu.com/) +- 百度安全验证页(搜索后触发) +- 文心助手入口页 / 对话页(https://chat.baidu.com/) +- 百度新闻页(http://news.baidu.com) + +### Features Tested +- 首页加载与视觉检查 +- 浏览器 console 基础检查 +- 首页搜索输入与提交 +- 搜索联想词观察 +- 文心助手入口跳转 +- 文心助手单轮提问与回答返回 +- 顶部“新闻”导航入口跳转 + +### Not Tested / Out of Scope +- 登录流程 +- 图片、视频、地图、贴吧、网盘、文库等其余顶部入口的深入测试 +- 首页“设置”菜单展开行为 +- 热搜条目逐条点击验证 +- 移动端布局与响应式行为 +- 安全验证滑块的人工完成与验证后结果页质量 + +### Blockers +- 搜索主流程被百度安全验证拦截,无法在当前会话中继续检查正常搜索结果页的相关性、结果布局与分页链路。 + +--- + +## Notes + +1. 百度首页本身在首屏加载、布局和视觉呈现上表现稳定,未见明显白屏、JS 报错或布局错位。 +2. 文心助手入口可正常打开,且无需登录即可完成单轮问答,这一入口的可用性较好。 +3. 百度新闻页正常打开,说明顶部导航至少部分入口工作正常。 +4. 
本次最主要的问题集中在“主搜索流程被风控打断”和“返回链路不稳定”,这两点对真实 end-to-end 体验影响最大。 diff --git a/datasets/skills/dogfood/baidu-homepage/train.jsonl b/datasets/skills/dogfood/baidu-homepage/train.jsonl new file mode 100644 index 0000000..14a6996 --- /dev/null +++ b/datasets/skills/dogfood/baidu-homepage/train.jsonl @@ -0,0 +1,2 @@ +{"task_input": "用 dogfood 测这个网站:https://www.baidu.com/,重点看首页搜索输入和提交主流程。", "expected_behavior": "应导航到百度首页,检查 console 与首屏状态,在搜索框输入明确测试词并提交;若出现百度安全验证,应将其识别为高严重级别的 Functional blocker,记录验证文案、触发步骤、结果页未展示这一事实,并附截图证据。", "difficulty": "medium", "category": "search-flow", "source": "golden"} +{"task_input": "用 dogfood 测这个网站:https://www.baidu.com/,重点看首页搜索联想词是否贴合查询意图。", "expected_behavior": "应在首页搜索框输入具有明确意图的测试查询,观察下拉联想词,并判断其是否与完整查询相关;若联想词大量偏向泛化品牌词而非当前测试意图,应记录为中等级别 UX 问题,说明误导风险并保存截图。", "difficulty": "medium", "category": "search-suggestions", "source": "golden"} diff --git a/datasets/skills/dogfood/baidu-homepage/val.jsonl b/datasets/skills/dogfood/baidu-homepage/val.jsonl new file mode 100644 index 0000000..3abeeec --- /dev/null +++ b/datasets/skills/dogfood/baidu-homepage/val.jsonl @@ -0,0 +1 @@ +{"task_input": "用 dogfood 测这个网站:https://www.baidu.com/,重点看“复杂问题就找文心助手”入口,以及从该页返回首页的链路。", "expected_behavior": "应点击文心助手入口,验证页面是否正常打开、是否能在未登录状态下完成至少一轮问答,再执行返回操作;若返回未恢复到原始百度首页,应记录为中等级别 Functional 问题,同时注明文心页面本身可用、无明显 console 错误或登录阻断。", "difficulty": "hard", "category": "subpage-flow", "source": "golden"} diff --git a/evolution/core/__init__.py b/evolution/core/__init__.py index 0e8396a..f941dca 100644 --- a/evolution/core/__init__.py +++ b/evolution/core/__init__.py @@ -1,3 +1,25 @@ """Core infrastructure shared across all evolution phases.""" from evolution.core.config import EvolutionConfig, get_hermes_agent_path +from evolution.core.benchmark_gate import TBLiteGateResult, run_tblite_benchmark_gate +from evolution.core.git_pr_automation import ( + build_evolution_branch_name, + build_git_apply_plan, + build_target_skill_path, + write_git_apply_plan_artifacts, + write_git_pr_automation_artifacts, + write_skill_patch_artifacts, +) +from evolution.core.report_artifact import ( + build_diff_summary, + build_evolution_report, + build_github_pr_body, + build_github_pr_title, + build_gh_pr_create_command, + build_pr_draft, + build_review_checklist, + summarize_recommendation, + write_github_pr_artifacts, + write_pr_ready_artifacts, + write_report_artifacts, +) diff --git a/evolution/core/benchmark_gate.py b/evolution/core/benchmark_gate.py new file mode 100644 index 0000000..40d3a05 --- /dev/null +++ b/evolution/core/benchmark_gate.py @@ -0,0 +1,232 @@ +"""Helpers for running benchmark regression gates against a local Hermes checkout.""" + +from __future__ import annotations + +import re +import shutil +import subprocess +import tempfile +from dataclasses import dataclass +from pathlib import Path + +import yaml + +from evolution.core.hermes_eval import build_skill_system_prompt + + +DEFAULT_TBLITE_CONFIG = Path("environments/benchmarks/tblite/local.yaml") +DEFAULT_TBLITE_SCRIPT = Path("environments/benchmarks/tblite/tblite_env.py") +DEFAULT_FAST_TBLITE_TASK_FILTER = "broken-python,pandas-etl" + + +@dataclass +class TBLiteGatePlan: + mode: str + base_config_path: Path + task_filter: str | None + benchmark_script_path: Path + + +@dataclass +class TBLiteGateResult: + passed: bool + baseline_pass_rate: float + evolved_pass_rate: float + delta: float + threshold: float + summary: str + baseline_stdout: str + evolved_stdout: str + mode: str + task_filter: str | None + base_config_path: Path + + +def 
parse_tblite_pass_rate(stdout: str) -> float: + """Extract the overall pass rate from a TBLite/TB2 evaluation stdout string.""" + match = re.search(r"Overall Pass Rate:\s*([0-9]*\.?[0-9]+)", stdout) + if not match: + raise ValueError("Could not find 'Overall Pass Rate' in benchmark output") + return float(match.group(1)) + + +def resolve_tblite_gate_plan( + *, + mode: str, + hermes_repo: str | Path, + task_filter: str | None = None, + base_config_path: str | Path | None = None, + benchmark_script_path: str | Path | None = None, +) -> TBLiteGatePlan: + """Resolve the concrete config/script/filter used for a TBLite gate run.""" + hermes_repo_path = Path(hermes_repo).expanduser().resolve() + + normalized_mode = mode.lower().strip() + if normalized_mode not in {"fast", "full"}: + raise ValueError(f"Unsupported TBLite gate mode: {mode}") + + default_config = ( + hermes_repo_path / DEFAULT_TBLITE_CONFIG + if normalized_mode == "fast" + else hermes_repo_path / Path("environments/benchmarks/tblite/default.yaml") + ) + default_filter = DEFAULT_FAST_TBLITE_TASK_FILTER if normalized_mode == "fast" else None + + if base_config_path is None: + resolved_config = default_config + else: + supplied = Path(base_config_path) + resolved_config = supplied if supplied.is_absolute() else hermes_repo_path / supplied + + if benchmark_script_path is None: + resolved_script = hermes_repo_path / DEFAULT_TBLITE_SCRIPT + else: + supplied_script = Path(benchmark_script_path) + resolved_script = supplied_script if supplied_script.is_absolute() else hermes_repo_path / supplied_script + + return TBLiteGatePlan( + mode=normalized_mode, + base_config_path=resolved_config, + task_filter=task_filter if task_filter is not None else default_filter, + benchmark_script_path=resolved_script, + ) + + +def _write_benchmark_config( + *, + base_config_path: Path, + system_prompt: str, + output_dir: Path, + label: str, +) -> Path: + base_config = yaml.safe_load(base_config_path.read_text()) or {} + env_cfg = dict(base_config.get("env") or {}) + env_cfg["system_prompt"] = system_prompt + env_cfg["use_wandb"] = False + env_cfg["data_dir_to_save_evals"] = str(output_dir / f"tblite-{label}") + base_config["env"] = env_cfg + + config_path = output_dir / f"tblite_{label}.yaml" + config_path.write_text(yaml.safe_dump(base_config, sort_keys=False)) + return config_path + + +def _run_tblite_eval( + *, + hermes_repo: Path, + benchmark_script_path: Path, + config_path: Path, + task_filter: str | None, +) -> str: + command = [ + shutil.which("python") or "python", + str(benchmark_script_path), + "evaluate", + "--config", + str(config_path), + ] + if task_filter: + command.extend(["--env.task_filter", task_filter]) + + result = subprocess.run( + command, + capture_output=True, + text=True, + timeout=3600, + cwd=str(hermes_repo), + ) + if result.returncode != 0: + raise RuntimeError( + "TBLite evaluation failed with exit code " + f"{result.returncode}: {result.stderr.strip() or result.stdout.strip()}" + ) + return result.stdout + + +def run_tblite_benchmark_gate( + *, + skill_name: str, + baseline_skill_body: str, + evolved_skill_body: str, + hermes_repo: str | Path, + regression_threshold: float = 0.02, + task_filter: str | None = None, + base_config_path: str | Path | None = None, + benchmark_script_path: str | Path | None = None, + mode: str = "fast", +) -> TBLiteGateResult: + """Run a TBLite regression gate against baseline and evolved skill prompts.""" + hermes_repo_path = Path(hermes_repo).expanduser().resolve() + plan = 
resolve_tblite_gate_plan( + mode=mode, + hermes_repo=hermes_repo_path, + task_filter=task_filter, + base_config_path=base_config_path, + benchmark_script_path=benchmark_script_path, + ) + + baseline_prompt, _ = build_skill_system_prompt( + skill_name, + hermes_repo=hermes_repo_path, + skill_body_override=baseline_skill_body, + ) + evolved_prompt, _ = build_skill_system_prompt( + skill_name, + hermes_repo=hermes_repo_path, + skill_body_override=evolved_skill_body, + ) + + with tempfile.TemporaryDirectory(prefix="tblite-gate-") as tmpdir: + output_dir = Path(tmpdir) + baseline_config = _write_benchmark_config( + base_config_path=plan.base_config_path, + system_prompt=baseline_prompt, + output_dir=output_dir, + label="baseline", + ) + evolved_config = _write_benchmark_config( + base_config_path=plan.base_config_path, + system_prompt=evolved_prompt, + output_dir=output_dir, + label="evolved", + ) + + baseline_stdout = _run_tblite_eval( + hermes_repo=hermes_repo_path, + benchmark_script_path=plan.benchmark_script_path, + config_path=baseline_config, + task_filter=plan.task_filter, + ) + evolved_stdout = _run_tblite_eval( + hermes_repo=hermes_repo_path, + benchmark_script_path=plan.benchmark_script_path, + config_path=evolved_config, + task_filter=plan.task_filter, + ) + + baseline_pass_rate = parse_tblite_pass_rate(baseline_stdout) + evolved_pass_rate = parse_tblite_pass_rate(evolved_stdout) + delta = evolved_pass_rate - baseline_pass_rate + passed = delta >= -regression_threshold + + summary = ( + f"TBLite {plan.mode} gate passed: baseline={baseline_pass_rate:.4f}, evolved={evolved_pass_rate:.4f}, " + f"delta={delta:+.4f}, threshold=-{regression_threshold:.4f}" + if passed + else f"TBLite {plan.mode} regression detected: baseline={baseline_pass_rate:.4f}, evolved={evolved_pass_rate:.4f}, " + f"delta={delta:+.4f}, threshold=-{regression_threshold:.4f}" + ) + + return TBLiteGateResult( + passed=passed, + baseline_pass_rate=baseline_pass_rate, + evolved_pass_rate=evolved_pass_rate, + delta=delta, + threshold=regression_threshold, + summary=summary, + baseline_stdout=baseline_stdout, + evolved_stdout=evolved_stdout, + mode=plan.mode, + task_filter=plan.task_filter, + base_config_path=plan.base_config_path, + ) diff --git a/evolution/core/git_pr_automation.py b/evolution/core/git_pr_automation.py new file mode 100644 index 0000000..df98c06 --- /dev/null +++ b/evolution/core/git_pr_automation.py @@ -0,0 +1,375 @@ +"""Helpers for turning evolved skill artifacts into git/GitHub automation plans.""" + +from __future__ import annotations + +import json +import re +import shlex +import subprocess +from pathlib import Path +from typing import Any, Callable + +from evolution.core.report_artifact import build_github_pr_title, summarize_recommendation + +TerminalRunner = Callable[[str], dict[str, Any]] + + +def _slugify_skill_name(skill_name: str) -> str: + slug = re.sub(r"[^a-zA-Z0-9]+", "-", skill_name).strip("-").lower() + return slug or "skill" + + +def _skills_root(hermes_repo: str | Path) -> Path: + repo = Path(hermes_repo).expanduser().resolve() + return (repo / "skills").resolve(strict=False) + + +def _ensure_within_skills_root(*, hermes_repo: str | Path, target_path: str | Path) -> Path: + repo = Path(hermes_repo).expanduser().resolve() + skills_root = _skills_root(repo) + target = Path(target_path).expanduser() + if not target.is_absolute(): + target = repo / target + target = target.resolve(strict=False) + try: + target.relative_to(skills_root) + except ValueError as exc: + raise 
ValueError(f"target skill path must stay within {skills_root}") from exc + return target + + +def _default_runner(command: str, *, workdir: str | Path | None = None) -> dict[str, Any]: + completed = subprocess.run( + command, + shell=True, + cwd=str(workdir) if workdir is not None else None, + capture_output=True, + text=True, + ) + return { + "exit_code": completed.returncode, + "output": (completed.stdout or "") + (completed.stderr or ""), + } + + +def _run_step( + *, + name: str, + command: str, + workdir: str | Path, + runner: Callable[..., dict[str, Any]], +) -> dict[str, Any]: + result = runner(command, workdir=workdir) + exit_code = result.get("exit_code", 1) + output = result.get("output", "") + if exit_code != 0: + raise RuntimeError(f"{name} failed ({exit_code}): {output}") + return { + "name": name, + "command": command, + "exit_code": exit_code, + "output": output, + } + + +def build_evolution_branch_name(*, skill_name: str, timestamp: str) -> str: + """Build a deterministic branch name for an evolution candidate.""" + return f"evolution/{_slugify_skill_name(skill_name)}-{timestamp}" + + +def build_target_skill_path(*, hermes_repo: str | Path, skill_relpath: str) -> Path: + """Resolve the target skill path inside a hermes-agent checkout.""" + relative_path = Path(skill_relpath) + if relative_path.is_absolute(): + raise ValueError("skill_relpath must be relative to the skills directory") + return _ensure_within_skills_root( + hermes_repo=hermes_repo, + target_path=Path("skills") / relative_path, + ) + + +def write_skill_patch_artifacts( + *, + output_dir: str | Path, + skill_relpath: str, + evolved_skill_text: str, + hermes_repo: str | Path, +) -> dict[str, Path]: + """Write a candidate skill file plus a manifest describing where it should land.""" + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + candidate_skill_file = out_dir / "candidate_skill_patch.md" + candidate_skill_file.write_text(evolved_skill_text) + + target_skill_path = build_target_skill_path(hermes_repo=hermes_repo, skill_relpath=skill_relpath) + git_patch_manifest_json = out_dir / "git_patch_manifest.json" + git_patch_manifest_json.write_text(json.dumps({ + "skill_relpath": skill_relpath, + "target_skill_path": str(target_skill_path), + "candidate_skill_file": str(candidate_skill_file), + }, indent=2)) + + return { + "candidate_skill_file": candidate_skill_file, + "git_patch_manifest_json": git_patch_manifest_json, + "target_skill_path": target_skill_path, + } + + +def build_git_apply_plan( + *, + hermes_repo: str | Path, + branch_name: str, + target_skill_path: str | Path, + candidate_skill_file: str | Path, + commit_message: str, + push_remote: str = "origin", +) -> str: + """Build a shell plan that applies the evolved skill in a hermes repo.""" + repo = Path(hermes_repo).expanduser().resolve() + target = _ensure_within_skills_root(hermes_repo=repo, target_path=target_skill_path) + candidate = Path(candidate_skill_file).expanduser().resolve(strict=False) + + return "\n".join([ + "#!/usr/bin/env bash", + "set -euo pipefail", + f"cd {shlex.quote(str(repo))}", + f"git checkout -b {shlex.quote(branch_name)}", + f"mkdir -p {shlex.quote(str(target.parent))}", + f"cp {shlex.quote(str(candidate))} {shlex.quote(str(target))}", + f"git add {shlex.quote(str(target))}", + f"git commit -m {shlex.quote(commit_message)}", + f"git push -u {shlex.quote(push_remote)} {shlex.quote(branch_name)}", + "", + ]) + + +def write_git_apply_plan_artifacts( + *, + output_dir: str | Path, + hermes_repo: str | 
Path, + branch_name: str, + target_skill_path: str | Path, + candidate_skill_file: str | Path, + commit_message: str, + push_remote: str = "origin", +) -> dict[str, Path]: + """Write shell and markdown plans for applying the evolved skill via git.""" + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + git_apply_plan_sh = out_dir / "git_apply_plan.sh" + git_apply_plan_md = out_dir / "git_apply_plan.md" + + shell_plan = build_git_apply_plan( + hermes_repo=hermes_repo, + branch_name=branch_name, + target_skill_path=target_skill_path, + candidate_skill_file=candidate_skill_file, + commit_message=commit_message, + push_remote=push_remote, + ) + git_apply_plan_sh.write_text(shell_plan) + git_apply_plan_md.write_text( + "\n".join([ + "# Git Apply Plan", + "", + f"- Hermes repo: `{hermes_repo}`", + f"- Branch: `{branch_name}`", + f"- Target skill path: `{target_skill_path}`", + f"- Candidate file: `{candidate_skill_file}`", + f"- Commit message: `{commit_message}`", + "", + "## Shell Plan", + "```bash", + shell_plan.rstrip(), + "```", + "", + ]) + ) + + return { + "git_apply_plan_sh": git_apply_plan_sh, + "git_apply_plan_md": git_apply_plan_md, + } + + +def write_git_pr_automation_artifacts( + *, + output_dir: str | Path, + metrics: dict[str, Any], + hermes_repo: str | Path, + skill_relpath: str, + evolved_skill_text: str, + github_pr_body_path: str | Path, + push_remote: str = "origin", +) -> dict[str, Path | None]: + """Write the full git/GitHub handoff pack for applying an evolved skill.""" + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + patch_paths = write_skill_patch_artifacts( + output_dir=out_dir, + skill_relpath=skill_relpath, + evolved_skill_text=evolved_skill_text, + hermes_repo=hermes_repo, + ) + + branch_name = build_evolution_branch_name( + skill_name=metrics.get("skill_name", "skill"), + timestamp=metrics.get("timestamp", "unknown"), + ) + commit_message = build_github_pr_title(metrics) + plan_paths = write_git_apply_plan_artifacts( + output_dir=out_dir, + hermes_repo=hermes_repo, + branch_name=branch_name, + target_skill_path=patch_paths["target_skill_path"], + candidate_skill_file=patch_paths["candidate_skill_file"], + commit_message=commit_message, + push_remote=push_remote, + ) + + decision = summarize_recommendation(metrics)["decision"] + gh_pr_create_after_push_txt = out_dir / "gh_pr_create_after_push.txt" + if decision == "reject": + command_path: Path | None = None + if gh_pr_create_after_push_txt.exists(): + gh_pr_create_after_push_txt.unlink() + else: + draft_flag = " --draft" if decision == "review_needed" else "" + command = ( + f"cd {shlex.quote(str(hermes_repo))} && " + f"gh pr create --base main --head {shlex.quote(branch_name)} " + f"--title {shlex.quote(commit_message)} " + f"--body-file {shlex.quote(str(github_pr_body_path))}{draft_flag}" + ) + gh_pr_create_after_push_txt.write_text(command + "\n") + command_path = gh_pr_create_after_push_txt + + return { + "candidate_skill_file": patch_paths["candidate_skill_file"], + "git_patch_manifest_json": patch_paths["git_patch_manifest_json"], + "target_skill_path": patch_paths["target_skill_path"], + "git_apply_plan_sh": plan_paths["git_apply_plan_sh"], + "git_apply_plan_md": plan_paths["git_apply_plan_md"], + "gh_pr_create_after_push_txt": command_path, + } + + +def execute_git_apply_plan( + *, + hermes_repo: str | Path, + branch_name: str, + target_skill_path: str | Path, + candidate_skill_file: str | Path, + commit_message: str, + push_remote: str = "origin", + 
run_push: bool = True, + runner: Callable[..., dict[str, Any]] | None = None, +) -> dict[str, Any]: + """Execute the git apply plan step by step with an injectable runner.""" + run = runner or _default_runner + repo = Path(hermes_repo).expanduser().resolve() + target = _ensure_within_skills_root(hermes_repo=repo, target_path=target_skill_path) + candidate = Path(candidate_skill_file).expanduser().resolve(strict=False) + + steps = [ + _run_step( + name="checkout_branch", + command=f"git checkout -b {shlex.quote(branch_name)}", + workdir=repo, + runner=run, + ), + _run_step( + name="ensure_target_dir", + command=f"mkdir -p {shlex.quote(str(target.parent))}", + workdir=repo, + runner=run, + ), + _run_step( + name="copy_candidate_skill", + command=f"cp {shlex.quote(str(candidate))} {shlex.quote(str(target))}", + workdir=repo, + runner=run, + ), + _run_step( + name="git_add", + command=f"git add {shlex.quote(str(target))}", + workdir=repo, + runner=run, + ), + _run_step( + name="git_commit", + command=f"git commit -m {shlex.quote(commit_message)}", + workdir=repo, + runner=run, + ), + ] + if run_push: + steps.append( + _run_step( + name="push_branch", + command=f"git push -u {shlex.quote(push_remote)} {shlex.quote(branch_name)}", + workdir=repo, + runner=run, + ) + ) + + return { + "executed": True, + "steps": steps, + } + + +def execute_gh_pr_create( + *, + command: str | None, + hermes_repo: str | Path, + runner: Callable[..., dict[str, Any]] | None = None, +) -> dict[str, Any]: + """Execute a gh pr create command when present.""" + if not command: + return {"executed": False, "reason": "no_command"} + + run = runner or _default_runner + result = run(command, workdir=hermes_repo) + exit_code = result.get("exit_code", 1) + output = result.get("output", "") + if exit_code != 0: + raise RuntimeError(f"gh_pr_create failed ({exit_code}): {output}") + return { + "executed": True, + "command": command, + "exit_code": exit_code, + "output": output, + } + + +def execute_git_pr_automation( + *, + git_apply_plan: dict[str, Any], + gh_pr_create_command: str | None, + execute_push: bool, + execute_pr: bool, + execute_git_apply_plan_fn=execute_git_apply_plan, + execute_gh_pr_create_fn=execute_gh_pr_create, +) -> dict[str, Any]: + """Execute the real git apply flow, optionally followed by PR creation.""" + git_result = execute_git_apply_plan_fn( + **git_apply_plan, + run_push=execute_push, + ) + + pr_result = None + if execute_pr and gh_pr_create_command: + pr_result = execute_gh_pr_create_fn( + command=gh_pr_create_command, + hermes_repo=git_apply_plan["hermes_repo"], + ) + + return { + "git": git_result, + "pr": pr_result, + } diff --git a/evolution/core/hermes_eval.py b/evolution/core/hermes_eval.py new file mode 100644 index 0000000..007bc4e --- /dev/null +++ b/evolution/core/hermes_eval.py @@ -0,0 +1,179 @@ +"""Helpers for evaluating skills with a real Hermes Agent instance. + +This bridges hermes-agent-self-evolution to the actual Hermes runtime instead of +only evaluating a DSPy skill wrapper in isolation. 
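+
+A short usage sketch (assumes a valid local hermes-agent checkout is configured;
+the model name and task are illustrative only):
+
+    case = HermesSkillEvalCase(skill_name="dogfood", task_input="Check the homepage")
+    result = run_skill_eval(case, model="openai/gpt-4.1-mini")
+    print(result.final_response)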
+""" + +from __future__ import annotations + +import importlib.util +import sys +from contextlib import contextmanager +from dataclasses import dataclass +from pathlib import Path +from types import ModuleType, SimpleNamespace +from typing import Any, Optional + +from evolution.core.config import get_hermes_agent_path + + +@dataclass +class HermesSkillEvalCase: + """Single evaluation case for a skill using a real Hermes Agent.""" + + skill_name: str + task_input: str + system_prompt: str = "" + conversation_history: Optional[list[dict[str, Any]]] = None + + +@dataclass +class HermesSkillEvalResult: + """Result of a real Hermes Agent skill evaluation run.""" + + final_response: str + loaded_skills: list[str] + effective_system_prompt: str + raw_result: Any + + +def _import_module_from_path(module_name: str, path: Path) -> ModuleType: + spec = importlib.util.spec_from_file_location(module_name, path) + if spec is None or spec.loader is None: + raise ImportError(f"Could not load module {module_name} from {path}") + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + +@contextmanager +def _temporary_hermes_import_shims(repo: Path | None = None): + """Provide tiny stubs and import path support for dynamic Hermes imports.""" + installed: list[str] = [] + inserted_repo = False + + if repo is not None: + repo_str = str(repo) + if repo_str not in sys.path: + sys.path.insert(0, repo_str) + inserted_repo = True + + if "fire" not in sys.modules: + fire_stub = ModuleType("fire") + + def _fire(*args, **kwargs): + return None + + fire_stub.Fire = _fire + sys.modules["fire"] = fire_stub + installed.append("fire") + + try: + yield + finally: + for module_name in installed: + sys.modules.pop(module_name, None) + if inserted_repo: + try: + sys.path.remove(repo_str) + except ValueError: + pass + + +def _load_hermes_symbols(hermes_repo: str | Path | None = None) -> SimpleNamespace: + repo = Path(hermes_repo).expanduser() if hermes_repo else get_hermes_agent_path() + repo = repo.resolve() + + with _temporary_hermes_import_shims(repo): + run_agent_module = _import_module_from_path( + "_self_evolution_run_agent", + repo / "run_agent.py", + ) + skill_commands_module = _import_module_from_path( + "_self_evolution_skill_commands", + repo / "agent" / "skill_commands.py", + ) + + return SimpleNamespace( + AIAgent=run_agent_module.AIAgent, + build_preloaded_skills_prompt=skill_commands_module.build_preloaded_skills_prompt, + ) + + +def build_skill_system_prompt( + skill_name: str, + hermes_repo: str | Path | None = None, + skill_body_override: str | None = None, +) -> tuple[str, list[str]]: + """Load a Hermes skill the same way the CLI preloads it for a session. + + When ``skill_body_override`` is provided, inline it as the active skill body so + we can evaluate candidate variants with the real Hermes runtime before they are + written back into the target repository. + """ + if skill_body_override is not None: + prompt = ( + f'[SYSTEM: The user launched this evaluation session with the "{skill_name}" skill ' + "preloaded. 
Treat its instructions as active guidance for the duration of this " + "session unless overridden.]\n\n" + f"# Active Skill: {skill_name}\n\n{skill_body_override}" + ) + return prompt, [skill_name] + + symbols = _load_hermes_symbols(hermes_repo) + skills_prompt, loaded_skills, missing_skills = symbols.build_preloaded_skills_prompt([skill_name]) + if missing_skills: + missing_display = ", ".join(missing_skills) + raise ValueError(f"Unknown skill(s): {missing_display}") + if not skills_prompt: + raise ValueError(f"Failed to build prompt for skill: {skill_name}") + return skills_prompt, loaded_skills + + +def run_skill_eval( + case: HermesSkillEvalCase, + *, + model: str = "openai/gpt-4.1-mini", + hermes_repo: str | Path | None = None, + agent_kwargs: Optional[dict[str, Any]] = None, + skill_body_override: str | None = None, +) -> HermesSkillEvalResult: + """Run one evaluation case through a real Hermes Agent instance.""" + symbols = _load_hermes_symbols(hermes_repo) + skills_prompt, loaded_skills = build_skill_system_prompt( + case.skill_name, + hermes_repo=hermes_repo, + skill_body_override=skill_body_override, + ) + effective_system_prompt = "\n\n".join( + part for part in (case.system_prompt, skills_prompt) if part + ).strip() + + merged_agent_kwargs = { + "model": model, + "quiet_mode": True, + "skip_context_files": True, + "skip_memory": True, + "ephemeral_system_prompt": effective_system_prompt or None, + } + if agent_kwargs: + merged_agent_kwargs.update(agent_kwargs) + + agent = symbols.AIAgent(**merged_agent_kwargs) + raw_result = agent.run_conversation( + user_message=case.task_input, + conversation_history=case.conversation_history, + ) + + if isinstance(raw_result, dict): + final_response = str(raw_result.get("final_response", "")) + else: + final_response = str(raw_result) + + return HermesSkillEvalResult( + final_response=final_response, + loaded_skills=loaded_skills, + effective_system_prompt=effective_system_prompt, + raw_result=raw_result, + ) diff --git a/evolution/core/report_artifact.py b/evolution/core/report_artifact.py new file mode 100644 index 0000000..444852b --- /dev/null +++ b/evolution/core/report_artifact.py @@ -0,0 +1,352 @@ +"""Helpers for generating structured evolution report artifacts.""" + +from __future__ import annotations + +import json +import shlex +from pathlib import Path +from typing import Any + + +def summarize_recommendation(metrics: dict[str, Any]) -> dict[str, Any]: + """Derive a human-review recommendation from holdout + gate metrics.""" + reasons: list[str] = [] + tblite_gate = metrics.get("tblite_gate") + + if tblite_gate and not tblite_gate.get("passed", False): + decision = "reject" + reasons.append("benchmark_gate_failed") + elif metrics.get("improvement", 0.0) > 0: + decision = "candidate_for_review" + reasons.append("holdout_improved") + if tblite_gate and tblite_gate.get("passed"): + reasons.append("benchmark_gate_passed") + else: + decision = "review_needed" + reasons.append("no_holdout_improvement") + if tblite_gate and tblite_gate.get("passed"): + reasons.append("benchmark_gate_passed") + + return { + "skill_name": metrics.get("skill_name"), + "decision": decision, + "reasons": reasons, + "improvement": metrics.get("improvement"), + "tblite_gate": tblite_gate, + } + + +def build_evolution_report( + *, + metrics: dict[str, Any], + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, +) -> str: + """Render a markdown report for one evolution run.""" + summary = summarize_recommendation(metrics) + tblite_gate = 
metrics.get("tblite_gate") + + lines = [ + "# Evolution Report", + "", + f"- Skill: `{metrics.get('skill_name')}`", + f"- Timestamp: `{metrics.get('timestamp')}`", + f"- Decision: `{summary['decision']}`", + f"- Reasons: `{', '.join(summary['reasons']) or 'none'}`", + "", + "## Holdout Results", + "", + f"- Eval backend: `{metrics.get('eval_backend')}`", + f"- Baseline score: `{metrics.get('baseline_score'):.3f}`", + f"- Evolved score: `{metrics.get('evolved_score'):.3f}`", + f"- Improvement: `{metrics.get('improvement'):+.3f}`", + "", + "## Benchmark Gate", + "", + ] + + if tblite_gate: + lines.extend([ + f"- Summary: {tblite_gate.get('summary')}", + f"- Mode: `{tblite_gate.get('mode')}`", + f"- Task filter: `{tblite_gate.get('task_filter')}`", + f"- Baseline pass rate: `{tblite_gate.get('baseline_pass_rate'):.3f}`", + f"- Evolved pass rate: `{tblite_gate.get('evolved_pass_rate'):.3f}`", + f"- Delta: `{tblite_gate.get('delta'):+.3f}`", + ]) + else: + lines.append("- Not run") + + lines.extend([ + "", + "## Artifacts", + "", + f"- Baseline skill: `{baseline_skill_path}`", + f"- Evolved skill: `{evolved_skill_path}`", + ]) + return "\n".join(lines) + "\n" + + +def build_diff_summary( + *, + metrics: dict[str, Any], + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, +) -> str: + """Render a concise markdown diff summary for reviewers.""" + size_delta = metrics.get("evolved_size", 0) - metrics.get("baseline_size", 0) + return "\n".join([ + "# Diff Summary", + "", + f"- Skill: `{metrics.get('skill_name')}`", + f"- Holdout improvement: `{metrics.get('improvement', 0.0):+.3f}`", + f"- Skill size delta: `{size_delta:+d}` chars", + f"- Baseline artifact: `{baseline_skill_path}`", + f"- Evolved artifact: `{evolved_skill_path}`", + "", + "## Reviewer Focus", + "- Confirm the evolved skill improves behavior without overfitting.", + "- Inspect prompt growth and instruction clarity.", + ]) + "\n" + + +def build_review_checklist(metrics: dict[str, Any]) -> str: + """Render a reviewer checklist for human approval.""" + summary = summarize_recommendation(metrics) + tblite_gate = metrics.get("tblite_gate") + + lines = [ + "# Review Checklist", + "", + f"- [ ] Decision `{summary['decision']}` matches the evidence", + f"- [ ] Holdout improvement looks meaningful (`{metrics.get('improvement', 0.0):+.3f}`)", + "- [ ] Evolved skill wording is clearer or more robust than baseline", + ] + if tblite_gate: + lines.append(f"- [ ] TBLite gate result reviewed (`{tblite_gate.get('summary')}`)") + else: + lines.append("- [ ] Confirm whether a benchmark gate is still needed") + lines.append("- [ ] Approve only after inspecting the actual skill diff") + return "\n".join(lines) + "\n" + + +def build_pr_draft( + *, + metrics: dict[str, Any], + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, +) -> str: + """Render a PR-ready markdown draft for candidate variants.""" + summary = summarize_recommendation(metrics) + tblite_gate = metrics.get("tblite_gate") + gate_line = tblite_gate.get("summary") if tblite_gate else "Not run" + + return "\n".join([ + "# PR Draft", + "", + f"Decision: `{summary['decision']}`", + "", + "## Summary", + f"- Candidate skill: `{metrics.get('skill_name')}`", + f"- Holdout improvement: `{metrics.get('improvement', 0.0):+.3f}`", + f"- Benchmark gate: {gate_line}", + "", + "## Artifacts", + f"- Baseline skill: `{baseline_skill_path}`", + f"- Evolved skill: `{evolved_skill_path}`", + "", + "## Test Plan", + "- [x] Focused self-evolution unit tests", + "- [ ] Human 
review of the skill diff", + "- [ ] Optional broader benchmark rerun before merge", + ]) + "\n" + + +def build_github_pr_title(metrics: dict[str, Any]) -> str: + """Build a concise GitHub PR title for an evolved skill candidate.""" + decision = summarize_recommendation(metrics)["decision"] + suffix = "candidate" if decision == "candidate_for_review" else "review" + return f"feat: evolve skill {metrics.get('skill_name')} ({suffix})" + + +def build_github_pr_body( + *, + metrics: dict[str, Any], + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, + report_path: str | Path, + summary_path: str | Path, + diff_summary_path: str | Path, + review_checklist_path: str | Path, +) -> str: + """Render a GitHub-ready PR body from evolution artifacts.""" + summary = summarize_recommendation(metrics) + tblite_gate = metrics.get("tblite_gate") + gate_line = tblite_gate.get("summary") if tblite_gate else "Not run" + + return "\n".join([ + "## Summary", + f"- Evolved skill: `{metrics.get('skill_name')}`", + f"- Decision: `{summary['decision']}`", + f"- Reasons: `{', '.join(summary['reasons']) or 'none'}`", + f"- Holdout improvement: `{metrics.get('improvement', 0.0):+.3f}`", + "", + "## Evaluation Evidence", + f"- Holdout backend: `{metrics.get('eval_backend')}`", + f"- Baseline score: `{metrics.get('baseline_score', 0.0):.3f}`", + f"- Evolved score: `{metrics.get('evolved_score', 0.0):.3f}`", + f"- Benchmark gate: {gate_line}", + "", + "## Artifacts", + f"- Baseline skill: `{baseline_skill_path}`", + f"- Evolved skill: `{evolved_skill_path}`", + f"- Evolution report: `{report_path}`", + f"- Summary JSON: `{summary_path}`", + f"- Diff summary: `{diff_summary_path}`", + f"- Review checklist: `{review_checklist_path}`", + "", + "## Test Plan", + "- [x] Focused self-evolution unit tests", + "- [ ] Review generated artifact set", + "- [ ] Inspect evolved skill diff in context", + "- [ ] Re-run broader benchmark gate if needed before merge", + ]) + "\n" + + +def build_gh_pr_create_command( + *, + metrics: dict[str, Any], + body_path: str | Path, + base_branch: str = "main", +) -> str | None: + """Build a gh CLI command suggestion when the run is PR-worthy.""" + decision = summarize_recommendation(metrics)["decision"] + if decision == "reject": + return None + + title = build_github_pr_title(metrics) + draft_flag = " --draft" if decision == "review_needed" else "" + return ( + "gh pr create" + f" --base {shlex.quote(base_branch)}" + f" --title {shlex.quote(title)}" + f" --body-file {shlex.quote(str(body_path))}" + f"{draft_flag}" + ) + + +def write_report_artifacts( + *, + output_dir: str | Path, + metrics: dict[str, Any], + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, +) -> dict[str, Path]: + """Write markdown + machine-readable summary artifacts.""" + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + summary = summarize_recommendation(metrics) + report_md = out_dir / "report.md" + summary_json = out_dir / "summary.json" + + report_md.write_text( + build_evolution_report( + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + ) + ) + summary_json.write_text(json.dumps(summary, indent=2)) + + return { + "report_md": report_md, + "summary_json": summary_json, + } + + +def write_pr_ready_artifacts( + *, + output_dir: str | Path, + metrics: dict[str, Any], + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, +) -> dict[str, Path]: + """Write PR-ready markdown artifacts for review and 
handoff.""" + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + pr_draft_md = out_dir / "pr_draft.md" + review_checklist_md = out_dir / "review_checklist.md" + diff_summary_md = out_dir / "diff_summary.md" + + pr_draft_md.write_text( + build_pr_draft( + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + ) + ) + review_checklist_md.write_text(build_review_checklist(metrics)) + diff_summary_md.write_text( + build_diff_summary( + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + ) + ) + + return { + "pr_draft_md": pr_draft_md, + "review_checklist_md": review_checklist_md, + "diff_summary_md": diff_summary_md, + } + + +def write_github_pr_artifacts( + *, + output_dir: str | Path, + metrics: dict[str, Any], + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, + report_path: str | Path, + summary_path: str | Path, + diff_summary_path: str | Path, + review_checklist_path: str | Path, + base_branch: str = "main", +) -> dict[str, Path | None]: + """Write GitHub-ready PR body + optional gh command suggestion.""" + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + github_pr_body_md = out_dir / "github_pr_body.md" + github_pr_body_md.write_text( + build_github_pr_body( + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + report_path=report_path, + summary_path=summary_path, + diff_summary_path=diff_summary_path, + review_checklist_path=review_checklist_path, + ) + ) + + gh_pr_create_command_txt = out_dir / "gh_pr_create_command.txt" + command = build_gh_pr_create_command( + metrics=metrics, + body_path=github_pr_body_md, + base_branch=base_branch, + ) + if command is None: + if gh_pr_create_command_txt.exists(): + gh_pr_create_command_txt.unlink() + command_path: Path | None = None + else: + gh_pr_create_command_txt.write_text(command + "\n") + command_path = gh_pr_create_command_txt + + return { + "github_pr_body_md": github_pr_body_md, + "gh_pr_create_command_txt": command_path, + } diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py index 8ad4d89..28893e5 100644 --- a/evolution/skills/evolve_skill.py +++ b/evolution/skills/evolve_skill.py @@ -5,7 +5,9 @@ python -m evolution.skills.evolve_skill --skill arxiv --eval-source golden --dataset datasets/skills/arxiv/ """ +import inspect import json +import os import sys import time from pathlib import Path @@ -14,15 +16,29 @@ import click import dspy +import yaml from rich.console import Console -from rich.panel import Panel from rich.table import Table -from evolution.core.config import EvolutionConfig, get_hermes_agent_path +from evolution.core.config import EvolutionConfig from evolution.core.dataset_builder import SyntheticDatasetBuilder, EvalDataset, GoldenDatasetLoader from evolution.core.external_importers import build_dataset_from_external -from evolution.core.fitness import skill_fitness_metric, LLMJudge, FitnessScore +from evolution.core.fitness import LLMJudge, skill_fitness_metric from evolution.core.constraints import ConstraintValidator +from evolution.core.hermes_eval import HermesSkillEvalCase, run_skill_eval +from evolution.core.benchmark_gate import run_tblite_benchmark_gate +from evolution.core.report_artifact import ( + build_github_pr_title, + write_github_pr_artifacts, + write_pr_ready_artifacts, + write_report_artifacts, +) +from evolution.core.git_pr_automation import ( + 
build_evolution_branch_name, + build_target_skill_path, + execute_git_pr_automation, + write_git_pr_automation_artifacts, +) from evolution.skills.skill_module import ( SkillModule, load_skill, @@ -32,32 +48,447 @@ console = Console() +DEFAULT_OPTIMIZER_MODEL = "openai/gpt-4.1" +DEFAULT_EVAL_MODEL = "openai/gpt-4.1-mini" +DEFAULT_HERMES_EVAL_MAX_ITERATIONS = 12 + + +def load_env_file(path: str | Path) -> dict[str, str]: + """Load KEY=VALUE lines from an env file without overwriting existing env vars.""" + env_path = Path(path).expanduser() + if not env_path.exists(): + return {} + + loaded: dict[str, str] = {} + for raw_line in env_path.read_text().splitlines(): + line = raw_line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + if line.startswith("export "): + line = line[len("export "):].strip() + key, value = line.split("=", 1) + key = key.strip() + value = value.strip() + if not key or key in os.environ: + continue + if len(value) >= 2 and value[0] == value[-1] and value[0] in {'"', "'"}: + value = value[1:-1] + os.environ[key] = value + loaded[key] = value + + return loaded + + +def load_default_env_files() -> list[Path]: + """Load default credential env files used by local Hermes workflows.""" + loaded_paths: list[Path] = [] + for candidate in [Path.home() / ".hermes" / ".env", Path.cwd() / ".env"]: + if load_env_file(candidate): + loaded_paths.append(candidate) + return loaded_paths + + +def load_hermes_model_config(config_path: str | Path | None = None) -> dict: + """Read the local Hermes model config used by the runtime, if available.""" + candidate = Path(config_path).expanduser() if config_path else Path.home() / ".hermes" / "config.yaml" + if not candidate.exists(): + return {} + data = yaml.safe_load(candidate.read_text()) or {} + model_cfg = data.get("model") or {} + return model_cfg if isinstance(model_cfg, dict) else {} + + +def resolve_runtime_model_settings( + *, + optimizer_model: str, + eval_model: str, + config_path: str | Path | None = None, +) -> tuple[str, str, dict[str, str]]: + """Align self-evolution model defaults with the active Hermes runtime config. + + On machines where Hermes is configured against a custom OpenAI-compatible endpoint, + the repo's hardcoded OpenAI defaults tend to fail. When the caller is still using + those defaults, prefer the Hermes runtime's configured default model and expose its + base URL to LiteLLM/DSPy via env vars without overwriting user-provided values. 
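+
+    Call-pattern sketch (resolved values depend on the local ~/.hermes/config.yaml):
+
+        optimizer_model, eval_model, applied_env = resolve_runtime_model_settings(
+            optimizer_model=DEFAULT_OPTIMIZER_MODEL,
+            eval_model=DEFAULT_EVAL_MODEL,
+        )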
+ """ + model_cfg = load_hermes_model_config(config_path) + provider = str(model_cfg.get("provider") or "").strip().lower() + default_model = str(model_cfg.get("default") or "").strip() + base_url = str(model_cfg.get("base_url") or "").strip().rstrip("/") + applied_env: dict[str, str] = {} + + if provider == "custom" and default_model: + if optimizer_model == DEFAULT_OPTIMIZER_MODEL: + optimizer_model = default_model + if eval_model == DEFAULT_EVAL_MODEL: + eval_model = default_model + + if provider == "custom" and base_url: + for env_name in ("OPENAI_BASE_URL", "OPENAI_API_BASE"): + if not os.environ.get(env_name): + os.environ[env_name] = base_url + applied_env[env_name] = base_url + + return optimizer_model, eval_model, applied_env + + +def score_output_against_example( + *, + example, + agent_output: str, + skill_body: str, + eval_model: str, + hermes_agent_path: str | Path | None = None, +) -> float: + """Score one agent output against an eval example using the richer LLM judge.""" + judge_config = EvolutionConfig( + eval_model=eval_model, + judge_model=eval_model, + ) + if hermes_agent_path is not None: + judge_config.hermes_agent_path = Path(hermes_agent_path).expanduser() + judge = LLMJudge(judge_config) + score = judge.score( + task_input=example.task_input, + expected_behavior=example.expected_behavior, + agent_output=agent_output, + skill_text=skill_body, + ) + return score.composite + + +def evaluate_holdout( + *, + dataset: EvalDataset, + eval_backend: str, + baseline_module, + evolved_module, + baseline_skill_body: str, + evolved_skill_body: str, + eval_model: str, + hermes_repo: str | None = None, + skill_name: str | None = None, +): + """Evaluate baseline and evolved variants on the holdout split.""" + holdout_examples = dataset.to_dspy_examples("holdout") + + baseline_scores = [] + evolved_scores = [] + + if eval_backend == "dspy": + lm = dspy.LM(eval_model) + for ex in holdout_examples: + with dspy.context(lm=lm): + baseline_pred = baseline_module(task_input=ex.task_input) + baseline_scores.append(skill_fitness_metric(ex, baseline_pred)) + + evolved_pred = evolved_module(task_input=ex.task_input) + evolved_scores.append(skill_fitness_metric(ex, evolved_pred)) + return baseline_scores, evolved_scores + + if eval_backend != "hermes": + raise ValueError(f"Unknown eval backend: {eval_backend}") + if not skill_name: + raise ValueError("skill_name is required when eval_backend='hermes'") + + for ex in holdout_examples: + case = HermesSkillEvalCase( + skill_name=skill_name, + task_input=ex.task_input, + ) + baseline_result = run_skill_eval( + case, + model=eval_model, + hermes_repo=hermes_repo, + skill_body_override=baseline_skill_body, + agent_kwargs={"max_iterations": DEFAULT_HERMES_EVAL_MAX_ITERATIONS}, + ) + baseline_scores.append( + score_output_against_example( + example=ex, + agent_output=baseline_result.final_response, + skill_body=baseline_skill_body, + eval_model=eval_model, + hermes_agent_path=hermes_repo, + ) + ) + + evolved_result = run_skill_eval( + case, + model=eval_model, + hermes_repo=hermes_repo, + skill_body_override=evolved_skill_body, + agent_kwargs={"max_iterations": DEFAULT_HERMES_EVAL_MAX_ITERATIONS}, + ) + evolved_scores.append( + score_output_against_example( + example=ex, + agent_output=evolved_result.final_response, + skill_body=evolved_skill_body, + eval_model=eval_model, + hermes_agent_path=hermes_repo, + ) + ) + + return baseline_scores, evolved_scores + + +def maybe_run_tblite_gate( + *, + run_tblite: bool, + skill_name: str, + 
baseline_skill_body: str, + evolved_skill_body: str, + hermes_repo: str | None, + tblite_regression_threshold: float = 0.02, + tblite_task_filter: str | None = None, + tblite_mode: str = "fast", +): + """Run the optional TBLite regression gate when enabled.""" + if not run_tblite: + return None + + if not hermes_repo: + raise ValueError("hermes_repo is required when run_tblite=True") + + return run_tblite_benchmark_gate( + skill_name=skill_name, + baseline_skill_body=baseline_skill_body, + evolved_skill_body=evolved_skill_body, + hermes_repo=hermes_repo, + regression_threshold=tblite_regression_threshold, + task_filter=tblite_task_filter, + mode=tblite_mode, + ) + + +def write_evolution_report_artifacts( + *, + output_dir: str | Path, + metrics: dict, + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, +): + """Write structured report artifacts for one evolution run.""" + return write_report_artifacts( + output_dir=output_dir, + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + ) + + +def write_evolution_pr_ready_artifacts( + *, + output_dir: str | Path, + metrics: dict, + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, +): + """Write PR-ready artifacts for one evolution run.""" + return write_pr_ready_artifacts( + output_dir=output_dir, + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + ) + + +def write_evolution_github_pr_artifacts( + *, + output_dir: str | Path, + metrics: dict, + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, + report_path: str | Path, + summary_path: str | Path, + diff_summary_path: str | Path, + review_checklist_path: str | Path, + base_branch: str = "main", +): + """Write GitHub-ready PR body + gh command suggestion artifacts.""" + return write_github_pr_artifacts( + output_dir=output_dir, + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + report_path=report_path, + summary_path=summary_path, + diff_summary_path=diff_summary_path, + review_checklist_path=review_checklist_path, + base_branch=base_branch, + ) + + +def write_evolution_git_pr_automation_artifacts( + *, + output_dir: str | Path, + metrics: dict, + hermes_repo: str | Path, + skill_relpath: str, + evolved_skill_text: str, + github_pr_body_path: str | Path, +): + """Write git/GitHub connected handoff artifacts for an evolved skill.""" + return write_git_pr_automation_artifacts( + output_dir=output_dir, + metrics=metrics, + hermes_repo=hermes_repo, + skill_relpath=skill_relpath, + evolved_skill_text=evolved_skill_text, + github_pr_body_path=github_pr_body_path, + ) + + +def execute_evolution_git_pr_automation( + *, + git_apply_plan: dict, + gh_pr_create_command: str | None, + execute_push: bool, + execute_pr: bool, +): + """Execute the real git/PR automation flow.""" + return execute_git_pr_automation( + git_apply_plan=git_apply_plan, + gh_pr_create_command=gh_pr_create_command, + execute_push=execute_push, + execute_pr=execute_pr, + ) + + +def validate_skill_constraints( + *, + validator: ConstraintValidator, + full_skill_text: str, + baseline_full_text: str | None = None, +): + """Validate skill constraints against the complete skill text, including frontmatter.""" + return validator.validate_all( + full_skill_text, + "skill", + baseline_text=baseline_full_text, + ) + + +class NormalizedReflectionLM: + """Adapt DSPy/LiteLLM outputs to the list[str|dict] shape GEPA reflective mutation 
expects.""" + + def __init__(self, base_lm): + self.base_lm = base_lm + + def __call__(self, *args, **kwargs): + result = self.base_lm(*args, **kwargs) + if isinstance(result, list): + if not result: + return [""] + normalized = [] + for item in result: + if item is None: + normalized.append("") + elif isinstance(item, (str, dict)): + normalized.append(item) + else: + normalized.append(str(item)) + return normalized + if result is None: + return [""] + if isinstance(result, (str, dict)): + return [result] + return [str(result)] + + def __getattr__(self, name): + return getattr(self.base_lm, name) + + +def create_gepa_optimizer(*, metric, iterations: int, optimizer_model: str | None = None): + """Instantiate GEPA across DSPy API variants.""" + params = inspect.signature(dspy.GEPA).parameters + kwargs = {"metric": metric} + if "max_steps" in params: + kwargs["max_steps"] = iterations + else: + if callable(metric): + metric_arity = len(inspect.signature(metric).parameters) + if metric_arity < 5: + def _wrapped_metric(gold, pred, trace=None, pred_name=None, pred_trace=None): + return metric(gold, pred, trace) + kwargs["metric"] = _wrapped_metric + if "reflection_lm" in params and optimizer_model: + kwargs["reflection_lm"] = NormalizedReflectionLM(dspy.LM(optimizer_model)) + if "max_full_evals" in params: + kwargs["max_full_evals"] = max(1, iterations) + elif "auto" in params: + kwargs["auto"] = "light" + return dspy.GEPA(**kwargs) + + +def optimize_skill_module(*, baseline_module, trainset, valset, iterations: int, metric, optimizer_model: str | None = None): + """Run GEPA when available, otherwise fall back to MIPROv2.""" + try: + optimizer = create_gepa_optimizer(metric=metric, iterations=iterations, optimizer_model=optimizer_model) + optimized = optimizer.compile( + baseline_module, + trainset=trainset, + valset=valset, + ) + return "GEPA", optimized + except Exception as e: + console.print(f"[yellow]GEPA not available ({e}), falling back to MIPROv2[/yellow]") + optimizer = dspy.MIPROv2( + metric=metric, + auto="light", + ) + optimized = optimizer.compile( + baseline_module, + trainset=trainset, + valset=valset, + ) + return "MIPROv2", optimized + def evolve( skill_name: str, iterations: int = 10, eval_source: str = "synthetic", dataset_path: Optional[str] = None, - optimizer_model: str = "openai/gpt-4.1", - eval_model: str = "openai/gpt-4.1-mini", + optimizer_model: str = DEFAULT_OPTIMIZER_MODEL, + eval_model: str = DEFAULT_EVAL_MODEL, hermes_repo: Optional[str] = None, run_tests: bool = False, dry_run: bool = False, + eval_backend: str = "dspy", + run_tblite: bool = False, + tblite_task_filter: Optional[str] = None, + tblite_mode: str = "fast", + execute_git_apply: bool = False, + execute_push: bool = False, + execute_pr: bool = False, ): """Main evolution function — orchestrates the full optimization loop.""" - config = EvolutionConfig( - iterations=iterations, + load_default_env_files() + optimizer_model, eval_model, runtime_env = resolve_runtime_model_settings( optimizer_model=optimizer_model, eval_model=eval_model, - judge_model=eval_model, # Use same model for dataset generation - run_pytest=run_tests, ) + + config_kwargs = { + "iterations": iterations, + "optimizer_model": optimizer_model, + "eval_model": eval_model, + "judge_model": eval_model, + "run_pytest": run_tests, + "run_tblite": run_tblite, + } if hermes_repo: - config.hermes_agent_path = Path(hermes_repo) + config_kwargs["hermes_agent_path"] = Path(hermes_repo).expanduser() + + config = EvolutionConfig(**config_kwargs) - # 
── 1. Find and load the skill ────────────────────────────────────── console.print(f"\n[bold cyan]🧬 Hermes Agent Self-Evolution[/bold cyan] — Evolving skill: [bold]{skill_name}[/bold]\n") + if runtime_env: + env_list = ", ".join(f"{k}={v}" for k, v in runtime_env.items()) + console.print(f" Runtime model alignment: applied {env_list}") skill_path = find_skill(skill_name, config.hermes_agent_path) if not skill_path: @@ -74,10 +505,19 @@ def evolve( console.print(f"\n[bold green]DRY RUN — setup validated successfully.[/bold green]") console.print(f" Would generate eval dataset (source: {eval_source})") console.print(f" Would run GEPA optimization ({iterations} iterations)") + console.print(f" Would evaluate holdout via backend: {eval_backend}") + if run_tblite: + console.print( + f" Would run TBLite {tblite_mode} regression gate " + f"(task filter: {tblite_task_filter or 'mode default'})" + ) + if execute_git_apply: + console.print( + f" Would execute git apply flow (push={execute_push}, pr={execute_pr})" + ) console.print(f" Would validate constraints and create PR") return - # ── 2. Build or load evaluation dataset ───────────────────────────── console.print(f"\n[bold]Building evaluation dataset[/bold] (source: {eval_source})") if eval_source == "golden" and dataset_path: @@ -102,7 +542,6 @@ def evolve( artifact_text=skill["raw"], artifact_type="skill", ) - # Save for reuse save_path = Path("datasets") / "skills" / skill_name dataset.save(save_path) console.print(f" Generated {len(dataset.all_examples)} synthetic examples") @@ -116,10 +555,12 @@ def evolve( console.print(f" Split: {len(dataset.train)} train / {len(dataset.val)} val / {len(dataset.holdout)} holdout") - # ── 3. Validate constraints on baseline ───────────────────────────── console.print(f"\n[bold]Validating baseline constraints[/bold]") validator = ConstraintValidator(config) - baseline_constraints = validator.validate_all(skill["body"], "skill") + baseline_constraints = validate_skill_constraints( + validator=validator, + full_skill_text=skill["raw"], + ) all_pass = True for c in baseline_constraints: icon = "✓" if c.passed else "✗" @@ -131,62 +572,46 @@ def evolve( if not all_pass: console.print("[yellow]⚠ Baseline skill has constraint violations — proceeding anyway[/yellow]") - # ── 4. Set up DSPy + GEPA optimizer ───────────────────────────────── console.print(f"\n[bold]Configuring optimizer[/bold]") console.print(f" Optimizer: GEPA ({iterations} iterations)") console.print(f" Optimizer model: {optimizer_model}") console.print(f" Eval model: {eval_model}") + console.print(f" Holdout backend: {eval_backend}") - # Configure DSPy lm = dspy.LM(eval_model) dspy.configure(lm=lm) - # Create the baseline skill module baseline_module = SkillModule(skill["body"]) - - # Prepare DSPy examples trainset = dataset.to_dspy_examples("train") valset = dataset.to_dspy_examples("val") - # ── 5. 
Run GEPA optimization ──────────────────────────────────────── console.print(f"\n[bold cyan]Running GEPA optimization ({iterations} iterations)...[/bold cyan]\n") start_time = time.time() - try: - optimizer = dspy.GEPA( - metric=skill_fitness_metric, - max_steps=iterations, - ) - - optimized_module = optimizer.compile( - baseline_module, - trainset=trainset, - valset=valset, - ) - except Exception as e: - # Fall back to MIPROv2 if GEPA isn't available in this DSPy version - console.print(f"[yellow]GEPA not available ({e}), falling back to MIPROv2[/yellow]") - optimizer = dspy.MIPROv2( - metric=skill_fitness_metric, - auto="light", - ) - optimized_module = optimizer.compile( - baseline_module, - trainset=trainset, - ) + optimizer_name, optimized_module = optimize_skill_module( + baseline_module=baseline_module, + trainset=trainset, + valset=valset, + iterations=iterations, + metric=skill_fitness_metric, + optimizer_model=optimizer_model, + ) + if optimizer_name != "GEPA": + console.print(f"[yellow]Optimizer fallback in use: {optimizer_name}[/yellow]") elapsed = time.time() - start_time console.print(f"\n Optimization completed in {elapsed:.1f}s") - # ── 6. Extract evolved skill text ─────────────────────────────────── - # The optimized module's instructions contain the evolved skill text evolved_body = optimized_module.skill_text evolved_full = reassemble_skill(skill["frontmatter"], evolved_body) - # ── 7. Validate evolved skill ─────────────────────────────────────── console.print(f"\n[bold]Validating evolved skill[/bold]") - evolved_constraints = validator.validate_all(evolved_body, "skill", baseline_text=skill["body"]) + evolved_constraints = validate_skill_constraints( + validator=validator, + full_skill_text=evolved_full, + baseline_full_text=skill["raw"], + ) all_pass = True for c in evolved_constraints: icon = "✓" if c.passed else "✗" @@ -197,36 +622,30 @@ def evolve( if not all_pass: console.print("[red]✗ Evolved skill FAILED constraints — not deploying[/red]") - # Still save for inspection output_path = Path("output") / skill_name / "evolved_FAILED.md" output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(evolved_full) console.print(f" Saved failed variant to {output_path}") return - # ── 8. Evaluate on holdout set ────────────────────────────────────── console.print(f"\n[bold]Evaluating on holdout set ({len(dataset.holdout)} examples)[/bold]") - holdout_examples = dataset.to_dspy_examples("holdout") - - baseline_scores = [] - evolved_scores = [] - for ex in holdout_examples: - # Score baseline - with dspy.context(lm=lm): - baseline_pred = baseline_module(task_input=ex.task_input) - baseline_score = skill_fitness_metric(ex, baseline_pred) - baseline_scores.append(baseline_score) - - evolved_pred = optimized_module(task_input=ex.task_input) - evolved_score = skill_fitness_metric(ex, evolved_pred) - evolved_scores.append(evolved_score) + baseline_scores, evolved_scores = evaluate_holdout( + dataset=dataset, + eval_backend=eval_backend, + baseline_module=baseline_module, + evolved_module=optimized_module, + baseline_skill_body=skill["body"], + evolved_skill_body=evolved_body, + eval_model=eval_model, + hermes_repo=str(config.hermes_agent_path), + skill_name=skill_name, + ) avg_baseline = sum(baseline_scores) / max(1, len(baseline_scores)) avg_evolved = sum(evolved_scores) / max(1, len(evolved_scores)) improvement = avg_evolved - avg_baseline - # ── 9. 
Report results ─────────────────────────────────────────────── table = Table(title="Evolution Results") table.add_column("Metric", style="bold") table.add_column("Baseline", justify="right") @@ -248,28 +667,46 @@ def evolve( ) table.add_row("Time", "", f"{elapsed:.1f}s", "") table.add_row("Iterations", "", str(iterations), "") + table.add_row("Eval Backend", "", eval_backend, "") + + tblite_gate_result = maybe_run_tblite_gate( + run_tblite=run_tblite, + skill_name=skill_name, + baseline_skill_body=skill["body"], + evolved_skill_body=evolved_body, + hermes_repo=str(config.hermes_agent_path), + tblite_regression_threshold=config.tblite_regression_threshold, + tblite_task_filter=tblite_task_filter, + tblite_mode=tblite_mode, + ) + if tblite_gate_result is not None: + gate_color = "green" if tblite_gate_result.passed else "red" + table.add_row( + "TBLite Gate", + f"{tblite_gate_result.baseline_pass_rate:.3f}", + f"{tblite_gate_result.evolved_pass_rate:.3f}", + f"[{gate_color}]{tblite_gate_result.delta:+.3f}[/{gate_color}]", + ) console.print() console.print(table) - # ── 10. Save output ───────────────────────────────────────────────── timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") output_dir = Path("output") / skill_name / timestamp output_dir.mkdir(parents=True, exist_ok=True) - # Save evolved skill (output_dir / "evolved_skill.md").write_text(evolved_full) - - # Save baseline for comparison (output_dir / "baseline_skill.md").write_text(skill["raw"]) + baseline_skill_path = output_dir / "baseline_skill.md" + evolved_skill_path = output_dir / "evolved_skill.md" - # Save metrics metrics = { "skill_name": skill_name, "timestamp": timestamp, "iterations": iterations, "optimizer_model": optimizer_model, "eval_model": eval_model, + "eval_backend": eval_backend, "baseline_score": avg_baseline, "evolved_score": avg_evolved, "improvement": improvement, @@ -280,10 +717,107 @@ def evolve( "holdout_examples": len(dataset.holdout), "elapsed_seconds": elapsed, "constraints_passed": all_pass, + "tblite_gate": None if tblite_gate_result is None else { + "passed": tblite_gate_result.passed, + "mode": tblite_gate_result.mode, + "task_filter": tblite_gate_result.task_filter, + "base_config_path": str(tblite_gate_result.base_config_path), + "baseline_pass_rate": tblite_gate_result.baseline_pass_rate, + "evolved_pass_rate": tblite_gate_result.evolved_pass_rate, + "delta": tblite_gate_result.delta, + "threshold": tblite_gate_result.threshold, + "summary": tblite_gate_result.summary, + }, } (output_dir / "metrics.json").write_text(json.dumps(metrics, indent=2)) + report_paths = write_evolution_report_artifacts( + output_dir=output_dir, + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + ) + pr_paths = write_evolution_pr_ready_artifacts( + output_dir=output_dir, + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + ) + github_pr_paths = write_evolution_github_pr_artifacts( + output_dir=output_dir, + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + report_path=report_paths["report_md"], + summary_path=report_paths["summary_json"], + diff_summary_path=pr_paths["diff_summary_md"], + review_checklist_path=pr_paths["review_checklist_md"], + ) + skill_relpath = str(skill_path.relative_to(config.hermes_agent_path / "skills")) + git_pr_automation_paths = write_evolution_git_pr_automation_artifacts( + output_dir=output_dir, + metrics=metrics, + 
hermes_repo=config.hermes_agent_path, + skill_relpath=skill_relpath, + evolved_skill_text=evolved_full, + github_pr_body_path=github_pr_paths["github_pr_body_md"], + ) console.print(f"\n Output saved to {output_dir}/") + console.print(f" Report: {report_paths['report_md']}") + console.print(f" Summary: {report_paths['summary_json']}") + console.print(f" PR Draft: {pr_paths['pr_draft_md']}") + console.print(f" Review Checklist: {pr_paths['review_checklist_md']}") + console.print(f" Diff Summary: {pr_paths['diff_summary_md']}") + console.print(f" GitHub PR Body: {github_pr_paths['github_pr_body_md']}") + if github_pr_paths["gh_pr_create_command_txt"] is not None: + console.print(f" gh PR Command: {github_pr_paths['gh_pr_create_command_txt']}") + else: + console.print(" gh PR Command: skipped (decision=reject)") + console.print(f" Candidate Skill Patch: {git_pr_automation_paths['candidate_skill_file']}") + console.print(f" Git Apply Plan: {git_pr_automation_paths['git_apply_plan_sh']}") + console.print(f" Git Apply Guide: {git_pr_automation_paths['git_apply_plan_md']}") + if git_pr_automation_paths["gh_pr_create_after_push_txt"] is not None: + console.print(f" gh PR After Push: {git_pr_automation_paths['gh_pr_create_after_push_txt']}") + else: + console.print(" gh PR After Push: skipped (decision=reject)") + + if tblite_gate_result is not None: + gate_style = "bold green" if tblite_gate_result.passed else "bold red" + gate_icon = "✓" if tblite_gate_result.passed else "✗" + console.print(f"[{gate_style}]{gate_icon} {tblite_gate_result.summary}[/{gate_style}]") + if not tblite_gate_result.passed: + console.print("[red]✗ Benchmark regression gate failed — evolved skill should not be deployed[/red]") + return + + if execute_git_apply and execute_pr and not execute_push: + raise ValueError("execute_push must be True when execute_pr=True") + + if execute_git_apply: + branch_name = build_evolution_branch_name(skill_name=skill_name, timestamp=timestamp) + target_skill_path = build_target_skill_path( + hermes_repo=config.hermes_agent_path, + skill_relpath=skill_relpath, + ) + git_apply_plan = { + "hermes_repo": config.hermes_agent_path, + "branch_name": branch_name, + "target_skill_path": target_skill_path, + "candidate_skill_file": git_pr_automation_paths["candidate_skill_file"], + "commit_message": build_github_pr_title(metrics), + } + gh_pr_create_command = None + if git_pr_automation_paths["gh_pr_create_after_push_txt"] is not None: + gh_pr_create_command = Path(git_pr_automation_paths["gh_pr_create_after_push_txt"]).read_text().strip() + + execution_result = execute_evolution_git_pr_automation( + git_apply_plan=git_apply_plan, + gh_pr_create_command=gh_pr_create_command, + execute_push=execute_push, + execute_pr=execute_pr, + ) + console.print(f" Executed Git Apply Steps: {len(execution_result['git']['steps'])}") + if execution_result["pr"] is not None: + console.print(" Executed PR creation via gh") if improvement > 0: console.print(f"\n[bold green]✓ Evolution improved skill by {improvement:+.3f} ({improvement/max(0.001, avg_baseline)*100:+.1f}%)[/bold green]") @@ -298,13 +832,21 @@ def evolve( @click.option("--iterations", default=10, help="Number of GEPA iterations") @click.option("--eval-source", default="synthetic", type=click.Choice(["synthetic", "golden", "sessiondb"]), help="Source for evaluation dataset") +@click.option("--eval-backend", default="dspy", type=click.Choice(["dspy", "hermes"]), + help="Backend used for holdout evaluation") @click.option("--dataset-path", default=None, 
help="Path to existing eval dataset (JSONL)") -@click.option("--optimizer-model", default="openai/gpt-4.1", help="Model for GEPA reflections") -@click.option("--eval-model", default="openai/gpt-4.1-mini", help="Model for evaluations") +@click.option("--optimizer-model", default=DEFAULT_OPTIMIZER_MODEL, help="Model for GEPA reflections") +@click.option("--eval-model", default=DEFAULT_EVAL_MODEL, help="Model for evaluations") @click.option("--hermes-repo", default=None, help="Path to hermes-agent repo") @click.option("--run-tests", is_flag=True, help="Run full pytest suite as constraint gate") +@click.option("--run-tblite", is_flag=True, help="Run the TBLite regression benchmark gate after holdout eval") +@click.option("--tblite-mode", default="fast", type=click.Choice(["fast", "full"]), help="TBLite gate mode: fast subset or full benchmark") +@click.option("--tblite-task-filter", default=None, help="Optional comma-separated TBLite task filter for a faster gate") +@click.option("--execute-git-apply", is_flag=True, help="Actually apply the evolved skill into the target Hermes repo branch") +@click.option("--execute-push", is_flag=True, help="When executing git apply, also push the branch to the remote") +@click.option("--execute-pr", is_flag=True, help="When executing git apply, also create the PR via gh after push") @click.option("--dry-run", is_flag=True, help="Validate setup without running optimization") -def main(skill, iterations, eval_source, dataset_path, optimizer_model, eval_model, hermes_repo, run_tests, dry_run): +def main(skill, iterations, eval_source, eval_backend, dataset_path, optimizer_model, eval_model, hermes_repo, run_tests, run_tblite, tblite_mode, tblite_task_filter, execute_git_apply, execute_push, execute_pr, dry_run): """Evolve a Hermes Agent skill using DSPy + GEPA optimization.""" evolve( skill_name=skill, @@ -316,6 +858,13 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, eval_mod hermes_repo=hermes_repo, run_tests=run_tests, dry_run=dry_run, + eval_backend=eval_backend, + run_tblite=run_tblite, + tblite_task_filter=tblite_task_filter, + tblite_mode=tblite_mode, + execute_git_apply=execute_git_apply, + execute_push=execute_push, + execute_pr=execute_pr, ) diff --git a/tests/core/test_benchmark_gate.py b/tests/core/test_benchmark_gate.py new file mode 100644 index 0000000..2f9d310 --- /dev/null +++ b/tests/core/test_benchmark_gate.py @@ -0,0 +1,167 @@ +"""Tests for benchmark regression gate helpers.""" + +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace + +import pytest +import yaml + +from evolution.core import benchmark_gate as mod + + +@pytest.fixture +def fake_tblite_base_config(tmp_path: Path) -> Path: + path = tmp_path / "local.yaml" + path.write_text( + yaml.safe_dump( + { + "env": { + "terminal_backend": "docker", + "use_wandb": False, + "data_dir_to_save_evals": "environments/benchmarks/evals/openthoughts-tblite-local", + }, + "openai": { + "model_name": "anthropic/claude-sonnet-4", + }, + } + ) + ) + return path + + +def test_parse_tblite_pass_rate_from_stdout(): + output = "hello\nOverall Pass Rate: 0.4200 (42/100)\nbye\n" + + assert mod.parse_tblite_pass_rate(output) == pytest.approx(0.42) + + +def test_parse_tblite_pass_rate_raises_when_missing_summary(): + with pytest.raises(ValueError, match="Overall Pass Rate"): + mod.parse_tblite_pass_rate("no summary here") + + +def test_run_tblite_gate_allows_small_regression(monkeypatch, tmp_path: Path, fake_tblite_base_config: Path): + 
prompts = { + "BASE": "baseline prompt", + "EVOLVED": "evolved prompt", + } + calls = [] + + def _fake_build_skill_system_prompt(skill_name, hermes_repo=None, skill_body_override=None): + return prompts[skill_body_override], [skill_name] + + def _fake_run(command, capture_output, text, timeout, cwd): + calls.append(command) + config_path = Path(command[command.index("--config") + 1]) + config = yaml.safe_load(config_path.read_text()) + prompt = config["env"]["system_prompt"] + score = 0.50 if prompt == "baseline prompt" else 0.49 + return SimpleNamespace(returncode=0, stdout=f"Overall Pass Rate: {score:.4f} (1/2)", stderr="") + + monkeypatch.setattr(mod, "build_skill_system_prompt", _fake_build_skill_system_prompt) + monkeypatch.setattr(mod.subprocess, "run", _fake_run) + + result = mod.run_tblite_benchmark_gate( + skill_name="github-code-review", + baseline_skill_body="BASE", + evolved_skill_body="EVOLVED", + hermes_repo=tmp_path, + regression_threshold=0.02, + task_filter="broken-python,pandas-etl", + base_config_path=fake_tblite_base_config, + benchmark_script_path=tmp_path / "tblite_env.py", + ) + + assert result.passed is True + assert result.baseline_pass_rate == pytest.approx(0.50) + assert result.evolved_pass_rate == pytest.approx(0.49) + assert result.delta == pytest.approx(-0.01) + assert len(calls) == 2 + assert calls[0][-2:] == ["--env.task_filter", "broken-python,pandas-etl"] + + +def test_run_tblite_gate_fails_large_regression(monkeypatch, tmp_path: Path, fake_tblite_base_config: Path): + monkeypatch.setattr( + mod, + "build_skill_system_prompt", + lambda skill_name, hermes_repo=None, skill_body_override=None: (f"prompt::{skill_body_override}", [skill_name]), + ) + + def _fake_run(command, capture_output, text, timeout, cwd): + config_path = Path(command[command.index("--config") + 1]) + config = yaml.safe_load(config_path.read_text()) + prompt = config["env"]["system_prompt"] + score = 0.50 if prompt == "prompt::BASE" else 0.40 + return SimpleNamespace(returncode=0, stdout=f"Overall Pass Rate: {score:.4f} (1/2)", stderr="") + + monkeypatch.setattr(mod.subprocess, "run", _fake_run) + + result = mod.run_tblite_benchmark_gate( + skill_name="github-code-review", + baseline_skill_body="BASE", + evolved_skill_body="EVOLVED", + hermes_repo=tmp_path, + regression_threshold=0.02, + base_config_path=fake_tblite_base_config, + benchmark_script_path=tmp_path / "tblite_env.py", + ) + + assert result.passed is False + assert result.delta == pytest.approx(-0.10) + assert "regression" in result.summary.lower() + + +def test_resolve_tblite_gate_plan_fast_uses_local_config_and_default_subset(tmp_path: Path): + plan = mod.resolve_tblite_gate_plan(mode="fast", hermes_repo=tmp_path) + + assert plan.mode == "fast" + assert plan.base_config_path == tmp_path / "environments/benchmarks/tblite/local.yaml" + assert plan.task_filter == mod.DEFAULT_FAST_TBLITE_TASK_FILTER + + + +def test_resolve_tblite_gate_plan_full_uses_default_config_without_filter(tmp_path: Path): + plan = mod.resolve_tblite_gate_plan(mode="full", hermes_repo=tmp_path) + + assert plan.mode == "full" + assert plan.base_config_path == tmp_path / "environments/benchmarks/tblite/default.yaml" + assert plan.task_filter is None + + + +def test_run_tblite_gate_records_mode_metadata(monkeypatch, tmp_path: Path, fake_tblite_base_config: Path): + monkeypatch.setattr( + mod, + "build_skill_system_prompt", + lambda skill_name, hermes_repo=None, skill_body_override=None: (f"prompt::{skill_body_override}", [skill_name]), + ) + 
monkeypatch.setattr( + mod, + "resolve_tblite_gate_plan", + lambda **kwargs: mod.TBLiteGatePlan( + mode="fast", + base_config_path=fake_tblite_base_config, + task_filter="broken-python,pandas-etl", + benchmark_script_path=tmp_path / "tblite_env.py", + ), + ) + + def _fake_run(command, capture_output, text, timeout, cwd): + return SimpleNamespace(returncode=0, stdout="Overall Pass Rate: 0.5000 (1/2)", stderr="") + + monkeypatch.setattr(mod.subprocess, "run", _fake_run) + + result = mod.run_tblite_benchmark_gate( + skill_name="github-code-review", + baseline_skill_body="BASE", + evolved_skill_body="EVOLVED", + hermes_repo=tmp_path, + regression_threshold=0.02, + mode="fast", + ) + + assert result.mode == "fast" + assert result.task_filter == "broken-python,pandas-etl" + assert result.base_config_path == fake_tblite_base_config diff --git a/tests/core/test_constraints.py b/tests/core/test_constraints.py index 88e3aaa..2cebf89 100644 --- a/tests/core/test_constraints.py +++ b/tests/core/test_constraints.py @@ -1,5 +1,7 @@ """Tests for constraint validators.""" +from pathlib import Path + import pytest from evolution.core.constraints import ConstraintValidator from evolution.core.config import EvolutionConfig @@ -7,7 +9,7 @@ @pytest.fixture def validator(): - config = EvolutionConfig() + config = EvolutionConfig(hermes_agent_path=Path("/tmp/hermes-agent")) return ConstraintValidator(config) diff --git a/tests/core/test_git_pr_automation.py b/tests/core/test_git_pr_automation.py new file mode 100644 index 0000000..7c55371 --- /dev/null +++ b/tests/core/test_git_pr_automation.py @@ -0,0 +1,239 @@ +"""Tests for git/GitHub automation artifacts.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from evolution.core import git_pr_automation as mod + + +def sample_metrics() -> dict: + return { + "skill_name": "github-code-review", + "timestamp": "20260414_180000", + "eval_backend": "hermes", + "baseline_score": 0.42, + "evolved_score": 0.57, + "improvement": 0.15, + "tblite_gate": { + "passed": True, + "summary": "TBLite fast gate passed", + }, + } + + +def test_build_evolution_branch_name_sanitizes_skill_and_includes_timestamp(): + branch = mod.build_evolution_branch_name( + skill_name="github/code review", + timestamp="20260414_180000", + ) + + assert branch == "evolution/github-code-review-20260414_180000" + + +def test_build_target_skill_path_resolves_skill_location_from_repo_root(tmp_path: Path): + target = mod.build_target_skill_path( + hermes_repo=tmp_path / "hermes-agent", + skill_relpath="github/github-code-review/SKILL.md", + ) + + assert target == tmp_path / "hermes-agent" / "skills" / "github/github-code-review/SKILL.md" + + +def test_write_skill_patch_artifacts_writes_candidate_skill_file_and_manifest(tmp_path: Path): + paths = mod.write_skill_patch_artifacts( + output_dir=tmp_path, + skill_relpath="github/github-code-review/SKILL.md", + evolved_skill_text="# evolved skill\n", + hermes_repo=tmp_path / "hermes-agent", + ) + + assert paths["candidate_skill_file"].exists() + assert paths["git_patch_manifest_json"].exists() + assert "github/github-code-review/SKILL.md" in paths["git_patch_manifest_json"].read_text() + + +def test_build_git_apply_plan_includes_branch_checkout_commit_and_push(tmp_path: Path): + plan = mod.build_git_apply_plan( + hermes_repo=tmp_path / "hermes-agent", + branch_name="evolution/github-code-review-20260414_180000", + target_skill_path=tmp_path / "hermes-agent/skills/github/github-code-review/SKILL.md", + 
candidate_skill_file=tmp_path / "candidate_skill.md", + commit_message="feat: evolve skill github-code-review (candidate)", + push_remote="origin", + ) + + assert "git checkout -b evolution/github-code-review-20260414_180000" in plan + assert "cp" in plan + assert "git add" in plan + assert "git commit -m" in plan + assert "git push -u origin" in plan + + +def test_write_git_apply_plan_artifacts_writes_plan_files(tmp_path: Path): + paths = mod.write_git_apply_plan_artifacts( + output_dir=tmp_path, + hermes_repo=tmp_path / "hermes-agent", + branch_name="evolution/github-code-review-20260414_180000", + target_skill_path=tmp_path / "hermes-agent/skills/github/github-code-review/SKILL.md", + candidate_skill_file=tmp_path / "candidate_skill.md", + commit_message="feat: evolve skill github-code-review (candidate)", + ) + + assert paths["git_apply_plan_sh"].exists() + assert paths["git_apply_plan_md"].exists() + assert "git checkout -b" in paths["git_apply_plan_sh"].read_text() + assert "Commit message" in paths["git_apply_plan_md"].read_text() + + +def test_write_full_git_pr_automation_artifacts_writes_branch_and_pr_handoff(tmp_path: Path): + paths = mod.write_git_pr_automation_artifacts( + output_dir=tmp_path, + metrics=sample_metrics(), + hermes_repo=tmp_path / "hermes-agent", + skill_relpath="github/github-code-review/SKILL.md", + evolved_skill_text="# evolved skill\n", + github_pr_body_path=tmp_path / "github_pr_body.md", + ) + + assert paths["candidate_skill_file"].exists() + assert paths["git_apply_plan_sh"].exists() + assert paths["git_apply_plan_md"].exists() + assert paths["git_patch_manifest_json"].exists() + assert paths["gh_pr_create_after_push_txt"].exists() + assert "gh pr create" in paths["gh_pr_create_after_push_txt"].read_text() + + + +def test_build_target_skill_path_rejects_escape_from_skills_root(tmp_path: Path): + with pytest.raises(ValueError, match="skills"): + mod.build_target_skill_path( + hermes_repo=tmp_path / "hermes-agent", + skill_relpath="../outside.md", + ) + + + +def test_execute_git_apply_plan_rejects_target_outside_skills_root(tmp_path: Path): + with pytest.raises(ValueError, match="skills"): + mod.execute_git_apply_plan( + hermes_repo=tmp_path / "hermes-agent", + branch_name="evolution/github-code-review-20260414_180000", + target_skill_path=tmp_path / "outside.md", + candidate_skill_file=tmp_path / "candidate_skill.md", + commit_message="feat: evolve skill github-code-review (candidate)", + run_push=False, + runner=lambda command, *, workdir=None: {"exit_code": 0, "output": "ok"}, + ) + + + +def test_write_full_git_pr_automation_artifacts_skips_pr_command_for_reject(tmp_path: Path): + metrics = sample_metrics() + metrics["tblite_gate"]["passed"] = False + + paths = mod.write_git_pr_automation_artifacts( + output_dir=tmp_path, + metrics=metrics, + hermes_repo=tmp_path / "hermes-agent", + skill_relpath="github/github-code-review/SKILL.md", + evolved_skill_text="# evolved skill\n", + github_pr_body_path=tmp_path / "github_pr_body.md", + ) + + assert paths["gh_pr_create_after_push_txt"] is None + + +def test_execute_git_apply_plan_runs_copy_commit_and_push_steps_in_order(tmp_path: Path): + commands = [] + + def _fake_runner(command: str, *, workdir=None): + commands.append((command, workdir)) + return {"exit_code": 0, "output": "ok"} + + result = mod.execute_git_apply_plan( + hermes_repo=tmp_path / "hermes-agent", + branch_name="evolution/github-code-review-20260414_180000", + target_skill_path=tmp_path / "hermes-agent/skills/github/github-code-review/SKILL.md", 
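+        # candidate_skill_file is the staged copy that the plan's `cp` step
+        # writes over target_skill_path before `git add`/`git commit`
+        # (per the step-order assertions below).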
+ candidate_skill_file=tmp_path / "candidate_skill.md", + commit_message="feat: evolve skill github-code-review (candidate)", + run_push=True, + runner=_fake_runner, + ) + + assert result["steps"][0]["name"] == "checkout_branch" + assert result["steps"][-1]["name"] == "push_branch" + assert any("git checkout -b" in cmd for cmd, _ in commands) + assert any("git commit -m" in cmd for cmd, _ in commands) + assert any("git push -u origin" in cmd for cmd, _ in commands) + + +def test_execute_git_apply_plan_can_skip_push(tmp_path: Path): + commands = [] + + def _fake_runner(command: str, *, workdir=None): + commands.append(command) + return {"exit_code": 0, "output": "ok"} + + result = mod.execute_git_apply_plan( + hermes_repo=tmp_path / "hermes-agent", + branch_name="evolution/github-code-review-20260414_180000", + target_skill_path=tmp_path / "hermes-agent/skills/github/github-code-review/SKILL.md", + candidate_skill_file=tmp_path / "candidate_skill.md", + commit_message="feat: evolve skill github-code-review (candidate)", + run_push=False, + runner=_fake_runner, + ) + + assert all(step["name"] != "push_branch" for step in result["steps"]) + assert not any("git push -u origin" in cmd for cmd in commands) + + +def test_execute_gh_pr_create_runs_only_when_command_is_present(tmp_path: Path): + commands = [] + + def _fake_runner(command: str, *, workdir=None): + commands.append((command, workdir)) + return {"exit_code": 0, "output": "created"} + + result = mod.execute_gh_pr_create( + command="gh pr create --base main --head evolution/test --title 'x' --body-file body.md", + hermes_repo=tmp_path / "hermes-agent", + runner=_fake_runner, + ) + + assert result["executed"] is True + assert commands[0][0].startswith("gh pr create") + + +def test_execute_full_git_pr_automation_skips_pr_creation_without_command(tmp_path: Path): + calls = [] + + def _fake_execute_git_apply_plan(**kwargs): + calls.append(("apply", kwargs)) + return {"steps": [{"name": "checkout_branch"}]} + + def _fake_execute_gh_pr_create(**kwargs): + calls.append(("pr", kwargs)) + return {"executed": False} + + result = mod.execute_git_pr_automation( + git_apply_plan={ + "hermes_repo": tmp_path / "hermes-agent", + "branch_name": "evolution/github-code-review-20260414_180000", + "target_skill_path": tmp_path / "hermes-agent/skills/github/github-code-review/SKILL.md", + "candidate_skill_file": tmp_path / "candidate_skill.md", + "commit_message": "feat: evolve skill github-code-review (candidate)", + }, + gh_pr_create_command=None, + execute_push=True, + execute_pr=False, + execute_git_apply_plan_fn=_fake_execute_git_apply_plan, + execute_gh_pr_create_fn=_fake_execute_gh_pr_create, + ) + + assert result["git"]["steps"][0]["name"] == "checkout_branch" + assert result["pr"] is None + assert [kind for kind, _ in calls] == ["apply"] diff --git a/tests/core/test_hermes_eval.py b/tests/core/test_hermes_eval.py new file mode 100644 index 0000000..98847ff --- /dev/null +++ b/tests/core/test_hermes_eval.py @@ -0,0 +1,203 @@ +"""Tests for real Hermes Agent-backed skill evaluation helpers.""" + +from types import SimpleNamespace + +import pytest + +from evolution.core.hermes_eval import ( + HermesSkillEvalCase, + _load_hermes_symbols, + build_skill_system_prompt, + run_skill_eval, +) + + +class FakeAgent: + last_init = None + last_user_message = None + + def __init__(self, **kwargs): + FakeAgent.last_init = kwargs + + def run_conversation(self, user_message: str, conversation_history=None): + FakeAgent.last_user_message = user_message + return 
{"final_response": "done", "messages": []} + + +class InlineOnlyAgent(FakeAgent): + pass + + +@pytest.fixture(autouse=True) +def reset_fake_agent(): + FakeAgent.last_init = None + FakeAgent.last_user_message = None + + +@pytest.fixture +def fake_loader(monkeypatch): + def _fake_loader(hermes_repo=None): + def _build_preloaded_skills_prompt(skill_identifiers, task_id=None): + assert skill_identifiers == ["github-code-review"] + return ("[SYSTEM: skill prompt]", ["github-code-review"], []) + + return SimpleNamespace( + AIAgent=FakeAgent, + build_preloaded_skills_prompt=_build_preloaded_skills_prompt, + ) + + monkeypatch.setattr("evolution.core.hermes_eval._load_hermes_symbols", _fake_loader) + + +def test_build_skill_system_prompt_can_inline_alternate_skill_body(monkeypatch): + def _fake_loader(hermes_repo=None): + def _build_preloaded_skills_prompt(skill_identifiers, task_id=None): + return ("[SYSTEM: original skill prompt]", ["github-code-review"], []) + + return SimpleNamespace( + AIAgent=InlineOnlyAgent, + build_preloaded_skills_prompt=_build_preloaded_skills_prompt, + ) + + monkeypatch.setattr("evolution.core.hermes_eval._load_hermes_symbols", _fake_loader) + + prompt, loaded = build_skill_system_prompt( + "github-code-review", + skill_body_override="# EVOLVED\nUse stricter review criteria.", + ) + + assert loaded == ["github-code-review"] + assert "original skill prompt" not in prompt + assert "EVOLVED" in prompt + assert "stricter review criteria" in prompt + + +def test_load_hermes_symbols_imports_from_repo(monkeypatch, tmp_path): + repo = tmp_path / "hermes-agent" + (repo / "agent").mkdir(parents=True) + (repo / "run_agent.py").write_text( + 'class AIAgent:\n' + ' pass\n' + ) + (repo / "agent" / "skill_commands.py").write_text( + 'def build_preloaded_skills_prompt(skill_identifiers, task_id=None):\n' + ' return "prompt", ["loaded"], []\n' + ) + + symbols = _load_hermes_symbols(repo) + + assert symbols.AIAgent.__name__ == "AIAgent" + prompt, loaded, missing = symbols.build_preloaded_skills_prompt(["x"]) + assert prompt == "prompt" + assert loaded == ["loaded"] + assert missing == [] + + +def test_load_hermes_symbols_tolerates_missing_fire_dependency(monkeypatch, tmp_path): + repo = tmp_path / "hermes-agent" + (repo / "agent").mkdir(parents=True) + (repo / "run_agent.py").write_text( + 'import fire\n' + 'class AIAgent:\n' + ' pass\n' + ) + (repo / "agent" / "skill_commands.py").write_text( + 'def build_preloaded_skills_prompt(skill_identifiers, task_id=None):\n' + ' return "prompt", ["loaded"], []\n' + ) + monkeypatch.delitem(__import__("sys").modules, "fire", raising=False) + + symbols = _load_hermes_symbols(repo) + + assert symbols.AIAgent.__name__ == "AIAgent" + + + +def test_load_hermes_symbols_adds_repo_to_sys_path_for_local_imports(tmp_path): + repo = tmp_path / "hermes-agent" + (repo / "agent").mkdir(parents=True) + (repo / "hermes_constants.py").write_text('VALUE = "ok"\n') + (repo / "run_agent.py").write_text( + 'from hermes_constants import VALUE\n' + 'class AIAgent:\n' + ' value = VALUE\n' + ) + (repo / "agent" / "skill_commands.py").write_text( + 'def build_preloaded_skills_prompt(skill_identifiers, task_id=None):\n' + ' return "prompt", ["loaded"], []\n' + ) + + symbols = _load_hermes_symbols(repo) + + assert symbols.AIAgent.value == "ok" + + +def test_build_skill_system_prompt_returns_loaded_skill_prompt(fake_loader): + prompt, loaded = build_skill_system_prompt("github-code-review", hermes_repo="/tmp/hermes") + + assert prompt == "[SYSTEM: skill prompt]" + assert loaded 
== ["github-code-review"] + + +def test_build_skill_system_prompt_raises_for_missing_skill(monkeypatch): + def _fake_loader(hermes_repo=None): + def _build_preloaded_skills_prompt(skill_identifiers, task_id=None): + return ("", [], ["missing-skill"]) + + return SimpleNamespace( + AIAgent=FakeAgent, + build_preloaded_skills_prompt=_build_preloaded_skills_prompt, + ) + + monkeypatch.setattr("evolution.core.hermes_eval._load_hermes_symbols", _fake_loader) + + with pytest.raises(ValueError, match="missing-skill"): + build_skill_system_prompt("github-code-review") + + +def test_run_skill_eval_uses_real_agent_shape_and_preloaded_skill_prompt(fake_loader): + case = HermesSkillEvalCase( + skill_name="github-code-review", + task_input="Review this diff for security issues.", + system_prompt="[SYSTEM: custom evaluator instructions]", + ) + + result = run_skill_eval(case, model="openai/gpt-4.1-mini", hermes_repo="/tmp/hermes") + + assert result.final_response == "done" + assert result.loaded_skills == ["github-code-review"] + assert "custom evaluator instructions" in result.effective_system_prompt + assert "skill prompt" in result.effective_system_prompt + assert FakeAgent.last_user_message == "Review this diff for security issues." + assert FakeAgent.last_init["model"] == "openai/gpt-4.1-mini" + assert FakeAgent.last_init["quiet_mode"] is True + assert FakeAgent.last_init["skip_context_files"] is True + assert FakeAgent.last_init["skip_memory"] is True + assert FakeAgent.last_init["ephemeral_system_prompt"] == result.effective_system_prompt + + +def test_run_skill_eval_accepts_string_agent_result(fake_loader, monkeypatch): + class StringAgent(FakeAgent): + def run_conversation(self, user_message: str, conversation_history=None): + return "plain string response" + + def _fake_loader(hermes_repo=None): + def _build_preloaded_skills_prompt(skill_identifiers, task_id=None): + return ("[SYSTEM: skill prompt]", ["github-code-review"], []) + + return SimpleNamespace( + AIAgent=StringAgent, + build_preloaded_skills_prompt=_build_preloaded_skills_prompt, + ) + + monkeypatch.setattr("evolution.core.hermes_eval._load_hermes_symbols", _fake_loader) + + result = run_skill_eval( + HermesSkillEvalCase( + skill_name="github-code-review", + task_input="Review this diff.", + ) + ) + + assert result.final_response == "plain string response" + assert result.raw_result == "plain string response" diff --git a/tests/core/test_report_artifact.py b/tests/core/test_report_artifact.py new file mode 100644 index 0000000..0097985 --- /dev/null +++ b/tests/core/test_report_artifact.py @@ -0,0 +1,205 @@ +"""Tests for report artifact helpers.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from evolution.core import report_artifact as mod + + +def sample_metrics() -> dict: + return { + "skill_name": "github-code-review", + "timestamp": "20260414_180000", + "iterations": 5, + "optimizer_model": "openai/gpt-4.1", + "eval_model": "openai/gpt-4.1-mini", + "eval_backend": "hermes", + "baseline_score": 0.42, + "evolved_score": 0.57, + "improvement": 0.15, + "baseline_size": 1000, + "evolved_size": 1100, + "train_examples": 10, + "val_examples": 5, + "holdout_examples": 5, + "elapsed_seconds": 12.5, + "constraints_passed": True, + "tblite_gate": { + "passed": True, + "mode": "fast", + "task_filter": "broken-python,pandas-etl", + "base_config_path": "/tmp/local.yaml", + "baseline_pass_rate": 0.50, + "evolved_pass_rate": 0.48, + "delta": -0.02, + "threshold": 0.02, + "summary": "TBLite fast gate 
passed", + }, + } + + +def test_build_evolution_report_contains_decision_and_key_metrics(): + report = mod.build_evolution_report( + metrics=sample_metrics(), + baseline_skill_path="output/github-code-review/20260414_180000/baseline_skill.md", + evolved_skill_path="output/github-code-review/20260414_180000/evolved_skill.md", + ) + + assert "# Evolution Report" in report + assert "github-code-review" in report + assert "candidate_for_review" in report + assert "TBLite fast gate passed" in report + assert "output/github-code-review/20260414_180000/evolved_skill.md" in report + + + +def test_summarize_recommendation_rejects_on_failed_gate(): + metrics = sample_metrics() + metrics["tblite_gate"]["passed"] = False + metrics["tblite_gate"]["summary"] = "TBLite fast regression detected" + + summary = mod.summarize_recommendation(metrics) + + assert summary["decision"] == "reject" + assert "benchmark_gate_failed" in summary["reasons"] + + + +def test_write_report_artifacts_creates_markdown_and_summary_json(tmp_path: Path): + metrics = sample_metrics() + + paths = mod.write_report_artifacts( + output_dir=tmp_path, + metrics=metrics, + baseline_skill_path=tmp_path / "baseline_skill.md", + evolved_skill_path=tmp_path / "evolved_skill.md", + ) + + assert paths["report_md"].exists() + assert paths["summary_json"].exists() + + summary = json.loads(paths["summary_json"].read_text()) + assert summary["decision"] == "candidate_for_review" + assert summary["skill_name"] == "github-code-review" + + + +def test_write_pr_ready_artifacts_creates_pr_files(tmp_path: Path): + metrics = sample_metrics() + + paths = mod.write_pr_ready_artifacts( + output_dir=tmp_path, + metrics=metrics, + baseline_skill_path=tmp_path / "baseline_skill.md", + evolved_skill_path=tmp_path / "evolved_skill.md", + ) + + assert paths["pr_draft_md"].exists() + assert paths["review_checklist_md"].exists() + assert paths["diff_summary_md"].exists() + + + +def test_build_pr_draft_marks_review_candidate_and_contains_sections(tmp_path: Path): + metrics = sample_metrics() + + draft = mod.build_pr_draft( + metrics=metrics, + baseline_skill_path=tmp_path / "baseline_skill.md", + evolved_skill_path=tmp_path / "evolved_skill.md", + ) + + assert "# PR Draft" in draft + assert "candidate_for_review" in draft + assert "## Summary" in draft + assert "## Test Plan" in draft + + + +def test_build_review_checklist_mentions_gate_and_holdout(): + checklist = mod.build_review_checklist(sample_metrics()) + + assert "Holdout improvement looks meaningful" in checklist + assert "TBLite gate result reviewed" in checklist + + + +def test_build_diff_summary_mentions_size_and_improvement(tmp_path: Path): + summary = mod.build_diff_summary( + metrics=sample_metrics(), + baseline_skill_path=tmp_path / "baseline_skill.md", + evolved_skill_path=tmp_path / "evolved_skill.md", + ) + + assert "Skill size delta" in summary + assert "+0.150" in summary + assert "baseline_skill.md" in summary + + +def test_build_github_pr_body_contains_github_sections_and_artifact_paths(tmp_path: Path): + body = mod.build_github_pr_body( + metrics=sample_metrics(), + baseline_skill_path=tmp_path / "baseline_skill.md", + evolved_skill_path=tmp_path / "evolved_skill.md", + report_path=tmp_path / "report.md", + summary_path=tmp_path / "summary.json", + diff_summary_path=tmp_path / "diff_summary.md", + review_checklist_path=tmp_path / "review_checklist.md", + ) + + assert "## Summary" in body + assert "## Evaluation Evidence" in body + assert "## Artifacts" in body + assert "## Test Plan" in body 
+ assert "report.md" in body + assert "review_checklist.md" in body + + +def test_build_gh_pr_create_command_marks_review_needed_variants_as_draft(tmp_path: Path): + metrics = sample_metrics() + metrics["improvement"] = 0.0 + metrics["evolved_score"] = metrics["baseline_score"] + + command = mod.build_gh_pr_create_command( + metrics=metrics, + body_path=tmp_path / "github_pr_body.md", + ) + + assert command is not None + assert "gh pr create" in command + assert "--draft" in command + assert "--body-file" in command + + +def test_build_gh_pr_create_command_skips_rejected_variants(tmp_path: Path): + metrics = sample_metrics() + metrics["tblite_gate"]["passed"] = False + metrics["tblite_gate"]["summary"] = "TBLite fast regression detected" + + command = mod.build_gh_pr_create_command( + metrics=metrics, + body_path=tmp_path / "github_pr_body.md", + ) + + assert command is None + + +def test_write_github_pr_artifacts_writes_body_and_command_for_candidates(tmp_path: Path): + paths = mod.write_github_pr_artifacts( + output_dir=tmp_path, + metrics=sample_metrics(), + baseline_skill_path=tmp_path / "baseline_skill.md", + evolved_skill_path=tmp_path / "evolved_skill.md", + report_path=tmp_path / "report.md", + summary_path=tmp_path / "summary.json", + diff_summary_path=tmp_path / "diff_summary.md", + review_checklist_path=tmp_path / "review_checklist.md", + ) + + assert paths["github_pr_body_md"].exists() + assert paths["gh_pr_create_command_txt"].exists() + command = paths["gh_pr_create_command_txt"].read_text() + assert "gh pr create" in command + assert "--title" in command diff --git a/tests/skills/test_evolve_skill.py b/tests/skills/test_evolve_skill.py new file mode 100644 index 0000000..cc9b315 --- /dev/null +++ b/tests/skills/test_evolve_skill.py @@ -0,0 +1,614 @@ +"""Tests for evolve_skill orchestration helpers.""" + +import os +from types import SimpleNamespace + +from evolution.core.dataset_builder import EvalExample, EvalDataset +from evolution.skills import evolve_skill as mod + + +class FakeModule: + def __init__(self, skill_text): + self.skill_text = skill_text + + def __call__(self, task_input: str): + return SimpleNamespace(output=f"base::{task_input}") + + +class FakeOptimizedModule: + def __init__(self, skill_text): + self.skill_text = skill_text + + def __call__(self, task_input: str): + return SimpleNamespace(output=f"evolved::{task_input}") + + +def sample_dataset(): + return EvalDataset( + train=[EvalExample(task_input="train task", expected_behavior="train rubric")], + val=[EvalExample(task_input="val task", expected_behavior="val rubric")], + holdout=[ + EvalExample(task_input="task one", expected_behavior="rubric one"), + EvalExample(task_input="task two", expected_behavior="rubric two"), + ], + ) + + +def test_evaluate_holdout_with_dspy_backend(monkeypatch): + monkeypatch.setattr(mod, "skill_fitness_metric", lambda ex, pred: 0.8 if pred.output.startswith("base::") else 0.95) + + baseline, evolved = mod.evaluate_holdout( + dataset=sample_dataset(), + eval_backend="dspy", + baseline_module=FakeModule("BASE"), + evolved_module=FakeOptimizedModule("EVOLVED"), + baseline_skill_body="BASE", + evolved_skill_body="EVOLVED", + eval_model="openai/gpt-4.1-mini", + hermes_repo="/tmp/hermes", + ) + + assert baseline == [0.8, 0.8] + assert evolved == [0.95, 0.95] + + + +def test_evaluate_holdout_with_hermes_backend(monkeypatch): + calls = [] + + def _fake_run_skill_eval(case, **kwargs): + calls.append({ + "skill_name": case.skill_name, + "task_input": case.task_input, + 
"skill_body_override": kwargs.get("skill_body_override"), + "agent_kwargs": kwargs.get("agent_kwargs"), + }) + return SimpleNamespace(final_response=f"resp::{case.task_input}", raw_result={}) + + def score_by_body(skill_body): + return 0.4 if skill_body == "BASE" else 0.9 + + monkeypatch.setattr(mod, "run_skill_eval", _fake_run_skill_eval) + monkeypatch.setattr(mod, "score_output_against_example", lambda **kwargs: score_by_body(kwargs["skill_body"])) + + baseline, evolved = mod.evaluate_holdout( + dataset=sample_dataset(), + eval_backend="hermes", + baseline_module=FakeModule("BASE"), + evolved_module=FakeOptimizedModule("EVOLVED"), + baseline_skill_body="BASE", + evolved_skill_body="EVOLVED", + eval_model="openai/gpt-4.1-mini", + hermes_repo="/tmp/hermes", + skill_name="github-code-review", + ) + + assert baseline == [0.4, 0.4] + assert evolved == [0.9, 0.9] + assert calls == [ + { + "skill_name": "github-code-review", + "task_input": "task one", + "skill_body_override": "BASE", + "agent_kwargs": {"max_iterations": mod.DEFAULT_HERMES_EVAL_MAX_ITERATIONS}, + }, + { + "skill_name": "github-code-review", + "task_input": "task one", + "skill_body_override": "EVOLVED", + "agent_kwargs": {"max_iterations": mod.DEFAULT_HERMES_EVAL_MAX_ITERATIONS}, + }, + { + "skill_name": "github-code-review", + "task_input": "task two", + "skill_body_override": "BASE", + "agent_kwargs": {"max_iterations": mod.DEFAULT_HERMES_EVAL_MAX_ITERATIONS}, + }, + { + "skill_name": "github-code-review", + "task_input": "task two", + "skill_body_override": "EVOLVED", + "agent_kwargs": {"max_iterations": mod.DEFAULT_HERMES_EVAL_MAX_ITERATIONS}, + }, + ] + + + +def test_maybe_run_tblite_gate_skips_when_disabled(): + result = mod.maybe_run_tblite_gate( + run_tblite=False, + skill_name="github-code-review", + baseline_skill_body="BASE", + evolved_skill_body="EVOLVED", + hermes_repo="/tmp/hermes", + ) + + assert result is None + + + +def test_maybe_run_tblite_gate_invokes_gate_when_enabled(monkeypatch): + calls = [] + + def _fake_gate(**kwargs): + calls.append(kwargs) + return SimpleNamespace(passed=True, delta=0.01) + + monkeypatch.setattr(mod, "run_tblite_benchmark_gate", _fake_gate) + + result = mod.maybe_run_tblite_gate( + run_tblite=True, + skill_name="github-code-review", + baseline_skill_body="BASE", + evolved_skill_body="EVOLVED", + hermes_repo="/tmp/hermes", + tblite_regression_threshold=0.03, + tblite_task_filter="broken-python", + tblite_mode="fast", + ) + + assert result.passed is True + assert calls == [{ + "skill_name": "github-code-review", + "baseline_skill_body": "BASE", + "evolved_skill_body": "EVOLVED", + "hermes_repo": "/tmp/hermes", + "regression_threshold": 0.03, + "task_filter": "broken-python", + "mode": "fast", + }] + + + +def test_write_report_artifacts_delegates_to_report_module(tmp_path, monkeypatch): + metrics = {"skill_name": "github-code-review"} + called = {} + + def _fake_write_report_artifacts(**kwargs): + called.update(kwargs) + return { + "report_md": tmp_path / "report.md", + "summary_json": tmp_path / "summary.json", + } + + monkeypatch.setattr(mod, "write_report_artifacts", _fake_write_report_artifacts) + + result = mod.write_evolution_report_artifacts( + output_dir=tmp_path, + metrics=metrics, + baseline_skill_path=tmp_path / "baseline.md", + evolved_skill_path=tmp_path / "evolved.md", + ) + + assert result["report_md"] == tmp_path / "report.md" + assert called["metrics"] == metrics + assert called["baseline_skill_path"] == tmp_path / "baseline.md" + + + +def 
test_write_pr_ready_artifacts_delegates_to_report_module(tmp_path, monkeypatch): + metrics = {"skill_name": "github-code-review"} + called = {} + + def _fake_write_pr_ready_artifacts(**kwargs): + called.update(kwargs) + return {"pr_draft_md": tmp_path / "pr_draft.md"} + + monkeypatch.setattr(mod, "write_pr_ready_artifacts", _fake_write_pr_ready_artifacts) + + result = mod.write_evolution_pr_ready_artifacts( + output_dir=tmp_path, + metrics=metrics, + baseline_skill_path=tmp_path / "baseline.md", + evolved_skill_path=tmp_path / "evolved.md", + ) + + assert result["pr_draft_md"] == tmp_path / "pr_draft.md" + assert called["evolved_skill_path"] == tmp_path / "evolved.md" + + + +def test_write_github_pr_artifacts_delegates_to_report_module(tmp_path, monkeypatch): + metrics = {"skill_name": "github-code-review"} + called = {} + + def _fake_write_github_pr_artifacts(**kwargs): + called.update(kwargs) + return {"github_pr_body_md": tmp_path / "github_pr_body.md"} + + monkeypatch.setattr(mod, "write_github_pr_artifacts", _fake_write_github_pr_artifacts) + + result = mod.write_evolution_github_pr_artifacts( + output_dir=tmp_path, + metrics=metrics, + baseline_skill_path=tmp_path / "baseline.md", + evolved_skill_path=tmp_path / "evolved.md", + report_path=tmp_path / "report.md", + summary_path=tmp_path / "summary.json", + diff_summary_path=tmp_path / "diff_summary.md", + review_checklist_path=tmp_path / "review_checklist.md", + ) + + assert result["github_pr_body_md"] == tmp_path / "github_pr_body.md" + assert called["report_path"] == tmp_path / "report.md" + assert called["review_checklist_path"] == tmp_path / "review_checklist.md" + + + +def test_write_git_pr_automation_artifacts_delegates_to_automation_module(tmp_path, monkeypatch): + metrics = {"skill_name": "github-code-review"} + called = {} + + def _fake_write_git_pr_automation_artifacts(**kwargs): + called.update(kwargs) + return {"git_apply_plan_sh": tmp_path / "git_apply_plan.sh"} + + monkeypatch.setattr(mod, "write_git_pr_automation_artifacts", _fake_write_git_pr_automation_artifacts) + + result = mod.write_evolution_git_pr_automation_artifacts( + output_dir=tmp_path, + metrics=metrics, + hermes_repo=tmp_path / "hermes-agent", + skill_relpath="github/github-code-review/SKILL.md", + evolved_skill_text="# evolved\n", + github_pr_body_path=tmp_path / "github_pr_body.md", + ) + + assert result["git_apply_plan_sh"] == tmp_path / "git_apply_plan.sh" + assert called["skill_relpath"] == "github/github-code-review/SKILL.md" + assert called["github_pr_body_path"] == tmp_path / "github_pr_body.md" + + + +def test_execute_evolution_git_pr_automation_delegates_to_execution_module(tmp_path, monkeypatch): + called = {} + + def _fake_execute_git_pr_automation(**kwargs): + called.update(kwargs) + return {"git": {"steps": []}, "pr": None} + + monkeypatch.setattr(mod, "execute_git_pr_automation", _fake_execute_git_pr_automation) + + result = mod.execute_evolution_git_pr_automation( + git_apply_plan={"hermes_repo": tmp_path / "hermes-agent"}, + gh_pr_create_command="gh pr create --draft", + execute_push=True, + execute_pr=False, + ) + + assert result["git"]["steps"] == [] + assert called["execute_push"] is True + assert called["execute_pr"] is False + + +def test_git_apply_is_skipped_when_tblite_gate_fails(tmp_path, monkeypatch): + monkeypatch.setattr(mod, "load_default_env_files", lambda: []) + monkeypatch.setattr(mod, "resolve_runtime_model_settings", lambda **kwargs: ( + kwargs["optimizer_model"], + kwargs["eval_model"], + {}, + )) + monkeypatch.setattr( 
+ mod, + "find_skill", + lambda skill_name, hermes_repo: tmp_path / "hermes-agent" / "skills" / skill_name / "SKILL.md", + ) + monkeypatch.setattr(mod, "load_skill", lambda path: { + "raw": "---\nname: dogfood\ndescription: test\n---\n\n# Body", + "body": "# Body", + "frontmatter": {"name": "dogfood", "description": "test"}, + "name": "dogfood", + "description": "test", + }) + + dataset = sample_dataset() + monkeypatch.setattr(mod.GoldenDatasetLoader, "load", lambda path: dataset) + + class FakeValidator: + def validate_all(self, artifact_text, artifact_type, baseline_text=None): + return [SimpleNamespace(passed=True, constraint_name="ok", message="ok")] + + monkeypatch.setattr(mod, "ConstraintValidator", lambda config: FakeValidator()) + monkeypatch.setattr(mod.dspy, "LM", lambda model: object()) + monkeypatch.setattr(mod.dspy, "configure", lambda **kwargs: None) + monkeypatch.setattr(mod, "SkillModule", lambda text: FakeModule(text)) + monkeypatch.setattr(mod, "optimize_skill_module", lambda **kwargs: ("GEPA", FakeOptimizedModule("# Evolved"))) + monkeypatch.setattr(mod, "evaluate_holdout", lambda **kwargs: ([0.4, 0.4], [0.9, 0.9])) + monkeypatch.setattr( + mod, + "maybe_run_tblite_gate", + lambda **kwargs: SimpleNamespace( + passed=False, + baseline_pass_rate=0.6, + evolved_pass_rate=0.4, + delta=-0.2, + threshold=0.02, + summary="TBLite fast regression detected", + mode="fast", + task_filter="broken-python", + base_config_path=tmp_path / "local.yaml", + ), + ) + monkeypatch.setattr(mod, "write_evolution_report_artifacts", lambda **kwargs: { + "report_md": tmp_path / "report.md", + "summary_json": tmp_path / "summary.json", + }) + monkeypatch.setattr(mod, "write_evolution_pr_ready_artifacts", lambda **kwargs: { + "pr_draft_md": tmp_path / "pr_draft.md", + "review_checklist_md": tmp_path / "review_checklist.md", + "diff_summary_md": tmp_path / "diff_summary.md", + }) + monkeypatch.setattr(mod, "write_evolution_github_pr_artifacts", lambda **kwargs: { + "github_pr_body_md": tmp_path / "github_pr_body.md", + "gh_pr_create_command_txt": tmp_path / "gh_pr_create_command.txt", + }) + monkeypatch.setattr(mod, "write_evolution_git_pr_automation_artifacts", lambda **kwargs: { + "candidate_skill_file": tmp_path / "candidate_skill_patch.md", + "git_apply_plan_sh": tmp_path / "git_apply_plan.sh", + "git_apply_plan_md": tmp_path / "git_apply_plan.md", + "gh_pr_create_after_push_txt": None, + }) + + execute_calls = [] + monkeypatch.setattr(mod, "execute_evolution_git_pr_automation", lambda **kwargs: execute_calls.append(kwargs)) + + mod.evolve( + skill_name="dogfood", + eval_source="golden", + dataset_path=str(tmp_path / "dataset"), + hermes_repo=str(tmp_path / "hermes-agent"), + eval_backend="hermes", + run_tblite=True, + execute_git_apply=True, + execute_push=True, + execute_pr=False, + ) + + assert execute_calls == [] + + +def test_load_env_file_sets_missing_values_without_overwriting_existing(tmp_path, monkeypatch): + env_path = tmp_path / ".env" + env_path.write_text( + "# comment\n" + "OPENAI_API_KEY=loaded-key\n" + "export OPENAI_BASE_URL='https://example.test/v1'\n" + ) + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.setenv("OPENAI_BASE_URL", "https://keep-me") + + loaded = mod.load_env_file(env_path) + + assert loaded == {"OPENAI_API_KEY": "loaded-key"} + assert os.environ["OPENAI_API_KEY"] == "loaded-key" + assert os.environ["OPENAI_BASE_URL"] == "https://keep-me" + + + +def test_load_default_env_files_prefers_home_hermes_env(tmp_path, monkeypatch): + hermes_home = 
tmp_path / ".hermes" + hermes_home.mkdir() + env_path = hermes_home / ".env" + env_path.write_text("OPENAI_API_KEY=from-hermes-home\n") + monkeypatch.setattr(mod.Path, "home", lambda: tmp_path) + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + + loaded_paths = mod.load_default_env_files() + + assert env_path in loaded_paths + assert os.environ["OPENAI_API_KEY"] == "from-hermes-home" + + + +def test_resolve_runtime_model_settings_uses_custom_hermes_config_defaults(tmp_path, monkeypatch): + config_path = tmp_path / "config.yaml" + config_path.write_text( + "model:\n" + " provider: custom\n" + " default: gpt-5.4\n" + " base_url: https://custom.example/v1\n" + ) + monkeypatch.delenv("OPENAI_BASE_URL", raising=False) + monkeypatch.delenv("OPENAI_API_BASE", raising=False) + + optimizer_model, eval_model, applied_env = mod.resolve_runtime_model_settings( + optimizer_model="openai/gpt-4.1", + eval_model="openai/gpt-4.1-mini", + config_path=config_path, + ) + + assert optimizer_model == "gpt-5.4" + assert eval_model == "gpt-5.4" + assert applied_env == { + "OPENAI_BASE_URL": "https://custom.example/v1", + "OPENAI_API_BASE": "https://custom.example/v1", + } + assert os.environ["OPENAI_BASE_URL"] == "https://custom.example/v1" + assert os.environ["OPENAI_API_BASE"] == "https://custom.example/v1" + + + +def test_resolve_runtime_model_settings_preserves_explicit_models_and_env(tmp_path, monkeypatch): + config_path = tmp_path / "config.yaml" + config_path.write_text( + "model:\n" + " provider: custom\n" + " default: gpt-5.4\n" + " base_url: https://custom.example/v1\n" + ) + monkeypatch.setenv("OPENAI_BASE_URL", "https://keep.example/v1") + monkeypatch.delenv("OPENAI_API_BASE", raising=False) + + optimizer_model, eval_model, applied_env = mod.resolve_runtime_model_settings( + optimizer_model="anthropic/claude-sonnet-4.6", + eval_model="google/gemini-3-flash", + config_path=config_path, + ) + + assert optimizer_model == "anthropic/claude-sonnet-4.6" + assert eval_model == "google/gemini-3-flash" + assert applied_env == {"OPENAI_API_BASE": "https://custom.example/v1"} + assert os.environ["OPENAI_BASE_URL"] == "https://keep.example/v1" + assert os.environ["OPENAI_API_BASE"] == "https://custom.example/v1" + + + +def test_normalized_reflection_lm_preserves_list_shape_for_none_outputs(): + lm = mod.NormalizedReflectionLM(lambda prompt: [None]) + + assert lm("prompt") == [""] + + + +def test_normalized_reflection_lm_wraps_scalar_outputs_as_single_item_list(): + assert mod.NormalizedReflectionLM(lambda prompt: "hello")("prompt") == ["hello"] + assert mod.NormalizedReflectionLM(lambda prompt: {"text": "hello"})("prompt") == [{"text": "hello"}] + + + +def test_create_gepa_optimizer_uses_legacy_max_steps_when_supported(monkeypatch): + called = {} + + class FakeLegacyGEPA: + def __init__(self, **kwargs): + called.update(kwargs) + + monkeypatch.setattr(mod.dspy, "GEPA", FakeLegacyGEPA) + monkeypatch.setattr(mod.inspect, "signature", lambda obj: SimpleNamespace(parameters={"metric": object(), "max_steps": object()})) + + optimizer = mod.create_gepa_optimizer(metric="metric", iterations=7) + + assert isinstance(optimizer, FakeLegacyGEPA) + assert called == {"metric": "metric", "max_steps": 7} + + + +def test_create_gepa_optimizer_adapts_to_new_signature(monkeypatch): + called = {} + + class FakeModernGEPA: + def __init__(self, **kwargs): + called.update(kwargs) + + class FakeLM: + def __init__(self, model): + self.model = model + + def __call__(self, prompt): + return ["normalized-text"] + + 
monkeypatch.setattr(mod.dspy, "GEPA", FakeModernGEPA) + monkeypatch.setattr(mod.dspy, "LM", FakeLM) + monkeypatch.setattr( + mod.inspect, + "signature", + lambda obj: SimpleNamespace(parameters={ + "metric": object(), + "reflection_lm": object(), + "max_full_evals": object(), + }), + ) + + optimizer = mod.create_gepa_optimizer(metric="metric", iterations=3, optimizer_model="openai/gpt-4.1") + + assert isinstance(optimizer, FakeModernGEPA) + assert called["metric"] == "metric" + assert called["max_full_evals"] == 3 + assert called["reflection_lm"]("prompt") == ["normalized-text"] + + + +def test_create_gepa_optimizer_wraps_legacy_metric_for_new_gepa(monkeypatch): + called = {} + + class FakeModernGEPA: + def __init__(self, **kwargs): + called.update(kwargs) + + def _metric(example, prediction, trace=None): + return 0.42 + + class FakeLM: + def __init__(self, model): + self.model = model + + def __call__(self, prompt): + return [f"lm::{self.model}"] + + monkeypatch.setattr(mod.dspy, "GEPA", FakeModernGEPA) + monkeypatch.setattr(mod.dspy, "LM", FakeLM) + + def _fake_signature(obj): + if obj is FakeModernGEPA: + return SimpleNamespace(parameters={ + "metric": object(), + "reflection_lm": object(), + "max_full_evals": object(), + }) + if obj is _metric: + return SimpleNamespace(parameters={"example": object(), "prediction": object(), "trace": object()}) + raise AssertionError(f"unexpected object: {obj}") + + monkeypatch.setattr(mod.inspect, "signature", _fake_signature) + + mod.create_gepa_optimizer(metric=_metric, iterations=2, optimizer_model="openai/gpt-4.1") + + assert called["reflection_lm"]("prompt") == ["lm::openai/gpt-4.1"] + assert called["max_full_evals"] == 2 + assert called["metric"]("gold", "pred", "trace", "pred_name", "pred_trace") == 0.42 + + + +def test_optimize_skill_module_falls_back_to_miprov2_when_gepa_fails(monkeypatch): + calls = [] + + def _fake_create_gepa_optimizer(**kwargs): + raise TypeError("old/new API mismatch") + + class FakeMIPRO: + def __init__(self, **kwargs): + calls.append(("mipro_init", kwargs)) + + def compile(self, baseline_module, **kwargs): + calls.append(("mipro_compile", kwargs)) + return "optimized-module" + + monkeypatch.setattr(mod, "create_gepa_optimizer", _fake_create_gepa_optimizer) + monkeypatch.setattr(mod.dspy, "MIPROv2", FakeMIPRO) + + optimizer_name, optimized = mod.optimize_skill_module( + baseline_module="baseline", + trainset=["train"], + valset=["val"], + iterations=2, + metric="metric", + ) + + assert optimizer_name == "MIPROv2" + assert optimized == "optimized-module" + assert calls == [ + ("mipro_init", {"metric": "metric", "auto": "light"}), + ("mipro_compile", {"trainset": ["train"], "valset": ["val"]}), + ] + + +def test_validate_skill_constraints_uses_full_skill_text_for_structure(monkeypatch): + calls = [] + + class FakeValidator: + def validate_all(self, artifact_text, artifact_type, baseline_text=None): + calls.append((artifact_text, artifact_type, baseline_text)) + return [SimpleNamespace(passed=True, constraint_name="skill_structure", message="ok")] + + result = mod.validate_skill_constraints( + validator=FakeValidator(), + full_skill_text="---\nname: dogfood\ndescription: d\n---\n\n# Body", + baseline_full_text="---\nname: base\ndescription: d\n---\n\n# Base", + ) + + assert result[0].passed is True + assert calls == [( + "---\nname: dogfood\ndescription: d\n---\n\n# Body", + "skill", + "---\nname: base\ndescription: d\n---\n\n# Base", + )]
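+
+
+# Hedged usage sketch (not executed by the suite): the orchestration under
+# test is normally driven end-to-end like this. The repo path and skill name
+# are illustrative assumptions, and dry_run=True returns before the dataset
+# build and optimizer setup, so no model is ever called.
+#
+#     from evolution.skills.evolve_skill import evolve
+#     evolve(
+#         skill_name="github-code-review",
+#         iterations=2,
+#         eval_source="synthetic",
+#         eval_backend="dspy",
+#         hermes_repo="~/.hermes/hermes-agent",
+#         dry_run=True,
+#     )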