diff --git a/.gitignore b/.gitignore
index 33d91b6..c8aac99 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,11 +9,19 @@ build/
 .venv/
 venv/
 .env
+.pytest_cache/
 
 # Generated eval datasets (local, not shared)
 datasets/**/*.jsonl
 datasets/**/*.json
 !datasets/.gitkeep
+!datasets/skills/dogfood/baidu-homepage/train.jsonl
+!datasets/skills/dogfood/baidu-homepage/val.jsonl
+!datasets/skills/dogfood/baidu-homepage/holdout.jsonl
+
+# Generated run artifacts
+output/
+dogfood-output/
 
 # Evolution snapshots
 snapshots/
diff --git a/README.md b/README.md
index c59f9a6..9de5629 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,9 @@ GEPA reads execution traces to understand *why* things fail (not just that they
 # Install
 git clone https://github.com/NousResearch/hermes-agent-self-evolution.git
 cd hermes-agent-self-evolution
-pip install -e ".[dev]"
+uv venv --python 3.11 .venv
+source .venv/bin/activate
+uv pip install -e '.[dev]'
 
 # Point at your hermes-agent repo
 export HERMES_AGENT_REPO=~/.hermes/hermes-agent
@@ -49,6 +51,57 @@ python -m evolution.skills.evolve_skill \
   --eval-source sessiondb
 ```
 
+## Advanced Usage
+
+### Evaluate through a real Hermes runtime
+
+```bash
+python -m evolution.skills.evolve_skill \
+  --skill dogfood \
+  --eval-source golden \
+  --dataset-path datasets/skills/dogfood/baidu-homepage \
+  --eval-backend hermes
+```
+
+### Add an optional TBLite regression gate
+
+```bash
+python -m evolution.skills.evolve_skill \
+  --skill github-code-review \
+  --eval-source synthetic \
+  --run-tblite \
+  --tblite-mode fast
+```
+
+### Generate git / PR handoff artifacts or execute them directly
+
+```bash
+python -m evolution.skills.evolve_skill \
+  --skill github-code-review \
+  --eval-source synthetic \
+  --execute-git-apply
+
+python -m evolution.skills.evolve_skill \
+  --skill github-code-review \
+  --eval-source synthetic \
+  --execute-git-apply \
+  --execute-push \
+  --execute-pr
+```
+
+Notes:
+- By default the tool loads credentials from `~/.hermes/.env` and local `.env` when present, without overwriting already exported values.
+- When the local Hermes runtime uses `model.provider: custom`, default self-evolution model settings are aligned to the active Hermes config automatically.
+- `--execute-pr` requires `--execute-push`.
+
+## Golden Dataset Sample
+
+A browser-heavy golden sample for the `dogfood` skill is included at:
+
+- `datasets/skills/dogfood/baidu-homepage/`
+
+It captures both positive paths and blockers from a real Baidu homepage dogfood run, which makes it useful for evaluating browser QA skills more realistically than purely synthetic examples.
+
 ## What It Optimizes
 
 | Phase | Target | Engine | Status |
diff --git a/datasets/skills/dogfood/baidu-homepage/README.md b/datasets/skills/dogfood/baidu-homepage/README.md
new file mode 100644
index 0000000..732ff2e
--- /dev/null
+++ b/datasets/skills/dogfood/baidu-homepage/README.md
@@ -0,0 +1,34 @@
+# Dogfood Golden Sample: Baidu Homepage
+
+This dataset incorporates the real dogfood run against https://www.baidu.com/ on 2026-04-15 into the self-evolution sample set for the `dogfood` skill.
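+
+Every committed record carries the same five keys (`task_input`, `expected_behavior`, `difficulty`, `category`, `source`). As a quick orientation, the splits can be inspected with a sketch like the one below (the helper is illustrative only, not part of the pipeline):
+
+```python
+import json
+from pathlib import Path
+
+# Print a compact view of every golden case in one split.
+split = Path("datasets/skills/dogfood/baidu-homepage/train.jsonl")
+for line in split.read_text(encoding="utf-8").splitlines():
+    record = json.loads(line)
+    # Keys mirror the committed JSONL records.
+    print(f"[{record['difficulty']}/{record['category']}] {record['task_input'][:48]}")
+```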
+ +## Source Artifacts + +- Source report: `datasets/skills/dogfood/baidu-homepage/source_report.md` +- Dataset directory: `datasets/skills/dogfood/baidu-homepage` + +## What This Sample Covers + +- Homepage load health and console cleanliness +- Search submission flow +- Search suggestion relevance +- Wenxin assistant entry and back-navigation chain +- Top-nav News entry health + +## Why It Matters + +This sample gives `dogfood` a real browser-heavy golden set with both: + +- **positive paths**: homepage load, Wenxin single-turn QA, News page load +- **negative/blocking paths**: search flow interrupted by Baidu security verification, unrelated suggestions, unstable back-navigation chain + +That makes it more useful than a purely synthetic sample when evaluating whether the evolved skill: + +1. tests the intended user flows, +2. distinguishes blockers from non-blockers, +3. captures evidence correctly, and +4. writes a balanced QA report with both working and broken paths. + +## Notes + +- The source report is stored in-repo as text only; screenshot binaries from the original run are intentionally not committed. diff --git a/datasets/skills/dogfood/baidu-homepage/holdout.jsonl b/datasets/skills/dogfood/baidu-homepage/holdout.jsonl new file mode 100644 index 0000000..dbf0821 --- /dev/null +++ b/datasets/skills/dogfood/baidu-homepage/holdout.jsonl @@ -0,0 +1 @@ +{"task_input": "用 dogfood 测这个网站:https://www.baidu.com/,重点看顶部导航中的“新闻”入口,并总结哪些路径正常、哪些路径被阻断。", "expected_behavior": "应验证“新闻”入口能否正常打开百度新闻页,并区分成功路径与失败路径:例如新闻页应被记录为正常打开、无明显布局异常或 console 错误;同时如果搜索主流程被安全验证打断,也应在总结中列为 blocker,而不是把所有路径都误判为失败。", "difficulty": "hard", "category": "navigation-health", "source": "golden"} diff --git a/datasets/skills/dogfood/baidu-homepage/source_report.md b/datasets/skills/dogfood/baidu-homepage/source_report.md new file mode 100644 index 0000000..8dea480 --- /dev/null +++ b/datasets/skills/dogfood/baidu-homepage/source_report.md @@ -0,0 +1,165 @@ +# Dogfood QA Report + +**Target:** https://www.baidu.com/ +**Date:** 2026-04-15 +**Scope:** 百度首页桌面站小样本探索式测试:首页加载、顶部导航、搜索输入与提交主流程、文心助手入口、百度新闻入口。 +**Tester:** Hermes Agent (automated exploratory QA) + +--- + +## Executive Summary + +| Severity | Count | +|----------|-------| +| 🔴 Critical | 0 | +| 🟠 High | 1 | +| 🟡 Medium | 2 | +| 🔵 Low | 0 | +| **Total** | **3** | + +**Overall Assessment:** 百度首页与主要入口整体可用,但搜索主流程触发安全验证拦截,且搜索联想与返回链路存在可用性问题,影响无登录/自动化场景下的连续使用体验。 + +--- + +## Issues + +### Issue #1: 首页搜索主流程被安全验证拦截,无法直接进入结果页 + +| Field | Value | +|-------|-------| +| **Severity** | High | +| **Category** | Functional | +| **URL** | https://www.baidu.com/ | + +**Description:** +在首页输入测试关键词并提交后,未直接进入正常搜索结果页,而是被“百度安全验证”拦截。页面要求用户完成“拖动左侧滑块使图片为正”的图片旋转验证,导致标准搜索主流程中断。对于自动化代理、辅助技术用户或希望快速搜索的用户来说,这属于明显阻断。 + +**Steps to Reproduce:** +1. 打开 https://www.baidu.com/ +2. 在首页搜索框输入“Hermes Agent dogfood 测试” +3. 按 Enter 提交搜索 + +**Expected Behavior:** +直接进入对应关键词的搜索结果页,用户可以继续浏览结果。 + +**Actual Behavior:** +页面跳转到“百度安全验证”,要求完成滑块旋转图片验证后才能继续,正常搜索结果未展示。 + +**Screenshot:** +Original screenshot captured during the source run (binary not committed in this repo). + +**Console Errors** (if applicable): +```text +None observed. 
+``` + +--- + +### Issue #2: 搜索联想词与已输入查询明显不相关 + +| Field | Value | +|-------|-------| +| **Severity** | Medium | +| **Category** | UX | +| **URL** | https://www.baidu.com/ | + +**Description:** +在首页搜索框输入“Hermes Agent dogfood 测试”后,下拉联想建议并未围绕完整查询或“dogfood 测试”意图展开,而是出现大量泛化的英文品牌/词条,如 “hermes tracking”、“hermes track”、“hermes trismegistus”等。这种联想结果与当前查询意图偏差较大,容易误导用户点击到无关搜索方向。 + +**Steps to Reproduce:** +1. 打开 https://www.baidu.com/ +2. 在首页搜索框输入“Hermes Agent dogfood 测试” +3. 观察联想词下拉列表 + +**Expected Behavior:** +联想词应尽量贴近当前完整查询,或至少与“Agent / dogfood / 测试”意图相关。 + +**Actual Behavior:** +联想词主要围绕泛化的“Hermes”品牌/英文词条展开,与完整查询相关性较弱。 + +**Screenshot:** +Original screenshot captured during the source run (binary not committed in this repo). + +**Console Errors** (if applicable): +```text +None observed. +``` + +--- + +### Issue #3: 从文心助手页使用返回操作未能回到百度首页,历史链路表现不稳定 + +| Field | Value | +|-------|-------| +| **Severity** | Medium | +| **Category** | Functional | +| **URL** | https://chat.baidu.com/?enter_type=home_operate | + +**Description:** +从百度首页点击“复杂问题就找文心助手”进入文心助手页后,使用浏览器后退操作时,并未顺利回到百度首页,而是停留在文心相关页面。对用户来说,这会造成页面链路理解困难;对自动化工作流来说,也会增加 flow 恢复成本。 + +**Steps to Reproduce:** +1. 打开 https://www.baidu.com/ +2. 点击“复杂问题就找文心助手,深入思考回答更优”入口 +3. 在文心助手页面执行浏览器后退 + +**Expected Behavior:** +后退应返回原始百度首页。 + +**Actual Behavior:** +后退后仍停留在文心相关页面,未恢复到首页,需要重新导航到百度首页。 + +**Screenshot:** +Original screenshot captured during the source run (binary not committed in this repo). + +**Console Errors** (if applicable): +```text +None observed. +``` + +--- + +## Issues Summary Table + +| # | Title | Severity | Category | URL | +|---|-------|----------|----------|-----| +| 1 | 首页搜索主流程被安全验证拦截,无法直接进入结果页 | High | Functional | https://www.baidu.com/ | +| 2 | 搜索联想词与已输入查询明显不相关 | Medium | UX | https://www.baidu.com/ | +| 3 | 从文心助手页使用返回操作未能回到百度首页,历史链路表现不稳定 | Medium | Functional | https://chat.baidu.com/?enter_type=home_operate | + +## Testing Coverage + +### Pages Tested +- 百度首页(https://www.baidu.com/) +- 百度安全验证页(搜索后触发) +- 文心助手入口页 / 对话页(https://chat.baidu.com/) +- 百度新闻页(http://news.baidu.com) + +### Features Tested +- 首页加载与视觉检查 +- 浏览器 console 基础检查 +- 首页搜索输入与提交 +- 搜索联想词观察 +- 文心助手入口跳转 +- 文心助手单轮提问与回答返回 +- 顶部“新闻”导航入口跳转 + +### Not Tested / Out of Scope +- 登录流程 +- 图片、视频、地图、贴吧、网盘、文库等其余顶部入口的深入测试 +- 首页“设置”菜单展开行为 +- 热搜条目逐条点击验证 +- 移动端布局与响应式行为 +- 安全验证滑块的人工完成与验证后结果页质量 + +### Blockers +- 搜索主流程被百度安全验证拦截,无法在当前会话中继续检查正常搜索结果页的相关性、结果布局与分页链路。 + +--- + +## Notes + +1. 百度首页本身在首屏加载、布局和视觉呈现上表现稳定,未见明显白屏、JS 报错或布局错位。 +2. 文心助手入口可正常打开,且无需登录即可完成单轮问答,这一入口的可用性较好。 +3. 百度新闻页正常打开,说明顶部导航至少部分入口工作正常。 +4. 
本次最主要的问题集中在“主搜索流程被风控打断”和“返回链路不稳定”,这两点对真实 end-to-end 体验影响最大。 diff --git a/datasets/skills/dogfood/baidu-homepage/train.jsonl b/datasets/skills/dogfood/baidu-homepage/train.jsonl new file mode 100644 index 0000000..14a6996 --- /dev/null +++ b/datasets/skills/dogfood/baidu-homepage/train.jsonl @@ -0,0 +1,2 @@ +{"task_input": "用 dogfood 测这个网站:https://www.baidu.com/,重点看首页搜索输入和提交主流程。", "expected_behavior": "应导航到百度首页,检查 console 与首屏状态,在搜索框输入明确测试词并提交;若出现百度安全验证,应将其识别为高严重级别的 Functional blocker,记录验证文案、触发步骤、结果页未展示这一事实,并附截图证据。", "difficulty": "medium", "category": "search-flow", "source": "golden"} +{"task_input": "用 dogfood 测这个网站:https://www.baidu.com/,重点看首页搜索联想词是否贴合查询意图。", "expected_behavior": "应在首页搜索框输入具有明确意图的测试查询,观察下拉联想词,并判断其是否与完整查询相关;若联想词大量偏向泛化品牌词而非当前测试意图,应记录为中等级别 UX 问题,说明误导风险并保存截图。", "difficulty": "medium", "category": "search-suggestions", "source": "golden"} diff --git a/datasets/skills/dogfood/baidu-homepage/val.jsonl b/datasets/skills/dogfood/baidu-homepage/val.jsonl new file mode 100644 index 0000000..3abeeec --- /dev/null +++ b/datasets/skills/dogfood/baidu-homepage/val.jsonl @@ -0,0 +1 @@ +{"task_input": "用 dogfood 测这个网站:https://www.baidu.com/,重点看“复杂问题就找文心助手”入口,以及从该页返回首页的链路。", "expected_behavior": "应点击文心助手入口,验证页面是否正常打开、是否能在未登录状态下完成至少一轮问答,再执行返回操作;若返回未恢复到原始百度首页,应记录为中等级别 Functional 问题,同时注明文心页面本身可用、无明显 console 错误或登录阻断。", "difficulty": "hard", "category": "subpage-flow", "source": "golden"} diff --git a/evolution/core/__init__.py b/evolution/core/__init__.py index 0e8396a..f941dca 100644 --- a/evolution/core/__init__.py +++ b/evolution/core/__init__.py @@ -1,3 +1,25 @@ """Core infrastructure shared across all evolution phases.""" from evolution.core.config import EvolutionConfig, get_hermes_agent_path +from evolution.core.benchmark_gate import TBLiteGateResult, run_tblite_benchmark_gate +from evolution.core.git_pr_automation import ( + build_evolution_branch_name, + build_git_apply_plan, + build_target_skill_path, + write_git_apply_plan_artifacts, + write_git_pr_automation_artifacts, + write_skill_patch_artifacts, +) +from evolution.core.report_artifact import ( + build_diff_summary, + build_evolution_report, + build_github_pr_body, + build_github_pr_title, + build_gh_pr_create_command, + build_pr_draft, + build_review_checklist, + summarize_recommendation, + write_github_pr_artifacts, + write_pr_ready_artifacts, + write_report_artifacts, +) diff --git a/evolution/core/benchmark_gate.py b/evolution/core/benchmark_gate.py new file mode 100644 index 0000000..40d3a05 --- /dev/null +++ b/evolution/core/benchmark_gate.py @@ -0,0 +1,232 @@ +"""Helpers for running benchmark regression gates against a local Hermes checkout.""" + +from __future__ import annotations + +import re +import shutil +import subprocess +import tempfile +from dataclasses import dataclass +from pathlib import Path + +import yaml + +from evolution.core.hermes_eval import build_skill_system_prompt + + +DEFAULT_TBLITE_CONFIG = Path("environments/benchmarks/tblite/local.yaml") +DEFAULT_TBLITE_SCRIPT = Path("environments/benchmarks/tblite/tblite_env.py") +DEFAULT_FAST_TBLITE_TASK_FILTER = "broken-python,pandas-etl" + + +@dataclass +class TBLiteGatePlan: + mode: str + base_config_path: Path + task_filter: str | None + benchmark_script_path: Path + + +@dataclass +class TBLiteGateResult: + passed: bool + baseline_pass_rate: float + evolved_pass_rate: float + delta: float + threshold: float + summary: str + baseline_stdout: str + evolved_stdout: str + mode: str + task_filter: str | None + base_config_path: Path + + +def 
parse_tblite_pass_rate(stdout: str) -> float: + """Extract the overall pass rate from a TBLite/TB2 evaluation stdout string.""" + match = re.search(r"Overall Pass Rate:\s*([0-9]*\.?[0-9]+)", stdout) + if not match: + raise ValueError("Could not find 'Overall Pass Rate' in benchmark output") + return float(match.group(1)) + + +def resolve_tblite_gate_plan( + *, + mode: str, + hermes_repo: str | Path, + task_filter: str | None = None, + base_config_path: str | Path | None = None, + benchmark_script_path: str | Path | None = None, +) -> TBLiteGatePlan: + """Resolve the concrete config/script/filter used for a TBLite gate run.""" + hermes_repo_path = Path(hermes_repo).expanduser().resolve() + + normalized_mode = mode.lower().strip() + if normalized_mode not in {"fast", "full"}: + raise ValueError(f"Unsupported TBLite gate mode: {mode}") + + default_config = ( + hermes_repo_path / DEFAULT_TBLITE_CONFIG + if normalized_mode == "fast" + else hermes_repo_path / Path("environments/benchmarks/tblite/default.yaml") + ) + default_filter = DEFAULT_FAST_TBLITE_TASK_FILTER if normalized_mode == "fast" else None + + if base_config_path is None: + resolved_config = default_config + else: + supplied = Path(base_config_path) + resolved_config = supplied if supplied.is_absolute() else hermes_repo_path / supplied + + if benchmark_script_path is None: + resolved_script = hermes_repo_path / DEFAULT_TBLITE_SCRIPT + else: + supplied_script = Path(benchmark_script_path) + resolved_script = supplied_script if supplied_script.is_absolute() else hermes_repo_path / supplied_script + + return TBLiteGatePlan( + mode=normalized_mode, + base_config_path=resolved_config, + task_filter=task_filter if task_filter is not None else default_filter, + benchmark_script_path=resolved_script, + ) + + +def _write_benchmark_config( + *, + base_config_path: Path, + system_prompt: str, + output_dir: Path, + label: str, +) -> Path: + base_config = yaml.safe_load(base_config_path.read_text()) or {} + env_cfg = dict(base_config.get("env") or {}) + env_cfg["system_prompt"] = system_prompt + env_cfg["use_wandb"] = False + env_cfg["data_dir_to_save_evals"] = str(output_dir / f"tblite-{label}") + base_config["env"] = env_cfg + + config_path = output_dir / f"tblite_{label}.yaml" + config_path.write_text(yaml.safe_dump(base_config, sort_keys=False)) + return config_path + + +def _run_tblite_eval( + *, + hermes_repo: Path, + benchmark_script_path: Path, + config_path: Path, + task_filter: str | None, +) -> str: + command = [ + shutil.which("python") or "python", + str(benchmark_script_path), + "evaluate", + "--config", + str(config_path), + ] + if task_filter: + command.extend(["--env.task_filter", task_filter]) + + result = subprocess.run( + command, + capture_output=True, + text=True, + timeout=3600, + cwd=str(hermes_repo), + ) + if result.returncode != 0: + raise RuntimeError( + "TBLite evaluation failed with exit code " + f"{result.returncode}: {result.stderr.strip() or result.stdout.strip()}" + ) + return result.stdout + + +def run_tblite_benchmark_gate( + *, + skill_name: str, + baseline_skill_body: str, + evolved_skill_body: str, + hermes_repo: str | Path, + regression_threshold: float = 0.02, + task_filter: str | None = None, + base_config_path: str | Path | None = None, + benchmark_script_path: str | Path | None = None, + mode: str = "fast", +) -> TBLiteGateResult: + """Run a TBLite regression gate against baseline and evolved skill prompts.""" + hermes_repo_path = Path(hermes_repo).expanduser().resolve() + plan = 
resolve_tblite_gate_plan( + mode=mode, + hermes_repo=hermes_repo_path, + task_filter=task_filter, + base_config_path=base_config_path, + benchmark_script_path=benchmark_script_path, + ) + + baseline_prompt, _ = build_skill_system_prompt( + skill_name, + hermes_repo=hermes_repo_path, + skill_body_override=baseline_skill_body, + ) + evolved_prompt, _ = build_skill_system_prompt( + skill_name, + hermes_repo=hermes_repo_path, + skill_body_override=evolved_skill_body, + ) + + with tempfile.TemporaryDirectory(prefix="tblite-gate-") as tmpdir: + output_dir = Path(tmpdir) + baseline_config = _write_benchmark_config( + base_config_path=plan.base_config_path, + system_prompt=baseline_prompt, + output_dir=output_dir, + label="baseline", + ) + evolved_config = _write_benchmark_config( + base_config_path=plan.base_config_path, + system_prompt=evolved_prompt, + output_dir=output_dir, + label="evolved", + ) + + baseline_stdout = _run_tblite_eval( + hermes_repo=hermes_repo_path, + benchmark_script_path=plan.benchmark_script_path, + config_path=baseline_config, + task_filter=plan.task_filter, + ) + evolved_stdout = _run_tblite_eval( + hermes_repo=hermes_repo_path, + benchmark_script_path=plan.benchmark_script_path, + config_path=evolved_config, + task_filter=plan.task_filter, + ) + + baseline_pass_rate = parse_tblite_pass_rate(baseline_stdout) + evolved_pass_rate = parse_tblite_pass_rate(evolved_stdout) + delta = evolved_pass_rate - baseline_pass_rate + passed = delta >= -regression_threshold + + summary = ( + f"TBLite {plan.mode} gate passed: baseline={baseline_pass_rate:.4f}, evolved={evolved_pass_rate:.4f}, " + f"delta={delta:+.4f}, threshold=-{regression_threshold:.4f}" + if passed + else f"TBLite {plan.mode} regression detected: baseline={baseline_pass_rate:.4f}, evolved={evolved_pass_rate:.4f}, " + f"delta={delta:+.4f}, threshold=-{regression_threshold:.4f}" + ) + + return TBLiteGateResult( + passed=passed, + baseline_pass_rate=baseline_pass_rate, + evolved_pass_rate=evolved_pass_rate, + delta=delta, + threshold=regression_threshold, + summary=summary, + baseline_stdout=baseline_stdout, + evolved_stdout=evolved_stdout, + mode=plan.mode, + task_filter=plan.task_filter, + base_config_path=plan.base_config_path, + ) diff --git a/evolution/core/git_pr_automation.py b/evolution/core/git_pr_automation.py new file mode 100644 index 0000000..df98c06 --- /dev/null +++ b/evolution/core/git_pr_automation.py @@ -0,0 +1,375 @@ +"""Helpers for turning evolved skill artifacts into git/GitHub automation plans.""" + +from __future__ import annotations + +import json +import re +import shlex +import subprocess +from pathlib import Path +from typing import Any, Callable + +from evolution.core.report_artifact import build_github_pr_title, summarize_recommendation + +TerminalRunner = Callable[[str], dict[str, Any]] + + +def _slugify_skill_name(skill_name: str) -> str: + slug = re.sub(r"[^a-zA-Z0-9]+", "-", skill_name).strip("-").lower() + return slug or "skill" + + +def _skills_root(hermes_repo: str | Path) -> Path: + repo = Path(hermes_repo).expanduser().resolve() + return (repo / "skills").resolve(strict=False) + + +def _ensure_within_skills_root(*, hermes_repo: str | Path, target_path: str | Path) -> Path: + repo = Path(hermes_repo).expanduser().resolve() + skills_root = _skills_root(repo) + target = Path(target_path).expanduser() + if not target.is_absolute(): + target = repo / target + target = target.resolve(strict=False) + try: + target.relative_to(skills_root) + except ValueError as exc: + raise 
ValueError(f"target skill path must stay within {skills_root}") from exc + return target + + +def _default_runner(command: str, *, workdir: str | Path | None = None) -> dict[str, Any]: + completed = subprocess.run( + command, + shell=True, + cwd=str(workdir) if workdir is not None else None, + capture_output=True, + text=True, + ) + return { + "exit_code": completed.returncode, + "output": (completed.stdout or "") + (completed.stderr or ""), + } + + +def _run_step( + *, + name: str, + command: str, + workdir: str | Path, + runner: Callable[..., dict[str, Any]], +) -> dict[str, Any]: + result = runner(command, workdir=workdir) + exit_code = result.get("exit_code", 1) + output = result.get("output", "") + if exit_code != 0: + raise RuntimeError(f"{name} failed ({exit_code}): {output}") + return { + "name": name, + "command": command, + "exit_code": exit_code, + "output": output, + } + + +def build_evolution_branch_name(*, skill_name: str, timestamp: str) -> str: + """Build a deterministic branch name for an evolution candidate.""" + return f"evolution/{_slugify_skill_name(skill_name)}-{timestamp}" + + +def build_target_skill_path(*, hermes_repo: str | Path, skill_relpath: str) -> Path: + """Resolve the target skill path inside a hermes-agent checkout.""" + relative_path = Path(skill_relpath) + if relative_path.is_absolute(): + raise ValueError("skill_relpath must be relative to the skills directory") + return _ensure_within_skills_root( + hermes_repo=hermes_repo, + target_path=Path("skills") / relative_path, + ) + + +def write_skill_patch_artifacts( + *, + output_dir: str | Path, + skill_relpath: str, + evolved_skill_text: str, + hermes_repo: str | Path, +) -> dict[str, Path]: + """Write a candidate skill file plus a manifest describing where it should land.""" + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + candidate_skill_file = out_dir / "candidate_skill_patch.md" + candidate_skill_file.write_text(evolved_skill_text) + + target_skill_path = build_target_skill_path(hermes_repo=hermes_repo, skill_relpath=skill_relpath) + git_patch_manifest_json = out_dir / "git_patch_manifest.json" + git_patch_manifest_json.write_text(json.dumps({ + "skill_relpath": skill_relpath, + "target_skill_path": str(target_skill_path), + "candidate_skill_file": str(candidate_skill_file), + }, indent=2)) + + return { + "candidate_skill_file": candidate_skill_file, + "git_patch_manifest_json": git_patch_manifest_json, + "target_skill_path": target_skill_path, + } + + +def build_git_apply_plan( + *, + hermes_repo: str | Path, + branch_name: str, + target_skill_path: str | Path, + candidate_skill_file: str | Path, + commit_message: str, + push_remote: str = "origin", +) -> str: + """Build a shell plan that applies the evolved skill in a hermes repo.""" + repo = Path(hermes_repo).expanduser().resolve() + target = _ensure_within_skills_root(hermes_repo=repo, target_path=target_skill_path) + candidate = Path(candidate_skill_file).expanduser().resolve(strict=False) + + return "\n".join([ + "#!/usr/bin/env bash", + "set -euo pipefail", + f"cd {shlex.quote(str(repo))}", + f"git checkout -b {shlex.quote(branch_name)}", + f"mkdir -p {shlex.quote(str(target.parent))}", + f"cp {shlex.quote(str(candidate))} {shlex.quote(str(target))}", + f"git add {shlex.quote(str(target))}", + f"git commit -m {shlex.quote(commit_message)}", + f"git push -u {shlex.quote(push_remote)} {shlex.quote(branch_name)}", + "", + ]) + + +def write_git_apply_plan_artifacts( + *, + output_dir: str | Path, + hermes_repo: str | 
Path, + branch_name: str, + target_skill_path: str | Path, + candidate_skill_file: str | Path, + commit_message: str, + push_remote: str = "origin", +) -> dict[str, Path]: + """Write shell and markdown plans for applying the evolved skill via git.""" + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + git_apply_plan_sh = out_dir / "git_apply_plan.sh" + git_apply_plan_md = out_dir / "git_apply_plan.md" + + shell_plan = build_git_apply_plan( + hermes_repo=hermes_repo, + branch_name=branch_name, + target_skill_path=target_skill_path, + candidate_skill_file=candidate_skill_file, + commit_message=commit_message, + push_remote=push_remote, + ) + git_apply_plan_sh.write_text(shell_plan) + git_apply_plan_md.write_text( + "\n".join([ + "# Git Apply Plan", + "", + f"- Hermes repo: `{hermes_repo}`", + f"- Branch: `{branch_name}`", + f"- Target skill path: `{target_skill_path}`", + f"- Candidate file: `{candidate_skill_file}`", + f"- Commit message: `{commit_message}`", + "", + "## Shell Plan", + "```bash", + shell_plan.rstrip(), + "```", + "", + ]) + ) + + return { + "git_apply_plan_sh": git_apply_plan_sh, + "git_apply_plan_md": git_apply_plan_md, + } + + +def write_git_pr_automation_artifacts( + *, + output_dir: str | Path, + metrics: dict[str, Any], + hermes_repo: str | Path, + skill_relpath: str, + evolved_skill_text: str, + github_pr_body_path: str | Path, + push_remote: str = "origin", +) -> dict[str, Path | None]: + """Write the full git/GitHub handoff pack for applying an evolved skill.""" + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + patch_paths = write_skill_patch_artifacts( + output_dir=out_dir, + skill_relpath=skill_relpath, + evolved_skill_text=evolved_skill_text, + hermes_repo=hermes_repo, + ) + + branch_name = build_evolution_branch_name( + skill_name=metrics.get("skill_name", "skill"), + timestamp=metrics.get("timestamp", "unknown"), + ) + commit_message = build_github_pr_title(metrics) + plan_paths = write_git_apply_plan_artifacts( + output_dir=out_dir, + hermes_repo=hermes_repo, + branch_name=branch_name, + target_skill_path=patch_paths["target_skill_path"], + candidate_skill_file=patch_paths["candidate_skill_file"], + commit_message=commit_message, + push_remote=push_remote, + ) + + decision = summarize_recommendation(metrics)["decision"] + gh_pr_create_after_push_txt = out_dir / "gh_pr_create_after_push.txt" + if decision == "reject": + command_path: Path | None = None + if gh_pr_create_after_push_txt.exists(): + gh_pr_create_after_push_txt.unlink() + else: + draft_flag = " --draft" if decision == "review_needed" else "" + command = ( + f"cd {shlex.quote(str(hermes_repo))} && " + f"gh pr create --base main --head {shlex.quote(branch_name)} " + f"--title {shlex.quote(commit_message)} " + f"--body-file {shlex.quote(str(github_pr_body_path))}{draft_flag}" + ) + gh_pr_create_after_push_txt.write_text(command + "\n") + command_path = gh_pr_create_after_push_txt + + return { + "candidate_skill_file": patch_paths["candidate_skill_file"], + "git_patch_manifest_json": patch_paths["git_patch_manifest_json"], + "target_skill_path": patch_paths["target_skill_path"], + "git_apply_plan_sh": plan_paths["git_apply_plan_sh"], + "git_apply_plan_md": plan_paths["git_apply_plan_md"], + "gh_pr_create_after_push_txt": command_path, + } + + +def execute_git_apply_plan( + *, + hermes_repo: str | Path, + branch_name: str, + target_skill_path: str | Path, + candidate_skill_file: str | Path, + commit_message: str, + push_remote: str = "origin", + 
run_push: bool = True, + runner: Callable[..., dict[str, Any]] | None = None, +) -> dict[str, Any]: + """Execute the git apply plan step by step with an injectable runner.""" + run = runner or _default_runner + repo = Path(hermes_repo).expanduser().resolve() + target = _ensure_within_skills_root(hermes_repo=repo, target_path=target_skill_path) + candidate = Path(candidate_skill_file).expanduser().resolve(strict=False) + + steps = [ + _run_step( + name="checkout_branch", + command=f"git checkout -b {shlex.quote(branch_name)}", + workdir=repo, + runner=run, + ), + _run_step( + name="ensure_target_dir", + command=f"mkdir -p {shlex.quote(str(target.parent))}", + workdir=repo, + runner=run, + ), + _run_step( + name="copy_candidate_skill", + command=f"cp {shlex.quote(str(candidate))} {shlex.quote(str(target))}", + workdir=repo, + runner=run, + ), + _run_step( + name="git_add", + command=f"git add {shlex.quote(str(target))}", + workdir=repo, + runner=run, + ), + _run_step( + name="git_commit", + command=f"git commit -m {shlex.quote(commit_message)}", + workdir=repo, + runner=run, + ), + ] + if run_push: + steps.append( + _run_step( + name="push_branch", + command=f"git push -u {shlex.quote(push_remote)} {shlex.quote(branch_name)}", + workdir=repo, + runner=run, + ) + ) + + return { + "executed": True, + "steps": steps, + } + + +def execute_gh_pr_create( + *, + command: str | None, + hermes_repo: str | Path, + runner: Callable[..., dict[str, Any]] | None = None, +) -> dict[str, Any]: + """Execute a gh pr create command when present.""" + if not command: + return {"executed": False, "reason": "no_command"} + + run = runner or _default_runner + result = run(command, workdir=hermes_repo) + exit_code = result.get("exit_code", 1) + output = result.get("output", "") + if exit_code != 0: + raise RuntimeError(f"gh_pr_create failed ({exit_code}): {output}") + return { + "executed": True, + "command": command, + "exit_code": exit_code, + "output": output, + } + + +def execute_git_pr_automation( + *, + git_apply_plan: dict[str, Any], + gh_pr_create_command: str | None, + execute_push: bool, + execute_pr: bool, + execute_git_apply_plan_fn=execute_git_apply_plan, + execute_gh_pr_create_fn=execute_gh_pr_create, +) -> dict[str, Any]: + """Execute the real git apply flow, optionally followed by PR creation.""" + git_result = execute_git_apply_plan_fn( + **git_apply_plan, + run_push=execute_push, + ) + + pr_result = None + if execute_pr and gh_pr_create_command: + pr_result = execute_gh_pr_create_fn( + command=gh_pr_create_command, + hermes_repo=git_apply_plan["hermes_repo"], + ) + + return { + "git": git_result, + "pr": pr_result, + } diff --git a/evolution/core/hermes_eval.py b/evolution/core/hermes_eval.py new file mode 100644 index 0000000..007bc4e --- /dev/null +++ b/evolution/core/hermes_eval.py @@ -0,0 +1,179 @@ +"""Helpers for evaluating skills with a real Hermes Agent instance. + +This bridges hermes-agent-self-evolution to the actual Hermes runtime instead of +only evaluating a DSPy skill wrapper in isolation. 
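+
+A short usage sketch (assumes a valid local hermes-agent checkout is configured;
+the model name and task are illustrative only):
+
+    case = HermesSkillEvalCase(skill_name="dogfood", task_input="Check the homepage")
+    result = run_skill_eval(case, model="openai/gpt-4.1-mini")
+    print(result.final_response)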
+""" + +from __future__ import annotations + +import importlib.util +import sys +from contextlib import contextmanager +from dataclasses import dataclass +from pathlib import Path +from types import ModuleType, SimpleNamespace +from typing import Any, Optional + +from evolution.core.config import get_hermes_agent_path + + +@dataclass +class HermesSkillEvalCase: + """Single evaluation case for a skill using a real Hermes Agent.""" + + skill_name: str + task_input: str + system_prompt: str = "" + conversation_history: Optional[list[dict[str, Any]]] = None + + +@dataclass +class HermesSkillEvalResult: + """Result of a real Hermes Agent skill evaluation run.""" + + final_response: str + loaded_skills: list[str] + effective_system_prompt: str + raw_result: Any + + +def _import_module_from_path(module_name: str, path: Path) -> ModuleType: + spec = importlib.util.spec_from_file_location(module_name, path) + if spec is None or spec.loader is None: + raise ImportError(f"Could not load module {module_name} from {path}") + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + +@contextmanager +def _temporary_hermes_import_shims(repo: Path | None = None): + """Provide tiny stubs and import path support for dynamic Hermes imports.""" + installed: list[str] = [] + inserted_repo = False + + if repo is not None: + repo_str = str(repo) + if repo_str not in sys.path: + sys.path.insert(0, repo_str) + inserted_repo = True + + if "fire" not in sys.modules: + fire_stub = ModuleType("fire") + + def _fire(*args, **kwargs): + return None + + fire_stub.Fire = _fire + sys.modules["fire"] = fire_stub + installed.append("fire") + + try: + yield + finally: + for module_name in installed: + sys.modules.pop(module_name, None) + if inserted_repo: + try: + sys.path.remove(repo_str) + except ValueError: + pass + + +def _load_hermes_symbols(hermes_repo: str | Path | None = None) -> SimpleNamespace: + repo = Path(hermes_repo).expanduser() if hermes_repo else get_hermes_agent_path() + repo = repo.resolve() + + with _temporary_hermes_import_shims(repo): + run_agent_module = _import_module_from_path( + "_self_evolution_run_agent", + repo / "run_agent.py", + ) + skill_commands_module = _import_module_from_path( + "_self_evolution_skill_commands", + repo / "agent" / "skill_commands.py", + ) + + return SimpleNamespace( + AIAgent=run_agent_module.AIAgent, + build_preloaded_skills_prompt=skill_commands_module.build_preloaded_skills_prompt, + ) + + +def build_skill_system_prompt( + skill_name: str, + hermes_repo: str | Path | None = None, + skill_body_override: str | None = None, +) -> tuple[str, list[str]]: + """Load a Hermes skill the same way the CLI preloads it for a session. + + When ``skill_body_override`` is provided, inline it as the active skill body so + we can evaluate candidate variants with the real Hermes runtime before they are + written back into the target repository. + """ + if skill_body_override is not None: + prompt = ( + f'[SYSTEM: The user launched this evaluation session with the "{skill_name}" skill ' + "preloaded. 
Treat its instructions as active guidance for the duration of this " + "session unless overridden.]\n\n" + f"# Active Skill: {skill_name}\n\n{skill_body_override}" + ) + return prompt, [skill_name] + + symbols = _load_hermes_symbols(hermes_repo) + skills_prompt, loaded_skills, missing_skills = symbols.build_preloaded_skills_prompt([skill_name]) + if missing_skills: + missing_display = ", ".join(missing_skills) + raise ValueError(f"Unknown skill(s): {missing_display}") + if not skills_prompt: + raise ValueError(f"Failed to build prompt for skill: {skill_name}") + return skills_prompt, loaded_skills + + +def run_skill_eval( + case: HermesSkillEvalCase, + *, + model: str = "openai/gpt-4.1-mini", + hermes_repo: str | Path | None = None, + agent_kwargs: Optional[dict[str, Any]] = None, + skill_body_override: str | None = None, +) -> HermesSkillEvalResult: + """Run one evaluation case through a real Hermes Agent instance.""" + symbols = _load_hermes_symbols(hermes_repo) + skills_prompt, loaded_skills = build_skill_system_prompt( + case.skill_name, + hermes_repo=hermes_repo, + skill_body_override=skill_body_override, + ) + effective_system_prompt = "\n\n".join( + part for part in (case.system_prompt, skills_prompt) if part + ).strip() + + merged_agent_kwargs = { + "model": model, + "quiet_mode": True, + "skip_context_files": True, + "skip_memory": True, + "ephemeral_system_prompt": effective_system_prompt or None, + } + if agent_kwargs: + merged_agent_kwargs.update(agent_kwargs) + + agent = symbols.AIAgent(**merged_agent_kwargs) + raw_result = agent.run_conversation( + user_message=case.task_input, + conversation_history=case.conversation_history, + ) + + if isinstance(raw_result, dict): + final_response = str(raw_result.get("final_response", "")) + else: + final_response = str(raw_result) + + return HermesSkillEvalResult( + final_response=final_response, + loaded_skills=loaded_skills, + effective_system_prompt=effective_system_prompt, + raw_result=raw_result, + ) diff --git a/evolution/core/report_artifact.py b/evolution/core/report_artifact.py new file mode 100644 index 0000000..444852b --- /dev/null +++ b/evolution/core/report_artifact.py @@ -0,0 +1,352 @@ +"""Helpers for generating structured evolution report artifacts.""" + +from __future__ import annotations + +import json +import shlex +from pathlib import Path +from typing import Any + + +def summarize_recommendation(metrics: dict[str, Any]) -> dict[str, Any]: + """Derive a human-review recommendation from holdout + gate metrics.""" + reasons: list[str] = [] + tblite_gate = metrics.get("tblite_gate") + + if tblite_gate and not tblite_gate.get("passed", False): + decision = "reject" + reasons.append("benchmark_gate_failed") + elif metrics.get("improvement", 0.0) > 0: + decision = "candidate_for_review" + reasons.append("holdout_improved") + if tblite_gate and tblite_gate.get("passed"): + reasons.append("benchmark_gate_passed") + else: + decision = "review_needed" + reasons.append("no_holdout_improvement") + if tblite_gate and tblite_gate.get("passed"): + reasons.append("benchmark_gate_passed") + + return { + "skill_name": metrics.get("skill_name"), + "decision": decision, + "reasons": reasons, + "improvement": metrics.get("improvement"), + "tblite_gate": tblite_gate, + } + + +def build_evolution_report( + *, + metrics: dict[str, Any], + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, +) -> str: + """Render a markdown report for one evolution run.""" + summary = summarize_recommendation(metrics) + tblite_gate = 
metrics.get("tblite_gate") + + lines = [ + "# Evolution Report", + "", + f"- Skill: `{metrics.get('skill_name')}`", + f"- Timestamp: `{metrics.get('timestamp')}`", + f"- Decision: `{summary['decision']}`", + f"- Reasons: `{', '.join(summary['reasons']) or 'none'}`", + "", + "## Holdout Results", + "", + f"- Eval backend: `{metrics.get('eval_backend')}`", + f"- Baseline score: `{metrics.get('baseline_score'):.3f}`", + f"- Evolved score: `{metrics.get('evolved_score'):.3f}`", + f"- Improvement: `{metrics.get('improvement'):+.3f}`", + "", + "## Benchmark Gate", + "", + ] + + if tblite_gate: + lines.extend([ + f"- Summary: {tblite_gate.get('summary')}", + f"- Mode: `{tblite_gate.get('mode')}`", + f"- Task filter: `{tblite_gate.get('task_filter')}`", + f"- Baseline pass rate: `{tblite_gate.get('baseline_pass_rate'):.3f}`", + f"- Evolved pass rate: `{tblite_gate.get('evolved_pass_rate'):.3f}`", + f"- Delta: `{tblite_gate.get('delta'):+.3f}`", + ]) + else: + lines.append("- Not run") + + lines.extend([ + "", + "## Artifacts", + "", + f"- Baseline skill: `{baseline_skill_path}`", + f"- Evolved skill: `{evolved_skill_path}`", + ]) + return "\n".join(lines) + "\n" + + +def build_diff_summary( + *, + metrics: dict[str, Any], + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, +) -> str: + """Render a concise markdown diff summary for reviewers.""" + size_delta = metrics.get("evolved_size", 0) - metrics.get("baseline_size", 0) + return "\n".join([ + "# Diff Summary", + "", + f"- Skill: `{metrics.get('skill_name')}`", + f"- Holdout improvement: `{metrics.get('improvement', 0.0):+.3f}`", + f"- Skill size delta: `{size_delta:+d}` chars", + f"- Baseline artifact: `{baseline_skill_path}`", + f"- Evolved artifact: `{evolved_skill_path}`", + "", + "## Reviewer Focus", + "- Confirm the evolved skill improves behavior without overfitting.", + "- Inspect prompt growth and instruction clarity.", + ]) + "\n" + + +def build_review_checklist(metrics: dict[str, Any]) -> str: + """Render a reviewer checklist for human approval.""" + summary = summarize_recommendation(metrics) + tblite_gate = metrics.get("tblite_gate") + + lines = [ + "# Review Checklist", + "", + f"- [ ] Decision `{summary['decision']}` matches the evidence", + f"- [ ] Holdout improvement looks meaningful (`{metrics.get('improvement', 0.0):+.3f}`)", + "- [ ] Evolved skill wording is clearer or more robust than baseline", + ] + if tblite_gate: + lines.append(f"- [ ] TBLite gate result reviewed (`{tblite_gate.get('summary')}`)") + else: + lines.append("- [ ] Confirm whether a benchmark gate is still needed") + lines.append("- [ ] Approve only after inspecting the actual skill diff") + return "\n".join(lines) + "\n" + + +def build_pr_draft( + *, + metrics: dict[str, Any], + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, +) -> str: + """Render a PR-ready markdown draft for candidate variants.""" + summary = summarize_recommendation(metrics) + tblite_gate = metrics.get("tblite_gate") + gate_line = tblite_gate.get("summary") if tblite_gate else "Not run" + + return "\n".join([ + "# PR Draft", + "", + f"Decision: `{summary['decision']}`", + "", + "## Summary", + f"- Candidate skill: `{metrics.get('skill_name')}`", + f"- Holdout improvement: `{metrics.get('improvement', 0.0):+.3f}`", + f"- Benchmark gate: {gate_line}", + "", + "## Artifacts", + f"- Baseline skill: `{baseline_skill_path}`", + f"- Evolved skill: `{evolved_skill_path}`", + "", + "## Test Plan", + "- [x] Focused self-evolution unit tests", + "- [ ] Human 
review of the skill diff", + "- [ ] Optional broader benchmark rerun before merge", + ]) + "\n" + + +def build_github_pr_title(metrics: dict[str, Any]) -> str: + """Build a concise GitHub PR title for an evolved skill candidate.""" + decision = summarize_recommendation(metrics)["decision"] + suffix = "candidate" if decision == "candidate_for_review" else "review" + return f"feat: evolve skill {metrics.get('skill_name')} ({suffix})" + + +def build_github_pr_body( + *, + metrics: dict[str, Any], + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, + report_path: str | Path, + summary_path: str | Path, + diff_summary_path: str | Path, + review_checklist_path: str | Path, +) -> str: + """Render a GitHub-ready PR body from evolution artifacts.""" + summary = summarize_recommendation(metrics) + tblite_gate = metrics.get("tblite_gate") + gate_line = tblite_gate.get("summary") if tblite_gate else "Not run" + + return "\n".join([ + "## Summary", + f"- Evolved skill: `{metrics.get('skill_name')}`", + f"- Decision: `{summary['decision']}`", + f"- Reasons: `{', '.join(summary['reasons']) or 'none'}`", + f"- Holdout improvement: `{metrics.get('improvement', 0.0):+.3f}`", + "", + "## Evaluation Evidence", + f"- Holdout backend: `{metrics.get('eval_backend')}`", + f"- Baseline score: `{metrics.get('baseline_score', 0.0):.3f}`", + f"- Evolved score: `{metrics.get('evolved_score', 0.0):.3f}`", + f"- Benchmark gate: {gate_line}", + "", + "## Artifacts", + f"- Baseline skill: `{baseline_skill_path}`", + f"- Evolved skill: `{evolved_skill_path}`", + f"- Evolution report: `{report_path}`", + f"- Summary JSON: `{summary_path}`", + f"- Diff summary: `{diff_summary_path}`", + f"- Review checklist: `{review_checklist_path}`", + "", + "## Test Plan", + "- [x] Focused self-evolution unit tests", + "- [ ] Review generated artifact set", + "- [ ] Inspect evolved skill diff in context", + "- [ ] Re-run broader benchmark gate if needed before merge", + ]) + "\n" + + +def build_gh_pr_create_command( + *, + metrics: dict[str, Any], + body_path: str | Path, + base_branch: str = "main", +) -> str | None: + """Build a gh CLI command suggestion when the run is PR-worthy.""" + decision = summarize_recommendation(metrics)["decision"] + if decision == "reject": + return None + + title = build_github_pr_title(metrics) + draft_flag = " --draft" if decision == "review_needed" else "" + return ( + "gh pr create" + f" --base {shlex.quote(base_branch)}" + f" --title {shlex.quote(title)}" + f" --body-file {shlex.quote(str(body_path))}" + f"{draft_flag}" + ) + + +def write_report_artifacts( + *, + output_dir: str | Path, + metrics: dict[str, Any], + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, +) -> dict[str, Path]: + """Write markdown + machine-readable summary artifacts.""" + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + summary = summarize_recommendation(metrics) + report_md = out_dir / "report.md" + summary_json = out_dir / "summary.json" + + report_md.write_text( + build_evolution_report( + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + ) + ) + summary_json.write_text(json.dumps(summary, indent=2)) + + return { + "report_md": report_md, + "summary_json": summary_json, + } + + +def write_pr_ready_artifacts( + *, + output_dir: str | Path, + metrics: dict[str, Any], + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, +) -> dict[str, Path]: + """Write PR-ready markdown artifacts for review and 
handoff.""" + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + pr_draft_md = out_dir / "pr_draft.md" + review_checklist_md = out_dir / "review_checklist.md" + diff_summary_md = out_dir / "diff_summary.md" + + pr_draft_md.write_text( + build_pr_draft( + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + ) + ) + review_checklist_md.write_text(build_review_checklist(metrics)) + diff_summary_md.write_text( + build_diff_summary( + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + ) + ) + + return { + "pr_draft_md": pr_draft_md, + "review_checklist_md": review_checklist_md, + "diff_summary_md": diff_summary_md, + } + + +def write_github_pr_artifacts( + *, + output_dir: str | Path, + metrics: dict[str, Any], + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, + report_path: str | Path, + summary_path: str | Path, + diff_summary_path: str | Path, + review_checklist_path: str | Path, + base_branch: str = "main", +) -> dict[str, Path | None]: + """Write GitHub-ready PR body + optional gh command suggestion.""" + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + github_pr_body_md = out_dir / "github_pr_body.md" + github_pr_body_md.write_text( + build_github_pr_body( + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + report_path=report_path, + summary_path=summary_path, + diff_summary_path=diff_summary_path, + review_checklist_path=review_checklist_path, + ) + ) + + gh_pr_create_command_txt = out_dir / "gh_pr_create_command.txt" + command = build_gh_pr_create_command( + metrics=metrics, + body_path=github_pr_body_md, + base_branch=base_branch, + ) + if command is None: + if gh_pr_create_command_txt.exists(): + gh_pr_create_command_txt.unlink() + command_path: Path | None = None + else: + gh_pr_create_command_txt.write_text(command + "\n") + command_path = gh_pr_create_command_txt + + return { + "github_pr_body_md": github_pr_body_md, + "gh_pr_create_command_txt": command_path, + } diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py index 8ad4d89..28893e5 100644 --- a/evolution/skills/evolve_skill.py +++ b/evolution/skills/evolve_skill.py @@ -5,7 +5,9 @@ python -m evolution.skills.evolve_skill --skill arxiv --eval-source golden --dataset datasets/skills/arxiv/ """ +import inspect import json +import os import sys import time from pathlib import Path @@ -14,15 +16,29 @@ import click import dspy +import yaml from rich.console import Console -from rich.panel import Panel from rich.table import Table -from evolution.core.config import EvolutionConfig, get_hermes_agent_path +from evolution.core.config import EvolutionConfig from evolution.core.dataset_builder import SyntheticDatasetBuilder, EvalDataset, GoldenDatasetLoader from evolution.core.external_importers import build_dataset_from_external -from evolution.core.fitness import skill_fitness_metric, LLMJudge, FitnessScore +from evolution.core.fitness import LLMJudge, skill_fitness_metric from evolution.core.constraints import ConstraintValidator +from evolution.core.hermes_eval import HermesSkillEvalCase, run_skill_eval +from evolution.core.benchmark_gate import run_tblite_benchmark_gate +from evolution.core.report_artifact import ( + build_github_pr_title, + write_github_pr_artifacts, + write_pr_ready_artifacts, + write_report_artifacts, +) +from evolution.core.git_pr_automation import ( + 
build_evolution_branch_name, + build_target_skill_path, + execute_git_pr_automation, + write_git_pr_automation_artifacts, +) from evolution.skills.skill_module import ( SkillModule, load_skill, @@ -32,32 +48,447 @@ console = Console() +DEFAULT_OPTIMIZER_MODEL = "openai/gpt-4.1" +DEFAULT_EVAL_MODEL = "openai/gpt-4.1-mini" +DEFAULT_HERMES_EVAL_MAX_ITERATIONS = 12 + + +def load_env_file(path: str | Path) -> dict[str, str]: + """Load KEY=VALUE lines from an env file without overwriting existing env vars.""" + env_path = Path(path).expanduser() + if not env_path.exists(): + return {} + + loaded: dict[str, str] = {} + for raw_line in env_path.read_text().splitlines(): + line = raw_line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + if line.startswith("export "): + line = line[len("export "):].strip() + key, value = line.split("=", 1) + key = key.strip() + value = value.strip() + if not key or key in os.environ: + continue + if len(value) >= 2 and value[0] == value[-1] and value[0] in {'"', "'"}: + value = value[1:-1] + os.environ[key] = value + loaded[key] = value + + return loaded + + +def load_default_env_files() -> list[Path]: + """Load default credential env files used by local Hermes workflows.""" + loaded_paths: list[Path] = [] + for candidate in [Path.home() / ".hermes" / ".env", Path.cwd() / ".env"]: + if load_env_file(candidate): + loaded_paths.append(candidate) + return loaded_paths + + +def load_hermes_model_config(config_path: str | Path | None = None) -> dict: + """Read the local Hermes model config used by the runtime, if available.""" + candidate = Path(config_path).expanduser() if config_path else Path.home() / ".hermes" / "config.yaml" + if not candidate.exists(): + return {} + data = yaml.safe_load(candidate.read_text()) or {} + model_cfg = data.get("model") or {} + return model_cfg if isinstance(model_cfg, dict) else {} + + +def resolve_runtime_model_settings( + *, + optimizer_model: str, + eval_model: str, + config_path: str | Path | None = None, +) -> tuple[str, str, dict[str, str]]: + """Align self-evolution model defaults with the active Hermes runtime config. + + On machines where Hermes is configured against a custom OpenAI-compatible endpoint, + the repo's hardcoded OpenAI defaults tend to fail. When the caller is still using + those defaults, prefer the Hermes runtime's configured default model and expose its + base URL to LiteLLM/DSPy via env vars without overwriting user-provided values. 
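+
+    Call-pattern sketch (resolved values depend on the local ~/.hermes/config.yaml):
+
+        optimizer_model, eval_model, applied_env = resolve_runtime_model_settings(
+            optimizer_model=DEFAULT_OPTIMIZER_MODEL,
+            eval_model=DEFAULT_EVAL_MODEL,
+        )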
+ """ + model_cfg = load_hermes_model_config(config_path) + provider = str(model_cfg.get("provider") or "").strip().lower() + default_model = str(model_cfg.get("default") or "").strip() + base_url = str(model_cfg.get("base_url") or "").strip().rstrip("/") + applied_env: dict[str, str] = {} + + if provider == "custom" and default_model: + if optimizer_model == DEFAULT_OPTIMIZER_MODEL: + optimizer_model = default_model + if eval_model == DEFAULT_EVAL_MODEL: + eval_model = default_model + + if provider == "custom" and base_url: + for env_name in ("OPENAI_BASE_URL", "OPENAI_API_BASE"): + if not os.environ.get(env_name): + os.environ[env_name] = base_url + applied_env[env_name] = base_url + + return optimizer_model, eval_model, applied_env + + +def score_output_against_example( + *, + example, + agent_output: str, + skill_body: str, + eval_model: str, + hermes_agent_path: str | Path | None = None, +) -> float: + """Score one agent output against an eval example using the richer LLM judge.""" + judge_config = EvolutionConfig( + eval_model=eval_model, + judge_model=eval_model, + ) + if hermes_agent_path is not None: + judge_config.hermes_agent_path = Path(hermes_agent_path).expanduser() + judge = LLMJudge(judge_config) + score = judge.score( + task_input=example.task_input, + expected_behavior=example.expected_behavior, + agent_output=agent_output, + skill_text=skill_body, + ) + return score.composite + + +def evaluate_holdout( + *, + dataset: EvalDataset, + eval_backend: str, + baseline_module, + evolved_module, + baseline_skill_body: str, + evolved_skill_body: str, + eval_model: str, + hermes_repo: str | None = None, + skill_name: str | None = None, +): + """Evaluate baseline and evolved variants on the holdout split.""" + holdout_examples = dataset.to_dspy_examples("holdout") + + baseline_scores = [] + evolved_scores = [] + + if eval_backend == "dspy": + lm = dspy.LM(eval_model) + for ex in holdout_examples: + with dspy.context(lm=lm): + baseline_pred = baseline_module(task_input=ex.task_input) + baseline_scores.append(skill_fitness_metric(ex, baseline_pred)) + + evolved_pred = evolved_module(task_input=ex.task_input) + evolved_scores.append(skill_fitness_metric(ex, evolved_pred)) + return baseline_scores, evolved_scores + + if eval_backend != "hermes": + raise ValueError(f"Unknown eval backend: {eval_backend}") + if not skill_name: + raise ValueError("skill_name is required when eval_backend='hermes'") + + for ex in holdout_examples: + case = HermesSkillEvalCase( + skill_name=skill_name, + task_input=ex.task_input, + ) + baseline_result = run_skill_eval( + case, + model=eval_model, + hermes_repo=hermes_repo, + skill_body_override=baseline_skill_body, + agent_kwargs={"max_iterations": DEFAULT_HERMES_EVAL_MAX_ITERATIONS}, + ) + baseline_scores.append( + score_output_against_example( + example=ex, + agent_output=baseline_result.final_response, + skill_body=baseline_skill_body, + eval_model=eval_model, + hermes_agent_path=hermes_repo, + ) + ) + + evolved_result = run_skill_eval( + case, + model=eval_model, + hermes_repo=hermes_repo, + skill_body_override=evolved_skill_body, + agent_kwargs={"max_iterations": DEFAULT_HERMES_EVAL_MAX_ITERATIONS}, + ) + evolved_scores.append( + score_output_against_example( + example=ex, + agent_output=evolved_result.final_response, + skill_body=evolved_skill_body, + eval_model=eval_model, + hermes_agent_path=hermes_repo, + ) + ) + + return baseline_scores, evolved_scores + + +def maybe_run_tblite_gate( + *, + run_tblite: bool, + skill_name: str, + 
baseline_skill_body: str, + evolved_skill_body: str, + hermes_repo: str | None, + tblite_regression_threshold: float = 0.02, + tblite_task_filter: str | None = None, + tblite_mode: str = "fast", +): + """Run the optional TBLite regression gate when enabled.""" + if not run_tblite: + return None + + if not hermes_repo: + raise ValueError("hermes_repo is required when run_tblite=True") + + return run_tblite_benchmark_gate( + skill_name=skill_name, + baseline_skill_body=baseline_skill_body, + evolved_skill_body=evolved_skill_body, + hermes_repo=hermes_repo, + regression_threshold=tblite_regression_threshold, + task_filter=tblite_task_filter, + mode=tblite_mode, + ) + + +def write_evolution_report_artifacts( + *, + output_dir: str | Path, + metrics: dict, + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, +): + """Write structured report artifacts for one evolution run.""" + return write_report_artifacts( + output_dir=output_dir, + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + ) + + +def write_evolution_pr_ready_artifacts( + *, + output_dir: str | Path, + metrics: dict, + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, +): + """Write PR-ready artifacts for one evolution run.""" + return write_pr_ready_artifacts( + output_dir=output_dir, + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + ) + + +def write_evolution_github_pr_artifacts( + *, + output_dir: str | Path, + metrics: dict, + baseline_skill_path: str | Path, + evolved_skill_path: str | Path, + report_path: str | Path, + summary_path: str | Path, + diff_summary_path: str | Path, + review_checklist_path: str | Path, + base_branch: str = "main", +): + """Write GitHub-ready PR body + gh command suggestion artifacts.""" + return write_github_pr_artifacts( + output_dir=output_dir, + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + report_path=report_path, + summary_path=summary_path, + diff_summary_path=diff_summary_path, + review_checklist_path=review_checklist_path, + base_branch=base_branch, + ) + + +def write_evolution_git_pr_automation_artifacts( + *, + output_dir: str | Path, + metrics: dict, + hermes_repo: str | Path, + skill_relpath: str, + evolved_skill_text: str, + github_pr_body_path: str | Path, +): + """Write git/GitHub connected handoff artifacts for an evolved skill.""" + return write_git_pr_automation_artifacts( + output_dir=output_dir, + metrics=metrics, + hermes_repo=hermes_repo, + skill_relpath=skill_relpath, + evolved_skill_text=evolved_skill_text, + github_pr_body_path=github_pr_body_path, + ) + + +def execute_evolution_git_pr_automation( + *, + git_apply_plan: dict, + gh_pr_create_command: str | None, + execute_push: bool, + execute_pr: bool, +): + """Execute the real git/PR automation flow.""" + return execute_git_pr_automation( + git_apply_plan=git_apply_plan, + gh_pr_create_command=gh_pr_create_command, + execute_push=execute_push, + execute_pr=execute_pr, + ) + + +def validate_skill_constraints( + *, + validator: ConstraintValidator, + full_skill_text: str, + baseline_full_text: str | None = None, +): + """Validate skill constraints against the complete skill text, including frontmatter.""" + return validator.validate_all( + full_skill_text, + "skill", + baseline_text=baseline_full_text, + ) + + +class NormalizedReflectionLM: + """Adapt DSPy/LiteLLM outputs to the list[str|dict] shape GEPA reflective mutation 
expects.""" + + def __init__(self, base_lm): + self.base_lm = base_lm + + def __call__(self, *args, **kwargs): + result = self.base_lm(*args, **kwargs) + if isinstance(result, list): + if not result: + return [""] + normalized = [] + for item in result: + if item is None: + normalized.append("") + elif isinstance(item, (str, dict)): + normalized.append(item) + else: + normalized.append(str(item)) + return normalized + if result is None: + return [""] + if isinstance(result, (str, dict)): + return [result] + return [str(result)] + + def __getattr__(self, name): + return getattr(self.base_lm, name) + + +def create_gepa_optimizer(*, metric, iterations: int, optimizer_model: str | None = None): + """Instantiate GEPA across DSPy API variants.""" + params = inspect.signature(dspy.GEPA).parameters + kwargs = {"metric": metric} + if "max_steps" in params: + kwargs["max_steps"] = iterations + else: + if callable(metric): + metric_arity = len(inspect.signature(metric).parameters) + if metric_arity < 5: + def _wrapped_metric(gold, pred, trace=None, pred_name=None, pred_trace=None): + return metric(gold, pred, trace) + kwargs["metric"] = _wrapped_metric + if "reflection_lm" in params and optimizer_model: + kwargs["reflection_lm"] = NormalizedReflectionLM(dspy.LM(optimizer_model)) + if "max_full_evals" in params: + kwargs["max_full_evals"] = max(1, iterations) + elif "auto" in params: + kwargs["auto"] = "light" + return dspy.GEPA(**kwargs) + + +def optimize_skill_module(*, baseline_module, trainset, valset, iterations: int, metric, optimizer_model: str | None = None): + """Run GEPA when available, otherwise fall back to MIPROv2.""" + try: + optimizer = create_gepa_optimizer(metric=metric, iterations=iterations, optimizer_model=optimizer_model) + optimized = optimizer.compile( + baseline_module, + trainset=trainset, + valset=valset, + ) + return "GEPA", optimized + except Exception as e: + console.print(f"[yellow]GEPA not available ({e}), falling back to MIPROv2[/yellow]") + optimizer = dspy.MIPROv2( + metric=metric, + auto="light", + ) + optimized = optimizer.compile( + baseline_module, + trainset=trainset, + valset=valset, + ) + return "MIPROv2", optimized + def evolve( skill_name: str, iterations: int = 10, eval_source: str = "synthetic", dataset_path: Optional[str] = None, - optimizer_model: str = "openai/gpt-4.1", - eval_model: str = "openai/gpt-4.1-mini", + optimizer_model: str = DEFAULT_OPTIMIZER_MODEL, + eval_model: str = DEFAULT_EVAL_MODEL, hermes_repo: Optional[str] = None, run_tests: bool = False, dry_run: bool = False, + eval_backend: str = "dspy", + run_tblite: bool = False, + tblite_task_filter: Optional[str] = None, + tblite_mode: str = "fast", + execute_git_apply: bool = False, + execute_push: bool = False, + execute_pr: bool = False, ): """Main evolution function — orchestrates the full optimization loop.""" - config = EvolutionConfig( - iterations=iterations, + load_default_env_files() + optimizer_model, eval_model, runtime_env = resolve_runtime_model_settings( optimizer_model=optimizer_model, eval_model=eval_model, - judge_model=eval_model, # Use same model for dataset generation - run_pytest=run_tests, ) + + config_kwargs = { + "iterations": iterations, + "optimizer_model": optimizer_model, + "eval_model": eval_model, + "judge_model": eval_model, + "run_pytest": run_tests, + "run_tblite": run_tblite, + } if hermes_repo: - config.hermes_agent_path = Path(hermes_repo) + config_kwargs["hermes_agent_path"] = Path(hermes_repo).expanduser() + + config = EvolutionConfig(**config_kwargs) - # 
── 1. Find and load the skill ────────────────────────────────────── console.print(f"\n[bold cyan]🧬 Hermes Agent Self-Evolution[/bold cyan] — Evolving skill: [bold]{skill_name}[/bold]\n") + if runtime_env: + env_list = ", ".join(f"{k}={v}" for k, v in runtime_env.items()) + console.print(f" Runtime model alignment: applied {env_list}") skill_path = find_skill(skill_name, config.hermes_agent_path) if not skill_path: @@ -74,10 +505,19 @@ def evolve( console.print(f"\n[bold green]DRY RUN — setup validated successfully.[/bold green]") console.print(f" Would generate eval dataset (source: {eval_source})") console.print(f" Would run GEPA optimization ({iterations} iterations)") + console.print(f" Would evaluate holdout via backend: {eval_backend}") + if run_tblite: + console.print( + f" Would run TBLite {tblite_mode} regression gate " + f"(task filter: {tblite_task_filter or 'mode default'})" + ) + if execute_git_apply: + console.print( + f" Would execute git apply flow (push={execute_push}, pr={execute_pr})" + ) console.print(f" Would validate constraints and create PR") return - # ── 2. Build or load evaluation dataset ───────────────────────────── console.print(f"\n[bold]Building evaluation dataset[/bold] (source: {eval_source})") if eval_source == "golden" and dataset_path: @@ -102,7 +542,6 @@ def evolve( artifact_text=skill["raw"], artifact_type="skill", ) - # Save for reuse save_path = Path("datasets") / "skills" / skill_name dataset.save(save_path) console.print(f" Generated {len(dataset.all_examples)} synthetic examples") @@ -116,10 +555,12 @@ def evolve( console.print(f" Split: {len(dataset.train)} train / {len(dataset.val)} val / {len(dataset.holdout)} holdout") - # ── 3. Validate constraints on baseline ───────────────────────────── console.print(f"\n[bold]Validating baseline constraints[/bold]") validator = ConstraintValidator(config) - baseline_constraints = validator.validate_all(skill["body"], "skill") + baseline_constraints = validate_skill_constraints( + validator=validator, + full_skill_text=skill["raw"], + ) all_pass = True for c in baseline_constraints: icon = "✓" if c.passed else "✗" @@ -131,62 +572,46 @@ def evolve( if not all_pass: console.print("[yellow]⚠ Baseline skill has constraint violations — proceeding anyway[/yellow]") - # ── 4. Set up DSPy + GEPA optimizer ───────────────────────────────── console.print(f"\n[bold]Configuring optimizer[/bold]") console.print(f" Optimizer: GEPA ({iterations} iterations)") console.print(f" Optimizer model: {optimizer_model}") console.print(f" Eval model: {eval_model}") + console.print(f" Holdout backend: {eval_backend}") - # Configure DSPy lm = dspy.LM(eval_model) dspy.configure(lm=lm) - # Create the baseline skill module baseline_module = SkillModule(skill["body"]) - - # Prepare DSPy examples trainset = dataset.to_dspy_examples("train") valset = dataset.to_dspy_examples("val") - # ── 5. 
Run GEPA optimization ──────────────────────────────────────── console.print(f"\n[bold cyan]Running GEPA optimization ({iterations} iterations)...[/bold cyan]\n") start_time = time.time() - try: - optimizer = dspy.GEPA( - metric=skill_fitness_metric, - max_steps=iterations, - ) - - optimized_module = optimizer.compile( - baseline_module, - trainset=trainset, - valset=valset, - ) - except Exception as e: - # Fall back to MIPROv2 if GEPA isn't available in this DSPy version - console.print(f"[yellow]GEPA not available ({e}), falling back to MIPROv2[/yellow]") - optimizer = dspy.MIPROv2( - metric=skill_fitness_metric, - auto="light", - ) - optimized_module = optimizer.compile( - baseline_module, - trainset=trainset, - ) + optimizer_name, optimized_module = optimize_skill_module( + baseline_module=baseline_module, + trainset=trainset, + valset=valset, + iterations=iterations, + metric=skill_fitness_metric, + optimizer_model=optimizer_model, + ) + if optimizer_name != "GEPA": + console.print(f"[yellow]Optimizer fallback in use: {optimizer_name}[/yellow]") elapsed = time.time() - start_time console.print(f"\n Optimization completed in {elapsed:.1f}s") - # ── 6. Extract evolved skill text ─────────────────────────────────── - # The optimized module's instructions contain the evolved skill text evolved_body = optimized_module.skill_text evolved_full = reassemble_skill(skill["frontmatter"], evolved_body) - # ── 7. Validate evolved skill ─────────────────────────────────────── console.print(f"\n[bold]Validating evolved skill[/bold]") - evolved_constraints = validator.validate_all(evolved_body, "skill", baseline_text=skill["body"]) + evolved_constraints = validate_skill_constraints( + validator=validator, + full_skill_text=evolved_full, + baseline_full_text=skill["raw"], + ) all_pass = True for c in evolved_constraints: icon = "✓" if c.passed else "✗" @@ -197,36 +622,30 @@ def evolve( if not all_pass: console.print("[red]✗ Evolved skill FAILED constraints — not deploying[/red]") - # Still save for inspection output_path = Path("output") / skill_name / "evolved_FAILED.md" output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(evolved_full) console.print(f" Saved failed variant to {output_path}") return - # ── 8. Evaluate on holdout set ────────────────────────────────────── console.print(f"\n[bold]Evaluating on holdout set ({len(dataset.holdout)} examples)[/bold]") - holdout_examples = dataset.to_dspy_examples("holdout") - - baseline_scores = [] - evolved_scores = [] - for ex in holdout_examples: - # Score baseline - with dspy.context(lm=lm): - baseline_pred = baseline_module(task_input=ex.task_input) - baseline_score = skill_fitness_metric(ex, baseline_pred) - baseline_scores.append(baseline_score) - - evolved_pred = optimized_module(task_input=ex.task_input) - evolved_score = skill_fitness_metric(ex, evolved_pred) - evolved_scores.append(evolved_score) + baseline_scores, evolved_scores = evaluate_holdout( + dataset=dataset, + eval_backend=eval_backend, + baseline_module=baseline_module, + evolved_module=optimized_module, + baseline_skill_body=skill["body"], + evolved_skill_body=evolved_body, + eval_model=eval_model, + hermes_repo=str(config.hermes_agent_path), + skill_name=skill_name, + ) avg_baseline = sum(baseline_scores) / max(1, len(baseline_scores)) avg_evolved = sum(evolved_scores) / max(1, len(evolved_scores)) improvement = avg_evolved - avg_baseline - # ── 9. 
Report results ─────────────────────────────────────────────── table = Table(title="Evolution Results") table.add_column("Metric", style="bold") table.add_column("Baseline", justify="right") @@ -248,28 +667,46 @@ def evolve( ) table.add_row("Time", "", f"{elapsed:.1f}s", "") table.add_row("Iterations", "", str(iterations), "") + table.add_row("Eval Backend", "", eval_backend, "") + + tblite_gate_result = maybe_run_tblite_gate( + run_tblite=run_tblite, + skill_name=skill_name, + baseline_skill_body=skill["body"], + evolved_skill_body=evolved_body, + hermes_repo=str(config.hermes_agent_path), + tblite_regression_threshold=config.tblite_regression_threshold, + tblite_task_filter=tblite_task_filter, + tblite_mode=tblite_mode, + ) + if tblite_gate_result is not None: + gate_color = "green" if tblite_gate_result.passed else "red" + table.add_row( + "TBLite Gate", + f"{tblite_gate_result.baseline_pass_rate:.3f}", + f"{tblite_gate_result.evolved_pass_rate:.3f}", + f"[{gate_color}]{tblite_gate_result.delta:+.3f}[/{gate_color}]", + ) console.print() console.print(table) - # ── 10. Save output ───────────────────────────────────────────────── timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") output_dir = Path("output") / skill_name / timestamp output_dir.mkdir(parents=True, exist_ok=True) - # Save evolved skill (output_dir / "evolved_skill.md").write_text(evolved_full) - - # Save baseline for comparison (output_dir / "baseline_skill.md").write_text(skill["raw"]) + baseline_skill_path = output_dir / "baseline_skill.md" + evolved_skill_path = output_dir / "evolved_skill.md" - # Save metrics metrics = { "skill_name": skill_name, "timestamp": timestamp, "iterations": iterations, "optimizer_model": optimizer_model, "eval_model": eval_model, + "eval_backend": eval_backend, "baseline_score": avg_baseline, "evolved_score": avg_evolved, "improvement": improvement, @@ -280,10 +717,107 @@ def evolve( "holdout_examples": len(dataset.holdout), "elapsed_seconds": elapsed, "constraints_passed": all_pass, + "tblite_gate": None if tblite_gate_result is None else { + "passed": tblite_gate_result.passed, + "mode": tblite_gate_result.mode, + "task_filter": tblite_gate_result.task_filter, + "base_config_path": str(tblite_gate_result.base_config_path), + "baseline_pass_rate": tblite_gate_result.baseline_pass_rate, + "evolved_pass_rate": tblite_gate_result.evolved_pass_rate, + "delta": tblite_gate_result.delta, + "threshold": tblite_gate_result.threshold, + "summary": tblite_gate_result.summary, + }, } (output_dir / "metrics.json").write_text(json.dumps(metrics, indent=2)) + report_paths = write_evolution_report_artifacts( + output_dir=output_dir, + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + ) + pr_paths = write_evolution_pr_ready_artifacts( + output_dir=output_dir, + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + ) + github_pr_paths = write_evolution_github_pr_artifacts( + output_dir=output_dir, + metrics=metrics, + baseline_skill_path=baseline_skill_path, + evolved_skill_path=evolved_skill_path, + report_path=report_paths["report_md"], + summary_path=report_paths["summary_json"], + diff_summary_path=pr_paths["diff_summary_md"], + review_checklist_path=pr_paths["review_checklist_md"], + ) + skill_relpath = str(skill_path.relative_to(config.hermes_agent_path / "skills")) + git_pr_automation_paths = write_evolution_git_pr_automation_artifacts( + output_dir=output_dir, + metrics=metrics, + 
hermes_repo=config.hermes_agent_path, + skill_relpath=skill_relpath, + evolved_skill_text=evolved_full, + github_pr_body_path=github_pr_paths["github_pr_body_md"], + ) console.print(f"\n Output saved to {output_dir}/") + console.print(f" Report: {report_paths['report_md']}") + console.print(f" Summary: {report_paths['summary_json']}") + console.print(f" PR Draft: {pr_paths['pr_draft_md']}") + console.print(f" Review Checklist: {pr_paths['review_checklist_md']}") + console.print(f" Diff Summary: {pr_paths['diff_summary_md']}") + console.print(f" GitHub PR Body: {github_pr_paths['github_pr_body_md']}") + if github_pr_paths["gh_pr_create_command_txt"] is not None: + console.print(f" gh PR Command: {github_pr_paths['gh_pr_create_command_txt']}") + else: + console.print(" gh PR Command: skipped (decision=reject)") + console.print(f" Candidate Skill Patch: {git_pr_automation_paths['candidate_skill_file']}") + console.print(f" Git Apply Plan: {git_pr_automation_paths['git_apply_plan_sh']}") + console.print(f" Git Apply Guide: {git_pr_automation_paths['git_apply_plan_md']}") + if git_pr_automation_paths["gh_pr_create_after_push_txt"] is not None: + console.print(f" gh PR After Push: {git_pr_automation_paths['gh_pr_create_after_push_txt']}") + else: + console.print(" gh PR After Push: skipped (decision=reject)") + + if tblite_gate_result is not None: + gate_style = "bold green" if tblite_gate_result.passed else "bold red" + gate_icon = "✓" if tblite_gate_result.passed else "✗" + console.print(f"[{gate_style}]{gate_icon} {tblite_gate_result.summary}[/{gate_style}]") + if not tblite_gate_result.passed: + console.print("[red]✗ Benchmark regression gate failed — evolved skill should not be deployed[/red]") + return + + if execute_git_apply and execute_pr and not execute_push: + raise ValueError("execute_push must be True when execute_pr=True") + + if execute_git_apply: + branch_name = build_evolution_branch_name(skill_name=skill_name, timestamp=timestamp) + target_skill_path = build_target_skill_path( + hermes_repo=config.hermes_agent_path, + skill_relpath=skill_relpath, + ) + git_apply_plan = { + "hermes_repo": config.hermes_agent_path, + "branch_name": branch_name, + "target_skill_path": target_skill_path, + "candidate_skill_file": git_pr_automation_paths["candidate_skill_file"], + "commit_message": build_github_pr_title(metrics), + } + gh_pr_create_command = None + if git_pr_automation_paths["gh_pr_create_after_push_txt"] is not None: + gh_pr_create_command = Path(git_pr_automation_paths["gh_pr_create_after_push_txt"]).read_text().strip() + + execution_result = execute_evolution_git_pr_automation( + git_apply_plan=git_apply_plan, + gh_pr_create_command=gh_pr_create_command, + execute_push=execute_push, + execute_pr=execute_pr, + ) + console.print(f" Executed Git Apply Steps: {len(execution_result['git']['steps'])}") + if execution_result["pr"] is not None: + console.print(" Executed PR creation via gh") if improvement > 0: console.print(f"\n[bold green]✓ Evolution improved skill by {improvement:+.3f} ({improvement/max(0.001, avg_baseline)*100:+.1f}%)[/bold green]") @@ -298,13 +832,21 @@ def evolve( @click.option("--iterations", default=10, help="Number of GEPA iterations") @click.option("--eval-source", default="synthetic", type=click.Choice(["synthetic", "golden", "sessiondb"]), help="Source for evaluation dataset") +@click.option("--eval-backend", default="dspy", type=click.Choice(["dspy", "hermes"]), + help="Backend used for holdout evaluation") @click.option("--dataset-path", default=None, 
help="Path to existing eval dataset (JSONL)") -@click.option("--optimizer-model", default="openai/gpt-4.1", help="Model for GEPA reflections") -@click.option("--eval-model", default="openai/gpt-4.1-mini", help="Model for evaluations") +@click.option("--optimizer-model", default=DEFAULT_OPTIMIZER_MODEL, help="Model for GEPA reflections") +@click.option("--eval-model", default=DEFAULT_EVAL_MODEL, help="Model for evaluations") @click.option("--hermes-repo", default=None, help="Path to hermes-agent repo") @click.option("--run-tests", is_flag=True, help="Run full pytest suite as constraint gate") +@click.option("--run-tblite", is_flag=True, help="Run the TBLite regression benchmark gate after holdout eval") +@click.option("--tblite-mode", default="fast", type=click.Choice(["fast", "full"]), help="TBLite gate mode: fast subset or full benchmark") +@click.option("--tblite-task-filter", default=None, help="Optional comma-separated TBLite task filter for a faster gate") +@click.option("--execute-git-apply", is_flag=True, help="Actually apply the evolved skill into the target Hermes repo branch") +@click.option("--execute-push", is_flag=True, help="When executing git apply, also push the branch to the remote") +@click.option("--execute-pr", is_flag=True, help="When executing git apply, also create the PR via gh after push") @click.option("--dry-run", is_flag=True, help="Validate setup without running optimization") -def main(skill, iterations, eval_source, dataset_path, optimizer_model, eval_model, hermes_repo, run_tests, dry_run): +def main(skill, iterations, eval_source, eval_backend, dataset_path, optimizer_model, eval_model, hermes_repo, run_tests, run_tblite, tblite_mode, tblite_task_filter, execute_git_apply, execute_push, execute_pr, dry_run): """Evolve a Hermes Agent skill using DSPy + GEPA optimization.""" evolve( skill_name=skill, @@ -316,6 +858,13 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, eval_mod hermes_repo=hermes_repo, run_tests=run_tests, dry_run=dry_run, + eval_backend=eval_backend, + run_tblite=run_tblite, + tblite_task_filter=tblite_task_filter, + tblite_mode=tblite_mode, + execute_git_apply=execute_git_apply, + execute_push=execute_push, + execute_pr=execute_pr, ) diff --git a/tests/core/test_benchmark_gate.py b/tests/core/test_benchmark_gate.py new file mode 100644 index 0000000..2f9d310 --- /dev/null +++ b/tests/core/test_benchmark_gate.py @@ -0,0 +1,167 @@ +"""Tests for benchmark regression gate helpers.""" + +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace + +import pytest +import yaml + +from evolution.core import benchmark_gate as mod + + +@pytest.fixture +def fake_tblite_base_config(tmp_path: Path) -> Path: + path = tmp_path / "local.yaml" + path.write_text( + yaml.safe_dump( + { + "env": { + "terminal_backend": "docker", + "use_wandb": False, + "data_dir_to_save_evals": "environments/benchmarks/evals/openthoughts-tblite-local", + }, + "openai": { + "model_name": "anthropic/claude-sonnet-4", + }, + } + ) + ) + return path + + +def test_parse_tblite_pass_rate_from_stdout(): + output = "hello\nOverall Pass Rate: 0.4200 (42/100)\nbye\n" + + assert mod.parse_tblite_pass_rate(output) == pytest.approx(0.42) + + +def test_parse_tblite_pass_rate_raises_when_missing_summary(): + with pytest.raises(ValueError, match="Overall Pass Rate"): + mod.parse_tblite_pass_rate("no summary here") + + +def test_run_tblite_gate_allows_small_regression(monkeypatch, tmp_path: Path, fake_tblite_base_config: Path): + 
prompts = { + "BASE": "baseline prompt", + "EVOLVED": "evolved prompt", + } + calls = [] + + def _fake_build_skill_system_prompt(skill_name, hermes_repo=None, skill_body_override=None): + return prompts[skill_body_override], [skill_name] + + def _fake_run(command, capture_output, text, timeout, cwd): + calls.append(command) + config_path = Path(command[command.index("--config") + 1]) + config = yaml.safe_load(config_path.read_text()) + prompt = config["env"]["system_prompt"] + score = 0.50 if prompt == "baseline prompt" else 0.49 + return SimpleNamespace(returncode=0, stdout=f"Overall Pass Rate: {score:.4f} (1/2)", stderr="") + + monkeypatch.setattr(mod, "build_skill_system_prompt", _fake_build_skill_system_prompt) + monkeypatch.setattr(mod.subprocess, "run", _fake_run) + + result = mod.run_tblite_benchmark_gate( + skill_name="github-code-review", + baseline_skill_body="BASE", + evolved_skill_body="EVOLVED", + hermes_repo=tmp_path, + regression_threshold=0.02, + task_filter="broken-python,pandas-etl", + base_config_path=fake_tblite_base_config, + benchmark_script_path=tmp_path / "tblite_env.py", + ) + + assert result.passed is True + assert result.baseline_pass_rate == pytest.approx(0.50) + assert result.evolved_pass_rate == pytest.approx(0.49) + assert result.delta == pytest.approx(-0.01) + assert len(calls) == 2 + assert calls[0][-2:] == ["--env.task_filter", "broken-python,pandas-etl"] + + +def test_run_tblite_gate_fails_large_regression(monkeypatch, tmp_path: Path, fake_tblite_base_config: Path): + monkeypatch.setattr( + mod, + "build_skill_system_prompt", + lambda skill_name, hermes_repo=None, skill_body_override=None: (f"prompt::{skill_body_override}", [skill_name]), + ) + + def _fake_run(command, capture_output, text, timeout, cwd): + config_path = Path(command[command.index("--config") + 1]) + config = yaml.safe_load(config_path.read_text()) + prompt = config["env"]["system_prompt"] + score = 0.50 if prompt == "prompt::BASE" else 0.40 + return SimpleNamespace(returncode=0, stdout=f"Overall Pass Rate: {score:.4f} (1/2)", stderr="") + + monkeypatch.setattr(mod.subprocess, "run", _fake_run) + + result = mod.run_tblite_benchmark_gate( + skill_name="github-code-review", + baseline_skill_body="BASE", + evolved_skill_body="EVOLVED", + hermes_repo=tmp_path, + regression_threshold=0.02, + base_config_path=fake_tblite_base_config, + benchmark_script_path=tmp_path / "tblite_env.py", + ) + + assert result.passed is False + assert result.delta == pytest.approx(-0.10) + assert "regression" in result.summary.lower() + + +def test_resolve_tblite_gate_plan_fast_uses_local_config_and_default_subset(tmp_path: Path): + plan = mod.resolve_tblite_gate_plan(mode="fast", hermes_repo=tmp_path) + + assert plan.mode == "fast" + assert plan.base_config_path == tmp_path / "environments/benchmarks/tblite/local.yaml" + assert plan.task_filter == mod.DEFAULT_FAST_TBLITE_TASK_FILTER + + + +def test_resolve_tblite_gate_plan_full_uses_default_config_without_filter(tmp_path: Path): + plan = mod.resolve_tblite_gate_plan(mode="full", hermes_repo=tmp_path) + + assert plan.mode == "full" + assert plan.base_config_path == tmp_path / "environments/benchmarks/tblite/default.yaml" + assert plan.task_filter is None + + + +def test_run_tblite_gate_records_mode_metadata(monkeypatch, tmp_path: Path, fake_tblite_base_config: Path): + monkeypatch.setattr( + mod, + "build_skill_system_prompt", + lambda skill_name, hermes_repo=None, skill_body_override=None: (f"prompt::{skill_body_override}", [skill_name]), + ) + 
monkeypatch.setattr( + mod, + "resolve_tblite_gate_plan", + lambda **kwargs: mod.TBLiteGatePlan( + mode="fast", + base_config_path=fake_tblite_base_config, + task_filter="broken-python,pandas-etl", + benchmark_script_path=tmp_path / "tblite_env.py", + ), + ) + + def _fake_run(command, capture_output, text, timeout, cwd): + return SimpleNamespace(returncode=0, stdout="Overall Pass Rate: 0.5000 (1/2)", stderr="") + + monkeypatch.setattr(mod.subprocess, "run", _fake_run) + + result = mod.run_tblite_benchmark_gate( + skill_name="github-code-review", + baseline_skill_body="BASE", + evolved_skill_body="EVOLVED", + hermes_repo=tmp_path, + regression_threshold=0.02, + mode="fast", + ) + + assert result.mode == "fast" + assert result.task_filter == "broken-python,pandas-etl" + assert result.base_config_path == fake_tblite_base_config diff --git a/tests/core/test_constraints.py b/tests/core/test_constraints.py index 88e3aaa..2cebf89 100644 --- a/tests/core/test_constraints.py +++ b/tests/core/test_constraints.py @@ -1,5 +1,7 @@ """Tests for constraint validators.""" +from pathlib import Path + import pytest from evolution.core.constraints import ConstraintValidator from evolution.core.config import EvolutionConfig @@ -7,7 +9,7 @@ @pytest.fixture def validator(): - config = EvolutionConfig() + config = EvolutionConfig(hermes_agent_path=Path("/tmp/hermes-agent")) return ConstraintValidator(config) diff --git a/tests/core/test_git_pr_automation.py b/tests/core/test_git_pr_automation.py new file mode 100644 index 0000000..7c55371 --- /dev/null +++ b/tests/core/test_git_pr_automation.py @@ -0,0 +1,239 @@ +"""Tests for git/GitHub automation artifacts.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from evolution.core import git_pr_automation as mod + + +def sample_metrics() -> dict: + return { + "skill_name": "github-code-review", + "timestamp": "20260414_180000", + "eval_backend": "hermes", + "baseline_score": 0.42, + "evolved_score": 0.57, + "improvement": 0.15, + "tblite_gate": { + "passed": True, + "summary": "TBLite fast gate passed", + }, + } + + +def test_build_evolution_branch_name_sanitizes_skill_and_includes_timestamp(): + branch = mod.build_evolution_branch_name( + skill_name="github/code review", + timestamp="20260414_180000", + ) + + assert branch == "evolution/github-code-review-20260414_180000" + + +def test_build_target_skill_path_resolves_skill_location_from_repo_root(tmp_path: Path): + target = mod.build_target_skill_path( + hermes_repo=tmp_path / "hermes-agent", + skill_relpath="github/github-code-review/SKILL.md", + ) + + assert target == tmp_path / "hermes-agent" / "skills" / "github/github-code-review/SKILL.md" + + +def test_write_skill_patch_artifacts_writes_candidate_skill_file_and_manifest(tmp_path: Path): + paths = mod.write_skill_patch_artifacts( + output_dir=tmp_path, + skill_relpath="github/github-code-review/SKILL.md", + evolved_skill_text="# evolved skill\n", + hermes_repo=tmp_path / "hermes-agent", + ) + + assert paths["candidate_skill_file"].exists() + assert paths["git_patch_manifest_json"].exists() + assert "github/github-code-review/SKILL.md" in paths["git_patch_manifest_json"].read_text() + + +def test_build_git_apply_plan_includes_branch_checkout_commit_and_push(tmp_path: Path): + plan = mod.build_git_apply_plan( + hermes_repo=tmp_path / "hermes-agent", + branch_name="evolution/github-code-review-20260414_180000", + target_skill_path=tmp_path / "hermes-agent/skills/github/github-code-review/SKILL.md", + 
candidate_skill_file=tmp_path / "candidate_skill.md", + commit_message="feat: evolve skill github-code-review (candidate)", + push_remote="origin", + ) + + assert "git checkout -b evolution/github-code-review-20260414_180000" in plan + assert "cp" in plan + assert "git add" in plan + assert "git commit -m" in plan + assert "git push -u origin" in plan + + +def test_write_git_apply_plan_artifacts_writes_plan_files(tmp_path: Path): + paths = mod.write_git_apply_plan_artifacts( + output_dir=tmp_path, + hermes_repo=tmp_path / "hermes-agent", + branch_name="evolution/github-code-review-20260414_180000", + target_skill_path=tmp_path / "hermes-agent/skills/github/github-code-review/SKILL.md", + candidate_skill_file=tmp_path / "candidate_skill.md", + commit_message="feat: evolve skill github-code-review (candidate)", + ) + + assert paths["git_apply_plan_sh"].exists() + assert paths["git_apply_plan_md"].exists() + assert "git checkout -b" in paths["git_apply_plan_sh"].read_text() + assert "Commit message" in paths["git_apply_plan_md"].read_text() + + +def test_write_full_git_pr_automation_artifacts_writes_branch_and_pr_handoff(tmp_path: Path): + paths = mod.write_git_pr_automation_artifacts( + output_dir=tmp_path, + metrics=sample_metrics(), + hermes_repo=tmp_path / "hermes-agent", + skill_relpath="github/github-code-review/SKILL.md", + evolved_skill_text="# evolved skill\n", + github_pr_body_path=tmp_path / "github_pr_body.md", + ) + + assert paths["candidate_skill_file"].exists() + assert paths["git_apply_plan_sh"].exists() + assert paths["git_apply_plan_md"].exists() + assert paths["git_patch_manifest_json"].exists() + assert paths["gh_pr_create_after_push_txt"].exists() + assert "gh pr create" in paths["gh_pr_create_after_push_txt"].read_text() + + + +def test_build_target_skill_path_rejects_escape_from_skills_root(tmp_path: Path): + with pytest.raises(ValueError, match="skills"): + mod.build_target_skill_path( + hermes_repo=tmp_path / "hermes-agent", + skill_relpath="../outside.md", + ) + + + +def test_execute_git_apply_plan_rejects_target_outside_skills_root(tmp_path: Path): + with pytest.raises(ValueError, match="skills"): + mod.execute_git_apply_plan( + hermes_repo=tmp_path / "hermes-agent", + branch_name="evolution/github-code-review-20260414_180000", + target_skill_path=tmp_path / "outside.md", + candidate_skill_file=tmp_path / "candidate_skill.md", + commit_message="feat: evolve skill github-code-review (candidate)", + run_push=False, + runner=lambda command, *, workdir=None: {"exit_code": 0, "output": "ok"}, + ) + + + +def test_write_full_git_pr_automation_artifacts_skips_pr_command_for_reject(tmp_path: Path): + metrics = sample_metrics() + metrics["tblite_gate"]["passed"] = False + + paths = mod.write_git_pr_automation_artifacts( + output_dir=tmp_path, + metrics=metrics, + hermes_repo=tmp_path / "hermes-agent", + skill_relpath="github/github-code-review/SKILL.md", + evolved_skill_text="# evolved skill\n", + github_pr_body_path=tmp_path / "github_pr_body.md", + ) + + assert paths["gh_pr_create_after_push_txt"] is None + + +def test_execute_git_apply_plan_runs_copy_commit_and_push_steps_in_order(tmp_path: Path): + commands = [] + + def _fake_runner(command: str, *, workdir=None): + commands.append((command, workdir)) + return {"exit_code": 0, "output": "ok"} + + result = mod.execute_git_apply_plan( + hermes_repo=tmp_path / "hermes-agent", + branch_name="evolution/github-code-review-20260414_180000", + target_skill_path=tmp_path / "hermes-agent/skills/github/github-code-review/SKILL.md", 
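+        # candidate_skill_file is the staged copy that the plan's `cp` step
+        # writes over target_skill_path before `git add`/`git commit`
+        # (per the step-order assertions below).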
+ candidate_skill_file=tmp_path / "candidate_skill.md", + commit_message="feat: evolve skill github-code-review (candidate)", + run_push=True, + runner=_fake_runner, + ) + + assert result["steps"][0]["name"] == "checkout_branch" + assert result["steps"][-1]["name"] == "push_branch" + assert any("git checkout -b" in cmd for cmd, _ in commands) + assert any("git commit -m" in cmd for cmd, _ in commands) + assert any("git push -u origin" in cmd for cmd, _ in commands) + + +def test_execute_git_apply_plan_can_skip_push(tmp_path: Path): + commands = [] + + def _fake_runner(command: str, *, workdir=None): + commands.append(command) + return {"exit_code": 0, "output": "ok"} + + result = mod.execute_git_apply_plan( + hermes_repo=tmp_path / "hermes-agent", + branch_name="evolution/github-code-review-20260414_180000", + target_skill_path=tmp_path / "hermes-agent/skills/github/github-code-review/SKILL.md", + candidate_skill_file=tmp_path / "candidate_skill.md", + commit_message="feat: evolve skill github-code-review (candidate)", + run_push=False, + runner=_fake_runner, + ) + + assert all(step["name"] != "push_branch" for step in result["steps"]) + assert not any("git push -u origin" in cmd for cmd in commands) + + +def test_execute_gh_pr_create_runs_only_when_command_is_present(tmp_path: Path): + commands = [] + + def _fake_runner(command: str, *, workdir=None): + commands.append((command, workdir)) + return {"exit_code": 0, "output": "created"} + + result = mod.execute_gh_pr_create( + command="gh pr create --base main --head evolution/test --title 'x' --body-file body.md", + hermes_repo=tmp_path / "hermes-agent", + runner=_fake_runner, + ) + + assert result["executed"] is True + assert commands[0][0].startswith("gh pr create") + + +def test_execute_full_git_pr_automation_skips_pr_creation_without_command(tmp_path: Path): + calls = [] + + def _fake_execute_git_apply_plan(**kwargs): + calls.append(("apply", kwargs)) + return {"steps": [{"name": "checkout_branch"}]} + + def _fake_execute_gh_pr_create(**kwargs): + calls.append(("pr", kwargs)) + return {"executed": False} + + result = mod.execute_git_pr_automation( + git_apply_plan={ + "hermes_repo": tmp_path / "hermes-agent", + "branch_name": "evolution/github-code-review-20260414_180000", + "target_skill_path": tmp_path / "hermes-agent/skills/github/github-code-review/SKILL.md", + "candidate_skill_file": tmp_path / "candidate_skill.md", + "commit_message": "feat: evolve skill github-code-review (candidate)", + }, + gh_pr_create_command=None, + execute_push=True, + execute_pr=False, + execute_git_apply_plan_fn=_fake_execute_git_apply_plan, + execute_gh_pr_create_fn=_fake_execute_gh_pr_create, + ) + + assert result["git"]["steps"][0]["name"] == "checkout_branch" + assert result["pr"] is None + assert [kind for kind, _ in calls] == ["apply"] diff --git a/tests/core/test_hermes_eval.py b/tests/core/test_hermes_eval.py new file mode 100644 index 0000000..98847ff --- /dev/null +++ b/tests/core/test_hermes_eval.py @@ -0,0 +1,203 @@ +"""Tests for real Hermes Agent-backed skill evaluation helpers.""" + +from types import SimpleNamespace + +import pytest + +from evolution.core.hermes_eval import ( + HermesSkillEvalCase, + _load_hermes_symbols, + build_skill_system_prompt, + run_skill_eval, +) + + +class FakeAgent: + last_init = None + last_user_message = None + + def __init__(self, **kwargs): + FakeAgent.last_init = kwargs + + def run_conversation(self, user_message: str, conversation_history=None): + FakeAgent.last_user_message = user_message + return 
{"final_response": "done", "messages": []} + + +class InlineOnlyAgent(FakeAgent): + pass + + +@pytest.fixture(autouse=True) +def reset_fake_agent(): + FakeAgent.last_init = None + FakeAgent.last_user_message = None + + +@pytest.fixture +def fake_loader(monkeypatch): + def _fake_loader(hermes_repo=None): + def _build_preloaded_skills_prompt(skill_identifiers, task_id=None): + assert skill_identifiers == ["github-code-review"] + return ("[SYSTEM: skill prompt]", ["github-code-review"], []) + + return SimpleNamespace( + AIAgent=FakeAgent, + build_preloaded_skills_prompt=_build_preloaded_skills_prompt, + ) + + monkeypatch.setattr("evolution.core.hermes_eval._load_hermes_symbols", _fake_loader) + + +def test_build_skill_system_prompt_can_inline_alternate_skill_body(monkeypatch): + def _fake_loader(hermes_repo=None): + def _build_preloaded_skills_prompt(skill_identifiers, task_id=None): + return ("[SYSTEM: original skill prompt]", ["github-code-review"], []) + + return SimpleNamespace( + AIAgent=InlineOnlyAgent, + build_preloaded_skills_prompt=_build_preloaded_skills_prompt, + ) + + monkeypatch.setattr("evolution.core.hermes_eval._load_hermes_symbols", _fake_loader) + + prompt, loaded = build_skill_system_prompt( + "github-code-review", + skill_body_override="# EVOLVED\nUse stricter review criteria.", + ) + + assert loaded == ["github-code-review"] + assert "original skill prompt" not in prompt + assert "EVOLVED" in prompt + assert "stricter review criteria" in prompt + + +def test_load_hermes_symbols_imports_from_repo(monkeypatch, tmp_path): + repo = tmp_path / "hermes-agent" + (repo / "agent").mkdir(parents=True) + (repo / "run_agent.py").write_text( + 'class AIAgent:\n' + ' pass\n' + ) + (repo / "agent" / "skill_commands.py").write_text( + 'def build_preloaded_skills_prompt(skill_identifiers, task_id=None):\n' + ' return "prompt", ["loaded"], []\n' + ) + + symbols = _load_hermes_symbols(repo) + + assert symbols.AIAgent.__name__ == "AIAgent" + prompt, loaded, missing = symbols.build_preloaded_skills_prompt(["x"]) + assert prompt == "prompt" + assert loaded == ["loaded"] + assert missing == [] + + +def test_load_hermes_symbols_tolerates_missing_fire_dependency(monkeypatch, tmp_path): + repo = tmp_path / "hermes-agent" + (repo / "agent").mkdir(parents=True) + (repo / "run_agent.py").write_text( + 'import fire\n' + 'class AIAgent:\n' + ' pass\n' + ) + (repo / "agent" / "skill_commands.py").write_text( + 'def build_preloaded_skills_prompt(skill_identifiers, task_id=None):\n' + ' return "prompt", ["loaded"], []\n' + ) + monkeypatch.delitem(__import__("sys").modules, "fire", raising=False) + + symbols = _load_hermes_symbols(repo) + + assert symbols.AIAgent.__name__ == "AIAgent" + + + +def test_load_hermes_symbols_adds_repo_to_sys_path_for_local_imports(tmp_path): + repo = tmp_path / "hermes-agent" + (repo / "agent").mkdir(parents=True) + (repo / "hermes_constants.py").write_text('VALUE = "ok"\n') + (repo / "run_agent.py").write_text( + 'from hermes_constants import VALUE\n' + 'class AIAgent:\n' + ' value = VALUE\n' + ) + (repo / "agent" / "skill_commands.py").write_text( + 'def build_preloaded_skills_prompt(skill_identifiers, task_id=None):\n' + ' return "prompt", ["loaded"], []\n' + ) + + symbols = _load_hermes_symbols(repo) + + assert symbols.AIAgent.value == "ok" + + +def test_build_skill_system_prompt_returns_loaded_skill_prompt(fake_loader): + prompt, loaded = build_skill_system_prompt("github-code-review", hermes_repo="/tmp/hermes") + + assert prompt == "[SYSTEM: skill prompt]" + assert loaded 
== ["github-code-review"] + + +def test_build_skill_system_prompt_raises_for_missing_skill(monkeypatch): + def _fake_loader(hermes_repo=None): + def _build_preloaded_skills_prompt(skill_identifiers, task_id=None): + return ("", [], ["missing-skill"]) + + return SimpleNamespace( + AIAgent=FakeAgent, + build_preloaded_skills_prompt=_build_preloaded_skills_prompt, + ) + + monkeypatch.setattr("evolution.core.hermes_eval._load_hermes_symbols", _fake_loader) + + with pytest.raises(ValueError, match="missing-skill"): + build_skill_system_prompt("github-code-review") + + +def test_run_skill_eval_uses_real_agent_shape_and_preloaded_skill_prompt(fake_loader): + case = HermesSkillEvalCase( + skill_name="github-code-review", + task_input="Review this diff for security issues.", + system_prompt="[SYSTEM: custom evaluator instructions]", + ) + + result = run_skill_eval(case, model="openai/gpt-4.1-mini", hermes_repo="/tmp/hermes") + + assert result.final_response == "done" + assert result.loaded_skills == ["github-code-review"] + assert "custom evaluator instructions" in result.effective_system_prompt + assert "skill prompt" in result.effective_system_prompt + assert FakeAgent.last_user_message == "Review this diff for security issues." + assert FakeAgent.last_init["model"] == "openai/gpt-4.1-mini" + assert FakeAgent.last_init["quiet_mode"] is True + assert FakeAgent.last_init["skip_context_files"] is True + assert FakeAgent.last_init["skip_memory"] is True + assert FakeAgent.last_init["ephemeral_system_prompt"] == result.effective_system_prompt + + +def test_run_skill_eval_accepts_string_agent_result(fake_loader, monkeypatch): + class StringAgent(FakeAgent): + def run_conversation(self, user_message: str, conversation_history=None): + return "plain string response" + + def _fake_loader(hermes_repo=None): + def _build_preloaded_skills_prompt(skill_identifiers, task_id=None): + return ("[SYSTEM: skill prompt]", ["github-code-review"], []) + + return SimpleNamespace( + AIAgent=StringAgent, + build_preloaded_skills_prompt=_build_preloaded_skills_prompt, + ) + + monkeypatch.setattr("evolution.core.hermes_eval._load_hermes_symbols", _fake_loader) + + result = run_skill_eval( + HermesSkillEvalCase( + skill_name="github-code-review", + task_input="Review this diff.", + ) + ) + + assert result.final_response == "plain string response" + assert result.raw_result == "plain string response" diff --git a/tests/core/test_report_artifact.py b/tests/core/test_report_artifact.py new file mode 100644 index 0000000..0097985 --- /dev/null +++ b/tests/core/test_report_artifact.py @@ -0,0 +1,205 @@ +"""Tests for report artifact helpers.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from evolution.core import report_artifact as mod + + +def sample_metrics() -> dict: + return { + "skill_name": "github-code-review", + "timestamp": "20260414_180000", + "iterations": 5, + "optimizer_model": "openai/gpt-4.1", + "eval_model": "openai/gpt-4.1-mini", + "eval_backend": "hermes", + "baseline_score": 0.42, + "evolved_score": 0.57, + "improvement": 0.15, + "baseline_size": 1000, + "evolved_size": 1100, + "train_examples": 10, + "val_examples": 5, + "holdout_examples": 5, + "elapsed_seconds": 12.5, + "constraints_passed": True, + "tblite_gate": { + "passed": True, + "mode": "fast", + "task_filter": "broken-python,pandas-etl", + "base_config_path": "/tmp/local.yaml", + "baseline_pass_rate": 0.50, + "evolved_pass_rate": 0.48, + "delta": -0.02, + "threshold": 0.02, + "summary": "TBLite fast gate 
passed", + }, + } + + +def test_build_evolution_report_contains_decision_and_key_metrics(): + report = mod.build_evolution_report( + metrics=sample_metrics(), + baseline_skill_path="output/github-code-review/20260414_180000/baseline_skill.md", + evolved_skill_path="output/github-code-review/20260414_180000/evolved_skill.md", + ) + + assert "# Evolution Report" in report + assert "github-code-review" in report + assert "candidate_for_review" in report + assert "TBLite fast gate passed" in report + assert "output/github-code-review/20260414_180000/evolved_skill.md" in report + + + +def test_summarize_recommendation_rejects_on_failed_gate(): + metrics = sample_metrics() + metrics["tblite_gate"]["passed"] = False + metrics["tblite_gate"]["summary"] = "TBLite fast regression detected" + + summary = mod.summarize_recommendation(metrics) + + assert summary["decision"] == "reject" + assert "benchmark_gate_failed" in summary["reasons"] + + + +def test_write_report_artifacts_creates_markdown_and_summary_json(tmp_path: Path): + metrics = sample_metrics() + + paths = mod.write_report_artifacts( + output_dir=tmp_path, + metrics=metrics, + baseline_skill_path=tmp_path / "baseline_skill.md", + evolved_skill_path=tmp_path / "evolved_skill.md", + ) + + assert paths["report_md"].exists() + assert paths["summary_json"].exists() + + summary = json.loads(paths["summary_json"].read_text()) + assert summary["decision"] == "candidate_for_review" + assert summary["skill_name"] == "github-code-review" + + + +def test_write_pr_ready_artifacts_creates_pr_files(tmp_path: Path): + metrics = sample_metrics() + + paths = mod.write_pr_ready_artifacts( + output_dir=tmp_path, + metrics=metrics, + baseline_skill_path=tmp_path / "baseline_skill.md", + evolved_skill_path=tmp_path / "evolved_skill.md", + ) + + assert paths["pr_draft_md"].exists() + assert paths["review_checklist_md"].exists() + assert paths["diff_summary_md"].exists() + + + +def test_build_pr_draft_marks_review_candidate_and_contains_sections(tmp_path: Path): + metrics = sample_metrics() + + draft = mod.build_pr_draft( + metrics=metrics, + baseline_skill_path=tmp_path / "baseline_skill.md", + evolved_skill_path=tmp_path / "evolved_skill.md", + ) + + assert "# PR Draft" in draft + assert "candidate_for_review" in draft + assert "## Summary" in draft + assert "## Test Plan" in draft + + + +def test_build_review_checklist_mentions_gate_and_holdout(): + checklist = mod.build_review_checklist(sample_metrics()) + + assert "Holdout improvement looks meaningful" in checklist + assert "TBLite gate result reviewed" in checklist + + + +def test_build_diff_summary_mentions_size_and_improvement(tmp_path: Path): + summary = mod.build_diff_summary( + metrics=sample_metrics(), + baseline_skill_path=tmp_path / "baseline_skill.md", + evolved_skill_path=tmp_path / "evolved_skill.md", + ) + + assert "Skill size delta" in summary + assert "+0.150" in summary + assert "baseline_skill.md" in summary + + +def test_build_github_pr_body_contains_github_sections_and_artifact_paths(tmp_path: Path): + body = mod.build_github_pr_body( + metrics=sample_metrics(), + baseline_skill_path=tmp_path / "baseline_skill.md", + evolved_skill_path=tmp_path / "evolved_skill.md", + report_path=tmp_path / "report.md", + summary_path=tmp_path / "summary.json", + diff_summary_path=tmp_path / "diff_summary.md", + review_checklist_path=tmp_path / "review_checklist.md", + ) + + assert "## Summary" in body + assert "## Evaluation Evidence" in body + assert "## Artifacts" in body + assert "## Test Plan" in body 
+ assert "report.md" in body + assert "review_checklist.md" in body + + +def test_build_gh_pr_create_command_marks_review_needed_variants_as_draft(tmp_path: Path): + metrics = sample_metrics() + metrics["improvement"] = 0.0 + metrics["evolved_score"] = metrics["baseline_score"] + + command = mod.build_gh_pr_create_command( + metrics=metrics, + body_path=tmp_path / "github_pr_body.md", + ) + + assert command is not None + assert "gh pr create" in command + assert "--draft" in command + assert "--body-file" in command + + +def test_build_gh_pr_create_command_skips_rejected_variants(tmp_path: Path): + metrics = sample_metrics() + metrics["tblite_gate"]["passed"] = False + metrics["tblite_gate"]["summary"] = "TBLite fast regression detected" + + command = mod.build_gh_pr_create_command( + metrics=metrics, + body_path=tmp_path / "github_pr_body.md", + ) + + assert command is None + + +def test_write_github_pr_artifacts_writes_body_and_command_for_candidates(tmp_path: Path): + paths = mod.write_github_pr_artifacts( + output_dir=tmp_path, + metrics=sample_metrics(), + baseline_skill_path=tmp_path / "baseline_skill.md", + evolved_skill_path=tmp_path / "evolved_skill.md", + report_path=tmp_path / "report.md", + summary_path=tmp_path / "summary.json", + diff_summary_path=tmp_path / "diff_summary.md", + review_checklist_path=tmp_path / "review_checklist.md", + ) + + assert paths["github_pr_body_md"].exists() + assert paths["gh_pr_create_command_txt"].exists() + command = paths["gh_pr_create_command_txt"].read_text() + assert "gh pr create" in command + assert "--title" in command diff --git a/tests/skills/test_evolve_skill.py b/tests/skills/test_evolve_skill.py new file mode 100644 index 0000000..cc9b315 --- /dev/null +++ b/tests/skills/test_evolve_skill.py @@ -0,0 +1,614 @@ +"""Tests for evolve_skill orchestration helpers.""" + +import os +from types import SimpleNamespace + +from evolution.core.dataset_builder import EvalExample, EvalDataset +from evolution.skills import evolve_skill as mod + + +class FakeModule: + def __init__(self, skill_text): + self.skill_text = skill_text + + def __call__(self, task_input: str): + return SimpleNamespace(output=f"base::{task_input}") + + +class FakeOptimizedModule: + def __init__(self, skill_text): + self.skill_text = skill_text + + def __call__(self, task_input: str): + return SimpleNamespace(output=f"evolved::{task_input}") + + +def sample_dataset(): + return EvalDataset( + train=[EvalExample(task_input="train task", expected_behavior="train rubric")], + val=[EvalExample(task_input="val task", expected_behavior="val rubric")], + holdout=[ + EvalExample(task_input="task one", expected_behavior="rubric one"), + EvalExample(task_input="task two", expected_behavior="rubric two"), + ], + ) + + +def test_evaluate_holdout_with_dspy_backend(monkeypatch): + monkeypatch.setattr(mod, "skill_fitness_metric", lambda ex, pred: 0.8 if pred.output.startswith("base::") else 0.95) + + baseline, evolved = mod.evaluate_holdout( + dataset=sample_dataset(), + eval_backend="dspy", + baseline_module=FakeModule("BASE"), + evolved_module=FakeOptimizedModule("EVOLVED"), + baseline_skill_body="BASE", + evolved_skill_body="EVOLVED", + eval_model="openai/gpt-4.1-mini", + hermes_repo="/tmp/hermes", + ) + + assert baseline == [0.8, 0.8] + assert evolved == [0.95, 0.95] + + + +def test_evaluate_holdout_with_hermes_backend(monkeypatch): + calls = [] + + def _fake_run_skill_eval(case, **kwargs): + calls.append({ + "skill_name": case.skill_name, + "task_input": case.task_input, + 
"skill_body_override": kwargs.get("skill_body_override"), + "agent_kwargs": kwargs.get("agent_kwargs"), + }) + return SimpleNamespace(final_response=f"resp::{case.task_input}", raw_result={}) + + def score_by_body(skill_body): + return 0.4 if skill_body == "BASE" else 0.9 + + monkeypatch.setattr(mod, "run_skill_eval", _fake_run_skill_eval) + monkeypatch.setattr(mod, "score_output_against_example", lambda **kwargs: score_by_body(kwargs["skill_body"])) + + baseline, evolved = mod.evaluate_holdout( + dataset=sample_dataset(), + eval_backend="hermes", + baseline_module=FakeModule("BASE"), + evolved_module=FakeOptimizedModule("EVOLVED"), + baseline_skill_body="BASE", + evolved_skill_body="EVOLVED", + eval_model="openai/gpt-4.1-mini", + hermes_repo="/tmp/hermes", + skill_name="github-code-review", + ) + + assert baseline == [0.4, 0.4] + assert evolved == [0.9, 0.9] + assert calls == [ + { + "skill_name": "github-code-review", + "task_input": "task one", + "skill_body_override": "BASE", + "agent_kwargs": {"max_iterations": mod.DEFAULT_HERMES_EVAL_MAX_ITERATIONS}, + }, + { + "skill_name": "github-code-review", + "task_input": "task one", + "skill_body_override": "EVOLVED", + "agent_kwargs": {"max_iterations": mod.DEFAULT_HERMES_EVAL_MAX_ITERATIONS}, + }, + { + "skill_name": "github-code-review", + "task_input": "task two", + "skill_body_override": "BASE", + "agent_kwargs": {"max_iterations": mod.DEFAULT_HERMES_EVAL_MAX_ITERATIONS}, + }, + { + "skill_name": "github-code-review", + "task_input": "task two", + "skill_body_override": "EVOLVED", + "agent_kwargs": {"max_iterations": mod.DEFAULT_HERMES_EVAL_MAX_ITERATIONS}, + }, + ] + + + +def test_maybe_run_tblite_gate_skips_when_disabled(): + result = mod.maybe_run_tblite_gate( + run_tblite=False, + skill_name="github-code-review", + baseline_skill_body="BASE", + evolved_skill_body="EVOLVED", + hermes_repo="/tmp/hermes", + ) + + assert result is None + + + +def test_maybe_run_tblite_gate_invokes_gate_when_enabled(monkeypatch): + calls = [] + + def _fake_gate(**kwargs): + calls.append(kwargs) + return SimpleNamespace(passed=True, delta=0.01) + + monkeypatch.setattr(mod, "run_tblite_benchmark_gate", _fake_gate) + + result = mod.maybe_run_tblite_gate( + run_tblite=True, + skill_name="github-code-review", + baseline_skill_body="BASE", + evolved_skill_body="EVOLVED", + hermes_repo="/tmp/hermes", + tblite_regression_threshold=0.03, + tblite_task_filter="broken-python", + tblite_mode="fast", + ) + + assert result.passed is True + assert calls == [{ + "skill_name": "github-code-review", + "baseline_skill_body": "BASE", + "evolved_skill_body": "EVOLVED", + "hermes_repo": "/tmp/hermes", + "regression_threshold": 0.03, + "task_filter": "broken-python", + "mode": "fast", + }] + + + +def test_write_report_artifacts_delegates_to_report_module(tmp_path, monkeypatch): + metrics = {"skill_name": "github-code-review"} + called = {} + + def _fake_write_report_artifacts(**kwargs): + called.update(kwargs) + return { + "report_md": tmp_path / "report.md", + "summary_json": tmp_path / "summary.json", + } + + monkeypatch.setattr(mod, "write_report_artifacts", _fake_write_report_artifacts) + + result = mod.write_evolution_report_artifacts( + output_dir=tmp_path, + metrics=metrics, + baseline_skill_path=tmp_path / "baseline.md", + evolved_skill_path=tmp_path / "evolved.md", + ) + + assert result["report_md"] == tmp_path / "report.md" + assert called["metrics"] == metrics + assert called["baseline_skill_path"] == tmp_path / "baseline.md" + + + +def 
test_write_pr_ready_artifacts_delegates_to_report_module(tmp_path, monkeypatch): + metrics = {"skill_name": "github-code-review"} + called = {} + + def _fake_write_pr_ready_artifacts(**kwargs): + called.update(kwargs) + return {"pr_draft_md": tmp_path / "pr_draft.md"} + + monkeypatch.setattr(mod, "write_pr_ready_artifacts", _fake_write_pr_ready_artifacts) + + result = mod.write_evolution_pr_ready_artifacts( + output_dir=tmp_path, + metrics=metrics, + baseline_skill_path=tmp_path / "baseline.md", + evolved_skill_path=tmp_path / "evolved.md", + ) + + assert result["pr_draft_md"] == tmp_path / "pr_draft.md" + assert called["evolved_skill_path"] == tmp_path / "evolved.md" + + + +def test_write_github_pr_artifacts_delegates_to_report_module(tmp_path, monkeypatch): + metrics = {"skill_name": "github-code-review"} + called = {} + + def _fake_write_github_pr_artifacts(**kwargs): + called.update(kwargs) + return {"github_pr_body_md": tmp_path / "github_pr_body.md"} + + monkeypatch.setattr(mod, "write_github_pr_artifacts", _fake_write_github_pr_artifacts) + + result = mod.write_evolution_github_pr_artifacts( + output_dir=tmp_path, + metrics=metrics, + baseline_skill_path=tmp_path / "baseline.md", + evolved_skill_path=tmp_path / "evolved.md", + report_path=tmp_path / "report.md", + summary_path=tmp_path / "summary.json", + diff_summary_path=tmp_path / "diff_summary.md", + review_checklist_path=tmp_path / "review_checklist.md", + ) + + assert result["github_pr_body_md"] == tmp_path / "github_pr_body.md" + assert called["report_path"] == tmp_path / "report.md" + assert called["review_checklist_path"] == tmp_path / "review_checklist.md" + + + +def test_write_git_pr_automation_artifacts_delegates_to_automation_module(tmp_path, monkeypatch): + metrics = {"skill_name": "github-code-review"} + called = {} + + def _fake_write_git_pr_automation_artifacts(**kwargs): + called.update(kwargs) + return {"git_apply_plan_sh": tmp_path / "git_apply_plan.sh"} + + monkeypatch.setattr(mod, "write_git_pr_automation_artifacts", _fake_write_git_pr_automation_artifacts) + + result = mod.write_evolution_git_pr_automation_artifacts( + output_dir=tmp_path, + metrics=metrics, + hermes_repo=tmp_path / "hermes-agent", + skill_relpath="github/github-code-review/SKILL.md", + evolved_skill_text="# evolved\n", + github_pr_body_path=tmp_path / "github_pr_body.md", + ) + + assert result["git_apply_plan_sh"] == tmp_path / "git_apply_plan.sh" + assert called["skill_relpath"] == "github/github-code-review/SKILL.md" + assert called["github_pr_body_path"] == tmp_path / "github_pr_body.md" + + + +def test_execute_evolution_git_pr_automation_delegates_to_execution_module(tmp_path, monkeypatch): + called = {} + + def _fake_execute_git_pr_automation(**kwargs): + called.update(kwargs) + return {"git": {"steps": []}, "pr": None} + + monkeypatch.setattr(mod, "execute_git_pr_automation", _fake_execute_git_pr_automation) + + result = mod.execute_evolution_git_pr_automation( + git_apply_plan={"hermes_repo": tmp_path / "hermes-agent"}, + gh_pr_create_command="gh pr create --draft", + execute_push=True, + execute_pr=False, + ) + + assert result["git"]["steps"] == [] + assert called["execute_push"] is True + assert called["execute_pr"] is False + + +def test_git_apply_is_skipped_when_tblite_gate_fails(tmp_path, monkeypatch): + monkeypatch.setattr(mod, "load_default_env_files", lambda: []) + monkeypatch.setattr(mod, "resolve_runtime_model_settings", lambda **kwargs: ( + kwargs["optimizer_model"], + kwargs["eval_model"], + {}, + )) + monkeypatch.setattr( 
+ mod, + "find_skill", + lambda skill_name, hermes_repo: tmp_path / "hermes-agent" / "skills" / skill_name / "SKILL.md", + ) + monkeypatch.setattr(mod, "load_skill", lambda path: { + "raw": "---\nname: dogfood\ndescription: test\n---\n\n# Body", + "body": "# Body", + "frontmatter": {"name": "dogfood", "description": "test"}, + "name": "dogfood", + "description": "test", + }) + + dataset = sample_dataset() + monkeypatch.setattr(mod.GoldenDatasetLoader, "load", lambda path: dataset) + + class FakeValidator: + def validate_all(self, artifact_text, artifact_type, baseline_text=None): + return [SimpleNamespace(passed=True, constraint_name="ok", message="ok")] + + monkeypatch.setattr(mod, "ConstraintValidator", lambda config: FakeValidator()) + monkeypatch.setattr(mod.dspy, "LM", lambda model: object()) + monkeypatch.setattr(mod.dspy, "configure", lambda **kwargs: None) + monkeypatch.setattr(mod, "SkillModule", lambda text: FakeModule(text)) + monkeypatch.setattr(mod, "optimize_skill_module", lambda **kwargs: ("GEPA", FakeOptimizedModule("# Evolved"))) + monkeypatch.setattr(mod, "evaluate_holdout", lambda **kwargs: ([0.4, 0.4], [0.9, 0.9])) + monkeypatch.setattr( + mod, + "maybe_run_tblite_gate", + lambda **kwargs: SimpleNamespace( + passed=False, + baseline_pass_rate=0.6, + evolved_pass_rate=0.4, + delta=-0.2, + threshold=0.02, + summary="TBLite fast regression detected", + mode="fast", + task_filter="broken-python", + base_config_path=tmp_path / "local.yaml", + ), + ) + monkeypatch.setattr(mod, "write_evolution_report_artifacts", lambda **kwargs: { + "report_md": tmp_path / "report.md", + "summary_json": tmp_path / "summary.json", + }) + monkeypatch.setattr(mod, "write_evolution_pr_ready_artifacts", lambda **kwargs: { + "pr_draft_md": tmp_path / "pr_draft.md", + "review_checklist_md": tmp_path / "review_checklist.md", + "diff_summary_md": tmp_path / "diff_summary.md", + }) + monkeypatch.setattr(mod, "write_evolution_github_pr_artifacts", lambda **kwargs: { + "github_pr_body_md": tmp_path / "github_pr_body.md", + "gh_pr_create_command_txt": tmp_path / "gh_pr_create_command.txt", + }) + monkeypatch.setattr(mod, "write_evolution_git_pr_automation_artifacts", lambda **kwargs: { + "candidate_skill_file": tmp_path / "candidate_skill_patch.md", + "git_apply_plan_sh": tmp_path / "git_apply_plan.sh", + "git_apply_plan_md": tmp_path / "git_apply_plan.md", + "gh_pr_create_after_push_txt": None, + }) + + execute_calls = [] + monkeypatch.setattr(mod, "execute_evolution_git_pr_automation", lambda **kwargs: execute_calls.append(kwargs)) + + mod.evolve( + skill_name="dogfood", + eval_source="golden", + dataset_path=str(tmp_path / "dataset"), + hermes_repo=str(tmp_path / "hermes-agent"), + eval_backend="hermes", + run_tblite=True, + execute_git_apply=True, + execute_push=True, + execute_pr=False, + ) + + assert execute_calls == [] + + +def test_load_env_file_sets_missing_values_without_overwriting_existing(tmp_path, monkeypatch): + env_path = tmp_path / ".env" + env_path.write_text( + "# comment\n" + "OPENAI_API_KEY=loaded-key\n" + "export OPENAI_BASE_URL='https://example.test/v1'\n" + ) + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.setenv("OPENAI_BASE_URL", "https://keep-me") + + loaded = mod.load_env_file(env_path) + + assert loaded == {"OPENAI_API_KEY": "loaded-key"} + assert os.environ["OPENAI_API_KEY"] == "loaded-key" + assert os.environ["OPENAI_BASE_URL"] == "https://keep-me" + + + +def test_load_default_env_files_prefers_home_hermes_env(tmp_path, monkeypatch): + hermes_home = 
tmp_path / ".hermes" + hermes_home.mkdir() + env_path = hermes_home / ".env" + env_path.write_text("OPENAI_API_KEY=from-hermes-home\n") + monkeypatch.setattr(mod.Path, "home", lambda: tmp_path) + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + + loaded_paths = mod.load_default_env_files() + + assert env_path in loaded_paths + assert os.environ["OPENAI_API_KEY"] == "from-hermes-home" + + + +def test_resolve_runtime_model_settings_uses_custom_hermes_config_defaults(tmp_path, monkeypatch): + config_path = tmp_path / "config.yaml" + config_path.write_text( + "model:\n" + " provider: custom\n" + " default: gpt-5.4\n" + " base_url: https://custom.example/v1\n" + ) + monkeypatch.delenv("OPENAI_BASE_URL", raising=False) + monkeypatch.delenv("OPENAI_API_BASE", raising=False) + + optimizer_model, eval_model, applied_env = mod.resolve_runtime_model_settings( + optimizer_model="openai/gpt-4.1", + eval_model="openai/gpt-4.1-mini", + config_path=config_path, + ) + + assert optimizer_model == "gpt-5.4" + assert eval_model == "gpt-5.4" + assert applied_env == { + "OPENAI_BASE_URL": "https://custom.example/v1", + "OPENAI_API_BASE": "https://custom.example/v1", + } + assert os.environ["OPENAI_BASE_URL"] == "https://custom.example/v1" + assert os.environ["OPENAI_API_BASE"] == "https://custom.example/v1" + + + +def test_resolve_runtime_model_settings_preserves_explicit_models_and_env(tmp_path, monkeypatch): + config_path = tmp_path / "config.yaml" + config_path.write_text( + "model:\n" + " provider: custom\n" + " default: gpt-5.4\n" + " base_url: https://custom.example/v1\n" + ) + monkeypatch.setenv("OPENAI_BASE_URL", "https://keep.example/v1") + monkeypatch.delenv("OPENAI_API_BASE", raising=False) + + optimizer_model, eval_model, applied_env = mod.resolve_runtime_model_settings( + optimizer_model="anthropic/claude-sonnet-4.6", + eval_model="google/gemini-3-flash", + config_path=config_path, + ) + + assert optimizer_model == "anthropic/claude-sonnet-4.6" + assert eval_model == "google/gemini-3-flash" + assert applied_env == {"OPENAI_API_BASE": "https://custom.example/v1"} + assert os.environ["OPENAI_BASE_URL"] == "https://keep.example/v1" + assert os.environ["OPENAI_API_BASE"] == "https://custom.example/v1" + + + +def test_normalized_reflection_lm_preserves_list_shape_for_none_outputs(): + lm = mod.NormalizedReflectionLM(lambda prompt: [None]) + + assert lm("prompt") == [""] + + + +def test_normalized_reflection_lm_wraps_scalar_outputs_as_single_item_list(): + assert mod.NormalizedReflectionLM(lambda prompt: "hello")("prompt") == ["hello"] + assert mod.NormalizedReflectionLM(lambda prompt: {"text": "hello"})("prompt") == [{"text": "hello"}] + + + +def test_create_gepa_optimizer_uses_legacy_max_steps_when_supported(monkeypatch): + called = {} + + class FakeLegacyGEPA: + def __init__(self, **kwargs): + called.update(kwargs) + + monkeypatch.setattr(mod.dspy, "GEPA", FakeLegacyGEPA) + monkeypatch.setattr(mod.inspect, "signature", lambda obj: SimpleNamespace(parameters={"metric": object(), "max_steps": object()})) + + optimizer = mod.create_gepa_optimizer(metric="metric", iterations=7) + + assert isinstance(optimizer, FakeLegacyGEPA) + assert called == {"metric": "metric", "max_steps": 7} + + + +def test_create_gepa_optimizer_adapts_to_new_signature(monkeypatch): + called = {} + + class FakeModernGEPA: + def __init__(self, **kwargs): + called.update(kwargs) + + class FakeLM: + def __init__(self, model): + self.model = model + + def __call__(self, prompt): + return ["normalized-text"] + + 
monkeypatch.setattr(mod.dspy, "GEPA", FakeModernGEPA) + monkeypatch.setattr(mod.dspy, "LM", FakeLM) + monkeypatch.setattr( + mod.inspect, + "signature", + lambda obj: SimpleNamespace(parameters={ + "metric": object(), + "reflection_lm": object(), + "max_full_evals": object(), + }), + ) + + optimizer = mod.create_gepa_optimizer(metric="metric", iterations=3, optimizer_model="openai/gpt-4.1") + + assert isinstance(optimizer, FakeModernGEPA) + assert called["metric"] == "metric" + assert called["max_full_evals"] == 3 + assert called["reflection_lm"]("prompt") == ["normalized-text"] + + + +def test_create_gepa_optimizer_wraps_legacy_metric_for_new_gepa(monkeypatch): + called = {} + + class FakeModernGEPA: + def __init__(self, **kwargs): + called.update(kwargs) + + def _metric(example, prediction, trace=None): + return 0.42 + + class FakeLM: + def __init__(self, model): + self.model = model + + def __call__(self, prompt): + return [f"lm::{self.model}"] + + monkeypatch.setattr(mod.dspy, "GEPA", FakeModernGEPA) + monkeypatch.setattr(mod.dspy, "LM", FakeLM) + + def _fake_signature(obj): + if obj is FakeModernGEPA: + return SimpleNamespace(parameters={ + "metric": object(), + "reflection_lm": object(), + "max_full_evals": object(), + }) + if obj is _metric: + return SimpleNamespace(parameters={"example": object(), "prediction": object(), "trace": object()}) + raise AssertionError(f"unexpected object: {obj}") + + monkeypatch.setattr(mod.inspect, "signature", _fake_signature) + + mod.create_gepa_optimizer(metric=_metric, iterations=2, optimizer_model="openai/gpt-4.1") + + assert called["reflection_lm"]("prompt") == ["lm::openai/gpt-4.1"] + assert called["max_full_evals"] == 2 + assert called["metric"]("gold", "pred", "trace", "pred_name", "pred_trace") == 0.42 + + + +def test_optimize_skill_module_falls_back_to_miprov2_when_gepa_fails(monkeypatch): + calls = [] + + def _fake_create_gepa_optimizer(**kwargs): + raise TypeError("old/new API mismatch") + + class FakeMIPRO: + def __init__(self, **kwargs): + calls.append(("mipro_init", kwargs)) + + def compile(self, baseline_module, **kwargs): + calls.append(("mipro_compile", kwargs)) + return "optimized-module" + + monkeypatch.setattr(mod, "create_gepa_optimizer", _fake_create_gepa_optimizer) + monkeypatch.setattr(mod.dspy, "MIPROv2", FakeMIPRO) + + optimizer_name, optimized = mod.optimize_skill_module( + baseline_module="baseline", + trainset=["train"], + valset=["val"], + iterations=2, + metric="metric", + ) + + assert optimizer_name == "MIPROv2" + assert optimized == "optimized-module" + assert calls == [ + ("mipro_init", {"metric": "metric", "auto": "light"}), + ("mipro_compile", {"trainset": ["train"], "valset": ["val"]}), + ] + + +def test_validate_skill_constraints_uses_full_skill_text_for_structure(monkeypatch): + calls = [] + + class FakeValidator: + def validate_all(self, artifact_text, artifact_type, baseline_text=None): + calls.append((artifact_text, artifact_type, baseline_text)) + return [SimpleNamespace(passed=True, constraint_name="skill_structure", message="ok")] + + result = mod.validate_skill_constraints( + validator=FakeValidator(), + full_skill_text="---\nname: dogfood\ndescription: d\n---\n\n# Body", + baseline_full_text="---\nname: base\ndescription: d\n---\n\n# Base", + ) + + assert result[0].passed is True + assert calls == [( + "---\nname: dogfood\ndescription: d\n---\n\n# Body", + "skill", + "---\nname: base\ndescription: d\n---\n\n# Base", + )]
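+
+
+# Hedged usage sketch (not executed by the suite): the orchestration under
+# test is normally driven end-to-end like this. The repo path and skill name
+# are illustrative assumptions, and dry_run=True returns before the dataset
+# build and optimizer setup, so no model is ever called.
+#
+#     from evolution.skills.evolve_skill import evolve
+#     evolve(
+#         skill_name="github-code-review",
+#         iterations=2,
+#         eval_source="synthetic",
+#         eval_backend="dspy",
+#         hermes_repo="~/.hermes/hermes-agent",
+#         dry_run=True,
+#     )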