# Scheduled/manual workflow that evaluates every agent exposing
# apps/<agent>/.foundry/eval-config.yaml and files drift issues.
name: agent-eval-continuous

permissions:
  contents: read
  issues: write

# One run per selected agent (or per "all"); queued runs wait instead of
# cancelling an evaluation that is already in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.inputs.agent || 'all' }}
  cancel-in-progress: false

on:
  schedule:
    - cron: '0 6 * * *'
  workflow_dispatch:
    inputs:
      agent:
        description: Optional agent name; leave empty for all discovered agents
        required: false
        default: ''
      dry_run:
        description: Run evaluations without creating drift issues
        type: boolean
        required: false
        default: false

jobs:
  discover-agents:
    name: discover eval scope
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.discover.outputs.matrix }}
      has-agents: ${{ steps.discover.outputs.has-agents }}
    steps:
      - uses: actions/checkout@v4
      - name: Build agent matrix
        id: discover
        env:
          SELECTED_AGENT: ${{ github.event.inputs.agent || '' }}
        run: |
          python - <<'PY'
          import glob
          import json
          import os

          selected = os.environ.get("SELECTED_AGENT", "").strip()
          discovered = []
          for config_path in sorted(glob.glob("apps/*/.foundry/eval-config.yaml")):
              agent_root = config_path.replace("/.foundry/eval-config.yaml", "")
              agent_name = agent_root.split("/")[-1]
              if selected and agent_name != selected:
                  continue
              discovered.append({"name": agent_name, "root": agent_root})

          matrix = json.dumps({"include": discovered}, separators=(",", ":"))
          with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as output:
              print(f"matrix={matrix}", file=output)
              print(f"has-agents={str(bool(discovered)).lower()}", file=output)
          PY

  monitor:
    name: continuous eval (${{ matrix.name }})
    needs: discover-agents
    # An empty matrix is invalid, so skip the job when nothing was discovered.
    if: needs.discover-agents.outputs.has-agents == 'true'
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix: ${{ fromJson(needs.discover-agents.outputs.matrix) }}
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.13'
      - name: Set up uv
        uses: astral-sh/setup-uv@v5
      - name: Install evaluation runtime
        run: |
          uv pip install --system -e ./lib/src
      - name: Run continuous evaluation monitor
        env:
          AGENT_ROOT: ${{ matrix.root }}
          # Passed through the environment (like SELECTED_AGENT above) so the
          # value is never spliced into the shell script text.
          AGENT_NAME: ${{ matrix.name }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
          PROJECT_ENDPOINT: ${{ secrets.FOUNDRY_PROJECT_ENDPOINT || '' }}
          PROJECT_NAME: ${{ secrets.FOUNDRY_PROJECT_NAME || '' }}
          FOUNDRY_PROJECT_ENDPOINT: ${{ secrets.FOUNDRY_PROJECT_ENDPOINT || '' }}
          FOUNDRY_PROJECT_NAME: ${{ secrets.FOUNDRY_PROJECT_NAME || '' }}
        run: |
          set -euo pipefail
          timestamp="$(date -u +%Y%m%dT%H%M%SZ)"
          results_dir="$AGENT_ROOT/.foundry/results"
          result_path="$results_dir/run-$timestamp.json"
          log_path="$results_dir/run-$timestamp.log"
          state_path="$results_dir/.drift_state.json"
          mkdir -p "$results_dir"

          issue_args="--create-issue"
          if [ "$DRY_RUN" = "true" ]; then
            issue_args="--dry-run"
          fi

          # GITHUB_REPOSITORY is set by the runner for every step.
          python scripts/ci/continuous_eval_monitor.py \
            --agent-root "$AGENT_ROOT" \
            --run-name "continuous-$timestamp-$AGENT_NAME" \
            --write-result "$result_path" \
            --write-log "$log_path" \
            --state-path "$state_path" \
            --repo "$GITHUB_REPOSITORY" \
            --github-token "$GITHUB_TOKEN" \
            $issue_args
      - name: Upload evaluation artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-${{ matrix.name }}-${{ github.run_id }}
          path: |
            ${{ matrix.root }}/.foundry/results/run-*.json
            ${{ matrix.root }}/.foundry/results/run-*.log
            ${{ matrix.root }}/.foundry/results/.drift_state.json
          # Everything above lives under the dot-directory .foundry/, which
          # upload-artifact@v4 treats as hidden and skips unless enabled.
          include-hidden-files: true
          if-no-files-found: warn
a/docs/architecture/adrs/adr-017-deployment-strategy.md +++ b/docs/architecture/adrs/adr-017-deployment-strategy.md @@ -176,13 +176,21 @@ Required repository secrets: - `AZURE_TENANT_ID` — Azure AD tenant - `AZURE_SUBSCRIPTION_ID` — Target subscription +# ADR-017: Deployment Strategy - azd Provisioning + Flux CD GitOps + +**Status**: Accepted (Revised) ### Evaluation Workflow Integration (Amended: 2026-04) -ADR-028 adds evaluation evidence to PR and deployment governance without changing the deployment source of truth. The current evaluation workflow is `.github/workflows/eval-advisory.yml`, whose workflow name is `agent-eval-advisory`. It discovers the pilot evaluation scope, runs `scripts/ci/run_agent_evaluation.py` for changed pilot agents, writes normalized `.foundry-results/*.json`, publishes job summaries, and uploads evaluation artifacts. +ADR-028 integrates evaluation evidence into PR and deployment governance while preserving the azd + Flux deployment source of truth. The repository includes an advisory matrix workflow `.github/workflows/eval-advisory.yml` (`agent-eval-advisory`) that runs evaluation for changed pilot agents, publishes summaries, and uploads artifacts for reviewer evidence. Separately, `.github/workflows/eval-continuous.yml` (`agent-eval-continuous`) runs daily by default at `0 6 * * *` UTC to detect quality drift across agents that include `.foundry/eval-config.yaml`. + +Key controls for evaluation workflows: -`agent-eval-advisory` is intentionally advisory and non-required. It must remain outside required branch-protection checks until `docs/governance/README.md` is explicitly revised to promote it. There is no `eval-gate.yml` or `eval-continuous.yml` workflow in the current repository snapshot, so deployment governance must reference the existing advisory workflow rather than stale gate names. +- Both `agent-eval-advisory` and `agent-eval-continuous` are advisory and non-required by default. 
They must remain outside required branch-protection checks unless `docs/governance/README.md` is explicitly updated to promote them. +- `agent-eval-continuous` discovers agents by scanning `apps/*/.foundry/eval-config.yaml` and runs the evaluation monitor in a matrix with `fail-fast: false`. +- The continuous workflow writes run artifacts to per-agent `.foundry/results/` directories in the workflow workspace and uploads them as workflow artifacts; it does not commit result or baseline files back to the repository. +- When drift is detected, the continuous workflow files an issue with labels `evaluation` and `drift:{severity}` (for example `drift:critical`) unless `dry_run` is set. The workflow guards against duplicate open issues by searching existing open issues for a stable drift fingerprint. -PR reviewers use evaluation artifacts as architecture and quality evidence when prompts, datasets, routing, or evaluation framework code changes. Deployment workflows remain governed by the azd + Flux path in this ADR; evaluation evidence can block a PR by human review policy, but it does not independently deploy, roll back, rename workflows, or bypass `lint` / `test` branch-protection baselines. +These workflows are monitoring and advisory only: they do not perform automatic remediation, rollbacks, or code changes. Deployment governance continues to be enforced by azd + Flux and the `lint`/`test` baseline described in `docs/governance/README.md`. 
@dataclass(frozen=True)
class GitHubIssueClient:
    """Small GitHub Issues REST client used by the CI monitor.

    Every call goes to ``https://api.github.com`` with the configured bearer
    token and a 30 second timeout.
    """

    repo: str  # "owner/repo" slug used in API paths and the search query
    token: str  # bearer token sent in the Authorization header

    def __repr__(self) -> str:
        """Return a representation that never contains the token.

        The dataclass-generated ``__repr__`` prints every field, which would
        put the credential into any log line or traceback showing the client.
        """
        return f"{type(self).__name__}(repo={self.repo!r}, token='***')"

    def search_open_drift_issue(self, fingerprint: str) -> bool:
        """Return True when the issue search reports at least one open match.

        Args:
            fingerprint: Text searched for as the quoted phrase
                ``"Fingerprint: <fingerprint>"`` within this repository.
        """
        query = f'repo:{self.repo} state:open "Fingerprint: {fingerprint}"'
        encoded_query = urllib.parse.quote(query)
        payload = self._request_json("GET", f"/search/issues?q={encoded_query}")
        return int(payload.get("total_count", 0)) > 0

    def create_issue(self, *, title: str, body: str, labels: list[str]) -> str | None:
        """Create an issue and return its ``html_url``.

        Returns:
            The URL as a string, or ``None`` when the response has no
            ``html_url`` value.
        """
        payload = self._request_json(
            "POST",
            f"/repos/{self.repo}/issues",
            body={"title": title, "body": body, "labels": labels},
        )
        html_url = payload.get("html_url")
        return str(html_url) if html_url else None

    def _request_json(
        self,
        method: str,
        path: str,
        *,
        body: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """Send one API request and return the decoded JSON response.

        Args:
            method: HTTP method name.
            path: API path (including any query string) appended to the
                ``https://api.github.com`` base URL.
            body: Optional JSON-serialisable request payload.

        Raises:
            urllib.error.URLError: Propagated from ``urlopen`` on transport or
                HTTP errors.
            json.JSONDecodeError: If the response body is not valid JSON.
        """
        data = None
        if body is not None:
            data = json.dumps(body).encode("utf-8")
        request = urllib.request.Request(
            f"https://api.github.com{path}",
            data=data,
            method=method,
            headers={
                "Accept": "application/vnd.github+json",
                "Authorization": f"Bearer {self.token}",
                "Content-Type": "application/json",
                "X-GitHub-Api-Version": "2022-11-28",
            },
        )
        with urllib.request.urlopen(request, timeout=30) as response:  # noqa: S310
            return json.loads(response.read().decode("utf-8"))


def parse_args() -> argparse.Namespace:
    """Parse monitor command-line arguments.

    ``--github-token`` falls back to the ``GITHUB_TOKEN`` environment variable
    so the credential does not have to appear on the command line. An explicit
    flag still takes precedence.
    """
    import os  # local import: only needed for the token fallback

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--agent-root", required=True, help="Path to the agent app root")
    parser.add_argument("--run-name", default="continuous", help="Evaluation run name")
    parser.add_argument("--write-result", help="Path to write normalized result JSON")
    parser.add_argument("--write-log", help="Path to write monitor summary log")
    parser.add_argument("--state-path", help="Optional drift-state JSON path")
    parser.add_argument("--repo", help="GitHub owner/repo for issue creation")
    parser.add_argument(
        "--github-token",
        # An empty variable is treated the same as an unset one.
        default=os.environ.get("GITHUB_TOKEN") or None,
        help="GitHub token for issue creation (defaults to $GITHUB_TOKEN)",
    )
    parser.add_argument("--create-issue", action="store_true", help="Create drift issues")
    parser.add_argument("--dry-run", action="store_true", help="Suppress issue creation")
    parser.add_argument(
        "--update-baseline",
        action="store_true",
        help="Update configured baseline.json when no drift is detected",
    )
    return parser.parse_args()
async def run_monitor(args: argparse.Namespace) -> int:
    """Run one continuous evaluation monitor cycle.

    Runs the configured evaluation for the agent under ``args.agent_root``,
    compares it with the loaded baseline, and then, depending on the flags:
    writes the result JSON, updates the drift-state file, refreshes the
    baseline (only when no drift was reported), files a deduplicated drift
    issue, and writes the summary log. The event payload is always printed.

    Returns:
        Always ``0``; detected drift does not change the exit status.
    """

    agent_root = Path(args.agent_root).resolve()
    loader = DatasetLoader(agent_root / ".foundry")
    runner = ConfiguredEvaluationRunner(loader=loader, prefer_foundry=True)
    result = await runner.run(run_name=str(args.run_name))
    baseline = loader.load_baseline(runner.config)
    drift_report = DriftDetector(runner.config).detect(
        result,
        baseline=baseline,
        run_name=str(args.run_name),
    )
    payload = _build_event_payload(runner.config.agent_name, args.run_name, result, drift_report)

    if args.write_result:
        _write_json(Path(args.write_result), payload)
    if args.state_path:
        _write_state(Path(args.state_path), runner.config.agent_name, drift_report)
    if args.update_baseline and drift_report is None and runner.config.baseline_path:
        _update_baseline(loader, runner.config, result)

    issue_url = None
    # --dry-run wins over --create-issue when both are given.
    if drift_report is not None and args.create_issue and not args.dry_run:
        if not args.repo or not args.github_token:
            print("Skipping drift issue creation: repo or GitHub token not provided")
        else:
            issue_url = create_deduped_issue(
                repo=args.repo,
                token=args.github_token,
                agent_name=runner.config.agent_name,
                report=drift_report,
                result_path=args.write_result,
            )

    if args.write_log:
        _write_log(Path(args.write_log), runner.config.agent_name, payload, issue_url)
    print(json.dumps(payload, indent=2, sort_keys=True))
    return 0


def _build_event_payload(
    agent_name: str,
    run_name: str,
    result: Any,
    drift_report: DriftReport | None,
) -> dict[str, Any]:
    """Build the JSON-ready evaluation event for one run.

    ``drift_detected`` is true exactly when *drift_report* is not ``None``.
    Metrics are normalised through ``_float_metrics`` and the event is dumped
    in JSON mode using field aliases.
    """
    event = EvaluationResultEvent(
        agent_name=agent_name,
        run_name=run_name,
        backend=result.backend,
        status=result.status,
        metrics=_float_metrics(result.metrics),
        eval_score=result.score,
        eval_baseline_id=result.baseline_id,
        baseline_source=result.baseline_source,
        drift_detected=drift_report is not None,
        drift_report=drift_report,
        details=result.details,
    )
    return event.model_dump(mode="json", by_alias=True)


def _float_metrics(metrics: dict[str, Any]) -> dict[str, float]:
    """Return the subset of *metrics* that converts to a finite float.

    Values that cannot be converted (``None``, non-numeric text), integers too
    large for a float, and NaN or infinite results are dropped, not raised.
    """
    normalized: dict[str, float] = {}
    for key, value in metrics.items():
        try:
            numeric_value = float(value)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: float(10**400) raises instead of returning inf.
            continue
        if math.isfinite(numeric_value):
            normalized[key] = numeric_value
    return normalized
def _write_json(path: Path, payload: dict[str, Any]) -> None:
    """Write *payload* as indented, key-sorted UTF-8 JSON, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")


def _load_json_object(path: Path) -> dict[str, Any]:
    """Return the JSON object stored at *path*, or ``{}`` when it is unusable.

    A missing file, undecodable or invalid JSON, and valid JSON that is not an
    object (for example a list) all yield an empty dict so callers can always
    treat the result as a mapping.
    """
    if not path.exists():
        return {}
    try:
        loaded = json.loads(path.read_text(encoding="utf-8"))
    except ValueError:  # covers json.JSONDecodeError and UnicodeDecodeError
        return {}
    return loaded if isinstance(loaded, dict) else {}


def _write_state(path: Path, agent_name: str, report: DriftReport | None) -> None:
    """Record the latest drift signal for *agent_name* in the state file.

    Entries for other agents already in the file are preserved. With no
    report, the counter is reset to ``0`` and ``last_signal_at`` to ``None``;
    otherwise the report's ``consecutive_failures`` and the current UTC time
    are stored.
    """
    state = _load_json_object(path)
    state[agent_name] = {
        "consecutive_failures": report.consecutive_failures if report else 0,
        "last_signal_at": datetime.now(timezone.utc).isoformat() if report else None,
    }
    _write_json(path, state)


def _update_baseline(loader: DatasetLoader, config: Any, result: Any) -> None:
    """Rewrite the configured baseline file from the current run's metrics.

    ``created_at`` is carried over from an existing baseline when present.
    The payload is written to a sibling ``.tmp`` file first and then moved
    over the baseline path.
    """
    baseline_path = loader.resolve_path(config.baseline_path)
    existing_payload = _load_json_object(baseline_path)
    now = datetime.now(timezone.utc).isoformat()
    baseline_payload = {
        "agent_name": config.agent_name,
        "metrics": _float_metrics(result.metrics),
        "baseline_id": config.resolved_baseline_id,
        "dataset_version": result.details.get("dataset_version", "seed"),
        "created_at": existing_payload.get("created_at", now),
        "updated_at": now,
    }
    temporary_path = baseline_path.with_suffix(".tmp")
    _write_json(temporary_path, baseline_payload)
    temporary_path.replace(baseline_path)
def create_deduped_issue(
    *,
    repo: str,
    token: str,
    agent_name: str,
    report: DriftReport,
    result_path: str | None,
    client: GitHubIssueClient | None = None,
) -> str | None:
    """Create one drift issue per open fingerprint.

    The deduplication fingerprint is scoped to the agent and severity and
    lists the breached thresholds, but leaves out drift magnitudes. Magnitudes
    change from run to run, so including them would give an ongoing drift a
    new fingerprint, and a new issue, on every run. The agent scope matters
    because the search covers the whole repository.

    Args:
        repo: ``owner/repo`` slug, used when *client* is not supplied.
        token: API token, used when *client* is not supplied.
        agent_name: Agent the report belongs to.
        report: Drift report to file.
        result_path: Optional result file path mentioned in the issue body.
        client: Optional pre-built client (used by tests).

    Returns:
        The created issue URL, or ``None`` when a matching open issue exists
        or the API call failed. Failures are printed, never raised.
    """

    severity = report.severity.value
    fingerprint = _report_fingerprint(
        report,
        scope=(agent_name, severity),
        include_magnitudes=False,
    )
    github_client = client or GitHubIssueClient(repo=repo, token=token)
    try:
        if github_client.search_open_drift_issue(fingerprint):
            print(f"Found existing open drift issue for fingerprint {fingerprint}")
            return None
        issue_url = github_client.create_issue(
            title=f"[DRIFT:{severity.upper()}] {agent_name} evaluation drift",
            body=_issue_body(agent_name, report, fingerprint, result_path),
            labels=["evaluation", f"drift:{severity}"],
        )
        if issue_url:
            print(f"Created drift issue: {issue_url}")
        return issue_url
    except (urllib.error.URLError, TimeoutError, OSError, ValueError) as exc:
        # ValueError covers a response body that is not valid JSON; issue
        # filing is best-effort and must not fail the monitor run.
        print(f"Failed to create drift issue: {exc}")
        return None


def _issue_body(
    agent_name: str,
    report: DriftReport,
    fingerprint: str,
    result_path: str | None,
) -> str:
    """Render the Markdown issue body for a drift report.

    The ``Fingerprint:`` line is what later runs search for to detect an
    already-open issue.
    """
    lines = [
        f"Agent: **{agent_name}**",
        f"Severity: **{report.severity.value}**",
        f"Baseline: `{report.baseline_id or 'n/a'}`",
        f"Consecutive failures: `{report.consecutive_failures}`",
        f"Fingerprint: `{fingerprint}`",
        "",
        "Breached metrics:",
    ]
    lines.extend(f"- `{item}`" for item in report.breached_thresholds)
    lines.extend(["", "Drift magnitude:"])
    lines.extend(
        f"- `{key}`: `{value}`" for key, value in sorted(report.drift_metrics.items())
    )
    if result_path:
        lines.extend(["", f"Evaluation result path: `{result_path}`"])
    return "\n".join(lines)


def _report_fingerprint(
    report: DriftReport,
    *,
    scope: tuple[str, ...] = (),
    include_magnitudes: bool = True,
) -> str:
    """Return a ``|``-joined fingerprint describing a drift report.

    Args:
        report: Drift report to summarise.
        scope: Optional leading parts (for example agent name and severity).
        include_magnitudes: When true (the default), append
            ``metric:rounded_value`` for every drift metric. When false, the
            fingerprint stays the same while the same thresholds are
            breached; metric names are used only if no thresholds are listed.

    Returns:
        The fingerprint, or ``"no-drift-details"`` when there is nothing to
        join.
    """
    parts = [*scope, *sorted(report.breached_thresholds)]
    if include_magnitudes:
        parts.extend(
            f"{key}:{round(value, 4)}" for key, value in sorted(report.drift_metrics.items())
        )
    elif not report.breached_thresholds:
        parts.extend(sorted(report.drift_metrics))
    return "|".join(parts) or "no-drift-details"


def _write_log(
    path: Path,
    agent_name: str,
    payload: dict[str, Any],
    issue_url: str | None,
) -> None:
    """Write a ``key=value`` summary of the run, one entry per line.

    ``issue_url`` is included only when an issue was created.
    """
    lines = [
        f"agent={agent_name}",
        f"status={payload.get('status')}",
        f"backend={payload.get('backend')}",
        f"drift_detected={payload.get('drift_detected')}",
    ]
    if issue_url:
        lines.append(f"issue_url={issue_url}")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")


def main() -> int:
    """Run the monitor CLI."""

    return asyncio.run(run_monitor(parse_args()))


if __name__ == "__main__":
    raise SystemExit(main())
"""Tests for the continuous evaluation monitor helper."""

from __future__ import annotations

import json
from pathlib import Path

from holiday_peak_lib.evaluation import EvaluationRunResult
from holiday_peak_lib.evaluation.models import DriftReport, EvalSeverity

from scripts.ci.continuous_eval_monitor import (
    _build_event_payload,
    _float_metrics,
    _report_fingerprint,
    _write_state,
    create_deduped_issue,
)


class FakeGitHubIssueClient:
    """In-memory stand-in for the monitor's GitHub client.

    ``existing`` is the fixed answer returned by the search; every
    ``create_issue`` call is recorded in ``created``.
    """

    def __init__(self, *, existing: bool = False) -> None:
        self.existing = existing
        self.created: list[dict[str, object]] = []

    def search_open_drift_issue(self, _fingerprint: str) -> bool:
        """Return the preconfigured answer; the fingerprint is ignored."""
        return self.existing

    def create_issue(self, *, title: str, body: str, labels: list[str]) -> str:
        """Record the request and return a fixed issue URL."""
        self.created.append({"title": title, "body": body, "labels": labels})
        return "https://github.com/example/repo/issues/1"


def test_float_metrics_skips_non_finite_values() -> None:
    """Numeric strings and ints are kept as floats; NaN and None are dropped."""
    source = {"score": "1.0", "count": 3, "nan": "nan", "bad": None}

    assert _float_metrics(source) == {"score": 1.0, "count": 3.0}


def test_event_payload_marks_no_drift() -> None:
    """A payload built without a drift report has ``drift_detected`` False."""
    result = EvaluationRunResult(
        status="ok",
        backend="local-fallback",
        metrics={"dataset_readiness": 1.0},
        details={"case_count": 2},
        score=1.0,
        baseline_id="catalog-search:baseline",
    )

    payload = _build_event_payload("catalog-search", "unit", result, None)

    assert payload["agent_name"] == "catalog-search"
    assert payload["drift_detected"] is False
    # The score is read back under the dotted key "eval.score".
    assert payload["eval.score"] == 1.0
    assert payload["metrics"] == {"dataset_readiness": 1.0}


def test_write_state_records_drift_signal(tmp_path: Path) -> None:
    """Writing state with a report stores its failure count and a timestamp."""
    report = _drift_report()
    state_path = tmp_path / ".drift_state.json"

    _write_state(state_path, "catalog-search", report)

    state = json.loads(state_path.read_text(encoding="utf-8"))
    assert state["catalog-search"]["consecutive_failures"] == 3
    assert state["catalog-search"]["last_signal_at"]


def test_create_deduped_issue_creates_when_fingerprint_is_new() -> None:
    """With no existing match, one issue is created with severity title and labels."""
    report = _drift_report()
    client = FakeGitHubIssueClient(existing=False)

    issue_url = create_deduped_issue(
        repo="owner/repo",
        token="token",
        agent_name="catalog-search",
        report=report,
        result_path="apps/catalog/.foundry/results/run.json",
        client=client,  # type: ignore[arg-type]
    )

    assert issue_url == "https://github.com/example/repo/issues/1"
    assert client.created
    assert "[DRIFT:CRITICAL] catalog-search evaluation drift" == client.created[0]["title"]
    assert client.created[0]["labels"] == ["evaluation", "drift:critical"]
    assert "Fingerprint:" in str(client.created[0]["body"])


def test_create_deduped_issue_skips_existing_fingerprint() -> None:
    """When the search reports a match, nothing is created and None is returned."""
    report = _drift_report()
    client = FakeGitHubIssueClient(existing=True)

    issue_url = create_deduped_issue(
        repo="owner/repo",
        token="token",
        agent_name="catalog-search",
        report=report,
        result_path=None,
        client=client,  # type: ignore[arg-type]
    )

    assert issue_url is None
    assert client.created == []


def test_report_fingerprint_is_stable() -> None:
    """The fingerprint of the fixture report is pinned to an exact string."""
    assert _report_fingerprint(_drift_report()) == "quality|quality:-0.25"


def _drift_report() -> DriftReport:
    """Build the shared fixture: a critical ``quality`` drift of -0.25."""
    return DriftReport(
        agent_name="catalog-search",
        run_name="unit",
        severity=EvalSeverity.CRITICAL,
        breached_thresholds=["quality"],
        drift_metrics={"quality": -0.25},
        current_metrics={"quality": 0.55},
        baseline_metrics={"quality": 0.8},
        baseline_id="catalog-search:baseline",
        consecutive_failures=3,
    )