From 1f95699ed545192d9c92fb5e820cfeceea1a6cc7 Mon Sep 17 00:00:00 2001 From: Ricardo Cataldi Date: Tue, 9 Jun 2026 13:57:41 -0300 Subject: [PATCH 1/4] ci(evaluation): add continuous drift monitoring workflow --- .github/workflows/eval-continuous.yml | 320 ++++++++++++++++++ .../adrs/adr-017-deployment-strategy.md | 19 +- 2 files changed, 338 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/eval-continuous.yml diff --git a/.github/workflows/eval-continuous.yml b/.github/workflows/eval-continuous.yml new file mode 100644 index 00000000..80481bb1 --- /dev/null +++ b/.github/workflows/eval-continuous.yml @@ -0,0 +1,320 @@ +name: agent-eval-continuous + +permissions: + contents: write + +concurrency: + group: ${{ github.workflow }}-daily + cancel-in-progress: true + +on: + schedule: + - cron: '0 6 * * *' + workflow_dispatch: + inputs: + agent: + description: Agent to evaluate (all or pilot) + type: choice + required: true + default: all + options: + - all + - ecommerce-catalog-search + - search-enrichment-agent + - truth-enrichment + dry_run: + description: Do not create issues or commit results + required: false + default: 'false' + type: boolean + +jobs: + discover-agents: + name: discover agents + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.discover.outputs.matrix }} + has-agents: ${{ steps.discover.outputs.has-agents }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Build agent matrix + id: discover + env: + SELECTED_AGENT: ${{ inputs.agent || 'all' }} + DRY_RUN: ${{ inputs.dry_run || 'false' }} + run: | + python - <<'PY' + import json, os, glob + + pilots = [ + "ecommerce-catalog-search", + "search-enrichment-agent", + "truth-enrichment", + ] + + # Discover all apps with eval-config.yaml under .foundry + roots = [] + for path in glob.glob('apps/*/.foundry/eval-config.yaml'): + agent = path.split('/')[1] + roots.append((agent, f'apps/{agent}')) + + selected = os.environ.get('SELECTED_AGENT', 'all') + dry = 
os.environ.get('DRY_RUN', 'false').lower() == 'true' + + if dry: + names = pilots + elif selected == 'all': + names = [a for a,_ in roots] + else: + names = [selected] + + include = [] + for name, root in roots: + if name in names: + include.append({"name": name, "root": root}) + + matrix = json.dumps({"include": include}, separators=(',', ':')) + with open(os.environ['GITHUB_OUTPUT'], 'a', encoding='utf-8') as out: + print(f"matrix={matrix}", file=out) + print(f"has-agents={str(bool(include)).lower()}", file=out) + PY + + evaluate: + name: continuous eval (${{ matrix.name }}) + if: needs.discover-agents.outputs.has-agents == 'true' + needs: discover-agents + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.discover-agents.outputs.matrix) }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + persist-credentials: true + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.13' + - name: Set up uv + uses: astral-sh/setup-uv@v5 + - name: Install evaluation runtime + run: | + uv pip install --system -e ./lib/src + - name: Run continuous evaluation + id: run_eval + env: + TIMESTAMP: ${{ github.run_id }} + AGENT_ROOT: ${{ matrix.root }} + AGENT_NAME: ${{ matrix.name }} + RESULTS_DIR: ${{ matrix.root }}/.foundry/results + RESULT_PATH: ${{ matrix.root }}/.foundry/results/run-${{ github.run_id }}.json + LOG_PATH: ${{ matrix.root }}/.foundry/results/run-${{ github.run_id }}.log + run: | + mkdir -p "$RESULTS_DIR" + python scripts/ci/run_agent_evaluation.py \ + --agent-root "$AGENT_ROOT" \ + --run-name "run-${TIMESTAMP}" \ + --write-result "$RESULT_PATH" \ + > "$LOG_PATH" || true + echo "result_path=$RESULT_PATH" >> $GITHUB_OUTPUT + echo "log_path=$LOG_PATH" >> $GITHUB_OUTPUT + + - name: Process evaluation result + id: process + uses: actions/github-script@v6 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const fs = require('fs') + const path = process.env.RESULT_PATH + const agent 
= process.env.AGENT_NAME + const run_id = `run-${process.env.TIMESTAMP}` + const dry = process.env.INPUTS_DRY_RUN === 'true' || process.env.DRY_RUN === 'true' || process.env.INPUTS_DRY_RUN === 'True' + if (!fs.existsSync(path)) { + core = require('@actions/core') + core.info(`No result JSON at ${path}`) + return { drift: false } + } + const payload = JSON.parse(fs.readFileSync(path,'utf8')) + const drift = payload.drift_detected === true && payload.drift_report + + // If drift, check for existing open issue with same agent+severity + if (drift && !dry) { + const severity = payload.drift_report.severity + const title = `Evaluation drift: ${agent} (${severity})` + const search = `repo:${context.repo.owner}/${context.repo.repo}+is:open+in:title+"${title}"` + const existing = await github.rest.search.issuesAndPullRequests({ q: search }) + if (existing.data.total_count === 0) { + // Ensure labels exist + const ensureLabel = async (label, color) => { + try { + await github.rest.issues.getLabel({ owner: context.repo.owner, repo: context.repo.repo, name: label }) + } catch (e) { + await github.rest.issues.createLabel({ owner: context.repo.owner, repo: context.repo.repo, name: label, color }) + } + } + await ensureLabel('evaluation','0e8a16') + await ensureLabel(`drift:${severity}`,'b60205') + + const body = [] + body.push(`Agent: **${agent}**`) + body.push(`Severity: **${severity}**`) + body.push('') + body.push('Breached thresholds:') + body.push('') + for (const t of payload.drift_report.breached_thresholds) { + body.push(`- ${t}`) + } + body.push('') + body.push('Drift magnitude:') + for (const [k,v] of Object.entries(payload.drift_report.drift_metrics || {})) { + body.push(`- ${k}: ${v}`) + } + body.push('') + const resultsPath = `${process.env.AGENT_ROOT}/.foundry/results/run-${process.env.TIMESTAMP}.json` + body.push(`Evaluation result artifact: ${context.repo.html_url}/blob/${context.sha}/${resultsPath}`) + + await github.rest.issues.create({ owner: 
context.repo.owner, repo: context.repo.repo, title, body: body.join('\n'), labels: ['evaluation', `drift:${severity}`] }) + } else { + core = require('@actions/core') + core.info('Found existing open issue for this agent/severity; skipping create') + } + } + + return { drift: !!drift } + + - name: Commit results and update baseline + if: steps.process.outputs.drift != 'true' && github.event.inputs.dry_run != 'true' + run: | + set -euo pipefail + AGENT_DIR=${{ matrix.root }} + RESULTS_DIR=${{ matrix.root }}/.foundry/results + RESULT_FILE=${{ matrix.root }}/.foundry/results/run-${{ github.run_id }}.json + BASELINE_FILE=${{ matrix.root }}/.foundry/results/baseline.json + if [ ! -f "$RESULT_FILE" ]; then echo "No result file to commit"; exit 0; fi + # update baseline when no drift + python - <<'PY' + import json, sys, pathlib + p = pathlib.Path(r"""%s""" % "$RESULT_FILE") + if not p.exists(): + sys.exit(0) + payload = json.loads(p.read_text(encoding='utf-8')) + metrics = payload.get('metrics', {}) + baseline = { + 'agent_name': payload.get('agent_name'), + 'metrics': metrics, + 'baseline_id': payload.get('eval.baseline_id') or payload.get('eval_baseline_id'), + 'dataset_version': payload.get('details',{}).get('dataset_version','seed') + } + out_path = pathlib.Path(r"""%s""" % "$BASELINE_FILE") + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(baseline, indent=2, sort_keys=True), encoding='utf-8') + PY + + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + BRANCH=ci/eval-results/${{ matrix.name }}/run-${{ github.run_id }} + git checkout -b "$BRANCH" + git add "$RESULT_FILE" "$BASELINE_FILE" + git commit -m "ci(evaluation): record run results for ${{ matrix.name }} run-${{ github.run_id }}" + git push --set-upstream origin "$BRANCH" + + - name: Upload evaluation artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: eval-${{ matrix.name }}-run-${{ 
github.run_id }} + path: | + ${{ matrix.root }}/.foundry/results/run-${{ github.run_id }}.json + ${{ matrix.root }}/.foundry/results/run-${{ github.run_id }}.log + if-no-files-found: warnname: agent-eval-continuous + +permissions: + contents: read + issues: write + +concurrency: + group: ${{ github.workflow }}-continuous + cancel-in-progress: false + +on: + schedule: + - cron: '0 6 * * *' # daily 06:00 UTC + workflow_dispatch: + inputs: + agent: + description: Optional single agent to run (dry-run) + required: false + default: '' + +jobs: + discover-agents: + name: discover eval scope + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.discover.outputs.matrix }} + has-agents: ${{ steps.discover.outputs.has-agents }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Build agent matrix + id: discover + env: + SELECTED_AGENT: ${{ github.event.inputs.agent || '' }} + run: | + python - <<'PY' + import json, os, glob + pilots = [] + selected = os.environ.get('SELECTED_AGENT','').strip() + pattern = 'apps/*/.foundry/eval-config.yaml' + for path in glob.glob(pattern): + root = path.replace('/.foundry/eval-config.yaml','') + name = root.split('/')[-1] + pilots.append({'name': name, 'root': root}) + if selected: + pilots = [p for p in pilots if p['name'] == selected] + matrix = json.dumps({'include': pilots}, separators=(',',':')) + with open(os.environ['GITHUB_OUTPUT'],'a',encoding='utf-8') as out: + print(f'matrix={matrix}', file=out) + print(f"has-agents={str(bool(pilots)).lower()}", file=out) + PY + + monitor: + name: continuous eval (${{ matrix.name }}) + needs: discover-agents + if: needs.discover-agents.outputs.has-agents == 'true' + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.discover-agents.outputs.matrix) }} + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.13' + - name: Set up uv + uses: astral-sh/setup-uv@v5 + - name: 
Install evaluation runtime + run: | + uv pip install --system -e ./lib/src + - name: Run continuous evaluation monitor + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + RESULT_DIR: ${{ matrix.root }}/.foundry/results + RUN_NAME: ${{ github.run_id }}-continuous-${{ matrix.name }} + PROJECT_ENDPOINT: ${{ secrets.FOUNDRY_PROJECT_ENDPOINT || '' }} + PROJECT_NAME: ${{ secrets.FOUNDRY_PROJECT_NAME || '' }} + run: | + mkdir -p "${{ matrix.root }}/.foundry/results" + python scripts/ci/continuous_eval_monitor.py \ + --agent-root "${{ matrix.root }}" \ + --run-name "$RUN_NAME" \ + --write-result "${{ matrix.root }}/.foundry/results/run-$(date -u +%Y%m%dT%H%M%SZ).json" \ + --repo "${{ github.repository }}" \ + --github-token "$GITHUB_TOKEN" \ + --create-issue diff --git a/docs/architecture/adrs/adr-017-deployment-strategy.md b/docs/architecture/adrs/adr-017-deployment-strategy.md index 4b432473..8c61f6e1 100644 --- a/docs/architecture/adrs/adr-017-deployment-strategy.md +++ b/docs/architecture/adrs/adr-017-deployment-strategy.md @@ -176,14 +176,31 @@ Required repository secrets: - `AZURE_TENANT_ID` — Azure AD tenant - `AZURE_SUBSCRIPTION_ID` — Target subscription +# ADR-017: Deployment Strategy - azd Provisioning + Flux CD GitOps + +**Status**: Accepted (Revised) ### Evaluation Workflow Integration (Amended: 2026-04) ADR-028 adds evaluation evidence to PR and deployment governance without changing the deployment source of truth. The current evaluation workflow is `.github/workflows/eval-advisory.yml`, whose workflow name is `agent-eval-advisory`. It discovers the pilot evaluation scope, runs `scripts/ci/run_agent_evaluation.py` for changed pilot agents, writes normalized `.foundry-results/*.json`, publishes job summaries, and uploads evaluation artifacts. -`agent-eval-advisory` is intentionally advisory and non-required. It must remain outside required branch-protection checks until `docs/governance/README.md` is explicitly revised to promote it. 
There is no `eval-gate.yml` or `eval-continuous.yml` workflow in the current repository snapshot, so deployment governance must reference the existing advisory workflow rather than stale gate names. +`agent-eval-advisory` is intentionally advisory and non-required. It must remain outside required branch-protection checks until `docs/governance/README.md` is explicitly revised to promote it. A new scheduled advisory workflow `eval-continuous.yml` exists to provide daily drift detection and alerting; it is advisory-only and does not block deployments. Deployment governance must reference the advisory workflows rather than treating them as automated gates. PR reviewers use evaluation artifacts as architecture and quality evidence when prompts, datasets, routing, or evaluation framework code changes. Deployment workflows remain governed by the azd + Flux path in this ADR; evaluation evidence can block a PR by human review policy, but it does not independently deploy, roll back, rename workflows, or bypass `lint` / `test` branch-protection baselines. +### Evaluation Workflow Integration (Amended: 2026-04 / 2026-06) + +ADR-028 adds evaluation evidence to PR and deployment governance without changing the deployment source of truth. The existing advisory workflow is `.github/workflows/eval-advisory.yml` (`agent-eval-advisory`): it discovers pilot eval scope, runs `scripts/ci/run_agent_evaluation.py` for changed pilot agents, writes normalized `.foundry-results/*.json`, publishes job summaries, and uploads evaluation artifacts. + +`agent-eval-advisory` remains intentionally advisory and non-required. It must remain outside required branch-protection checks until `docs/governance/README.md` is explicitly revised to promote it. 
In addition to the advisory workflow, the repository now includes a scheduled continuous monitoring workflow `.github/workflows/eval-continuous.yml` that runs daily (default `0 6 * * *`) to detect quality drift across agent pilots and other agents that include `.foundry/eval-config.yaml`. + +Key points about `agent-eval-continuous`: + +- Runs on a daily cron and via `workflow_dispatch` with an `agent` selector and `dry_run` mode. +- Discovers agents by scanning `apps/*/.foundry/eval-config.yaml` and runs `scripts/ci/run_agent_evaluation.py` for each discovered agent in a matrix (fail-fast: false). +- Persists run artifacts under each agent's `.foundry/results/` (e.g., `run-.json`, logs) and can update `baseline.json` when a run passes with no drift. +- When drift is detected, the workflow will open a GitHub issue (labels `evaluation` and `drift:`) unless `dry_run` is true; the workflow guards against duplicate open issues by searching existing issues before creating a new one. + +The scheduled workflow is advisory monitoring only — it records evidence and notifies operators of drift. It does not perform automatic remediation, rollbacks, or code changes. Deployment workflows remain governed by the azd + Flux path in this ADR; evaluation evidence can inform human-driven actions but cannot autonomously change deployment state or bypass `lint` / `test` checks. 
## Consequences ### Positive From 5545e31cfccd3a0afb0d3abff47cbdef40fb0cbe Mon Sep 17 00:00:00 2001 From: Ricardo Cataldi Date: Tue, 9 Jun 2026 13:57:47 -0300 Subject: [PATCH 2/4] ci(evaluation): add continuous drift monitoring workflow and monitor script; update ADR-017 --- scripts/ci/continuous_eval_monitor.py | 212 ++++++++++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 scripts/ci/continuous_eval_monitor.py diff --git a/scripts/ci/continuous_eval_monitor.py b/scripts/ci/continuous_eval_monitor.py new file mode 100644 index 00000000..e74423a1 --- /dev/null +++ b/scripts/ci/continuous_eval_monitor.py @@ -0,0 +1,212 @@ +"""Continuous evaluation monitor: runs configured evaluation, detects drift, persists results, and files issues. + +This is a lightweight orchestration wrapper around `ConfiguredEvaluationRunner` and +`DriftDetector`. It is intended for GitHub Actions and local dry-runs. +""" +from __future__ import annotations + +import argparse +import asyncio +import json +import os +import sys +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +REPO_ROOT = Path(__file__).resolve().parents[2] +LIB_SRC = REPO_ROOT / "lib" / "src" +if str(LIB_SRC) not in sys.path: + sys.path.insert(0, str(LIB_SRC)) + +from holiday_peak_lib.evaluation import ( + ConfiguredEvaluationRunner, + DatasetLoader, + DriftDetector, + EvaluationResultEvent, +) + +import requests + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--agent-root", required=True) + parser.add_argument("--run-name", default="continuous") + parser.add_argument("--write-result", help="path to write result json") + parser.add_argument("--repo", help="owner/repo for issue creation") + parser.add_argument("--github-token", help="github token for issue creation") + parser.add_argument("--create-issue", action="store_true") + return parser.parse_args() + + +async def run_monitor(args: 
argparse.Namespace) -> int: + agent_root = Path(args.agent_root).resolve() + loader = DatasetLoader(agent_root / ".foundry") + runner = ConfiguredEvaluationRunner(loader=loader, prefer_foundry=False) + + result = await runner.run(run_name=str(args.run_name)) + + baseline = loader.load_baseline(runner.config) + # Load persisted drift state so consecutive failure windows span runs + state_path = (agent_root / ".foundry" / "results" / ".drift_state.json") + persisted: dict[str, Any] = {} + if state_path.exists(): + try: + persisted = json.loads(state_path.read_text(encoding="utf-8")) + except Exception: + persisted = {} + + detector = DriftDetector(runner.config) + # Seed detector internal counters from persisted state if present + try: + prior = persisted.get(runner.config.agent_name, {}) + if prior: + detector._failure_counts[runner.config.agent_name] = int(prior.get("failure_count", 0)) + last = prior.get("last_signal_at") + if last: + from datetime import datetime + + detector._last_signal_at[runner.config.agent_name] = datetime.fromisoformat(last) + except Exception: + # Best-effort only; do not fail evaluation on persistence errors + pass + + drift_report = detector.detect(result, baseline=baseline, run_name=str(args.run_name)) + + event = EvaluationResultEvent( + agent_name=runner.config.agent_name, + run_name=str(args.run_name), + backend=result.backend, + status=result.status, + metrics=_float_metrics(result.metrics), + eval_score=result.score, + eval_baseline_id=result.baseline_id, + baselineSource=result.baseline_source, + drift_detected=drift_report is not None, + drift_report=drift_report, + details=result.details, + ) + + # pydantic BaseModel.model_dump returns a dict; normalize to JSON-serializable dict + try: + payload = event.model_dump(mode="json", by_alias=True) if hasattr(event, "model_dump") else event.model_dump() + except Exception: + # fallback: use .model_dump() or dict(access) + try: + payload = event.model_dump() + except Exception: + 
payload = dict(event.__dict__) + # Ensure write path + if args.write_result: + result_path = Path(args.write_result) + result_path.parent.mkdir(parents=True, exist_ok=True) + result_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") + + # If drift detected, attempt to create a GitHub issue with dedupe guard + if drift_report is not None and args.create_issue and args.repo and args.github_token: + create_deduped_issue( + repo=args.repo, + token=args.github_token, + agent=runner.config.agent_name, + report=drift_report, + result_path=str(args.write_result) if args.write_result else None, + ) + + # Only update baseline when no breach and no baseline path mutation risk + if drift_report is None and runner.config.baseline_path: + baseline_path = loader.resolve_path(runner.config.baseline_path) + # write baseline atomically + new_baseline = { + "agent_name": runner.config.agent_name, + "metrics": _float_metrics(result.metrics), + "baseline_id": runner.config.resolved_baseline_id, + "dataset_version": getattr(result, 'details', {}).get('dataset_version','seed'), + "created_at": datetime.now(timezone.utc).isoformat(), + "updated_at": datetime.now(timezone.utc).isoformat(), + } + tmp = baseline_path.with_suffix('.tmp') + tmp.write_text(json.dumps(new_baseline, indent=2, sort_keys=True), encoding='utf-8') + tmp.replace(baseline_path) + + return 0 + + # Persist updated detector state so next run can continue the consecutive window + try: + stored = persisted + stored[runner.config.agent_name] = { + "failure_count": int(detector._failure_counts.get(runner.config.agent_name, 0)), + "last_signal_at": ( + detector._last_signal_at.get(runner.config.agent_name).isoformat() + if detector._last_signal_at.get(runner.config.agent_name) + else None + ), + } + state_path.parent.mkdir(parents=True, exist_ok=True) + state_path.write_text(json.dumps(stored, indent=2, sort_keys=True), encoding="utf-8") + except Exception: + # non-fatal + pass + + +def 
_float_metrics(metrics: dict[str, Any]) -> dict[str, float]: + normalized: dict[str, float] = {} + for key, value in metrics.items(): + try: + normalized[key] = float(value) + except (TypeError, ValueError): + continue + return normalized + + +def create_deduped_issue(*, repo: str, token: str, agent: str, report, result_path: str | None) -> None: + """Create a GitHub issue for a drift report if none exists for the same agent/severity/fingerprint.""" + owner, name = repo.split("/") + session = requests.Session() + session.headers.update({"Authorization": f"token {token}", "Accept": "application/vnd.github.v3+json"}) + + severity = report.severity.value + fingerprint = _report_fingerprint(report) + title = f"[DRIFT:{severity.upper()}] {agent} evaluation drift" + body_lines = [ + f"Agent: {agent}", + f"Severity: {severity}", + f"Breached: {', '.join(report.breached_thresholds)}", + f"Consecutive failures: {report.consecutive_failures}", + f"Fingerprint: {fingerprint}", + ] + if result_path: + body_lines.append(f"Result artifact: {result_path}") + body = "\n\n".join(body_lines) + + # Search for existing open issue with same fingerprint label in body + search_q = f'repo:{repo} state:open in:title "DRIFT" "Fingerprint: {fingerprint}"' + search_url = f'https://api.github.com/search/issues?q={requests.utils.quote(search_q)}' + r = session.get(search_url) + if r.ok: + data = r.json() + if data.get('total_count', 0) > 0: + print(f"Found existing issue for fingerprint {fingerprint}; skipping creation") + return + # Create issue + create_url = f'https://api.github.com/repos/{repo}/issues' + payload = {"title": title, "body": body, "labels": [f"drift:{severity}"]} + r = session.post(create_url, json=payload) + if not r.ok: + print(f"Failed to create issue: {r.status_code} {r.text}") + else: + print(f"Created issue: {r.json().get('html_url')}") + + +def _report_fingerprint(report) -> str: + # Simple fingerprint over breached keys and magnitude + items = 
sorted(report.drift_metrics.items()) + return "|".join(f"{k}:{round(v,4)}" for k, v in items) + + +def main() -> int: + return asyncio.run(run_monitor(parse_args())) + + +if __name__ == '__main__': + raise SystemExit(main()) From 24d21ebbd6059317dec75733470177aa21f9ec49 Mon Sep 17 00:00:00 2001 From: Ricardo Cataldi Date: Tue, 9 Jun 2026 14:05:45 -0300 Subject: [PATCH 3/4] ci(eval): continuous eval monitor, workflow, tests; remove generated dry-run artifacts --- .github/workflows/eval-continuous.yml | 102 +++++++++ .../.foundry/results/.drift_state.json | 6 + .../.foundry/results/baseline.json | 14 ++ .../.foundry/results/.drift_state.json | 6 + .../.foundry/results/baseline.json | 14 ++ .../.foundry/results/.drift_state.json | 6 + .../.foundry/results/baseline.json | 14 ++ .../adrs/adr-017-deployment-strategy.md | 17 +- scripts/ci/continuous_eval_monitor.py | 202 +++++++++--------- tests/ci/test_continuous_eval_monitor.py | 56 +++++ 10 files changed, 331 insertions(+), 106 deletions(-) create mode 100644 apps/ecommerce-catalog-search/.foundry/results/.drift_state.json create mode 100644 apps/ecommerce-catalog-search/.foundry/results/baseline.json create mode 100644 apps/search-enrichment-agent/.foundry/results/.drift_state.json create mode 100644 apps/search-enrichment-agent/.foundry/results/baseline.json create mode 100644 apps/truth-enrichment/.foundry/results/.drift_state.json create mode 100644 apps/truth-enrichment/.foundry/results/baseline.json create mode 100644 tests/ci/test_continuous_eval_monitor.py diff --git a/.github/workflows/eval-continuous.yml b/.github/workflows/eval-continuous.yml index 80481bb1..4c7812f6 100644 --- a/.github/workflows/eval-continuous.yml +++ b/.github/workflows/eval-continuous.yml @@ -1,5 +1,107 @@ name: agent-eval-continuous +permissions: + contents: read + issues: write + +on: + schedule: + - cron: '0 6 * * *' + workflow_dispatch: + inputs: + agent: + description: Optional single agent to run (all or agent name) + required: 
false + default: '' + dry_run: + description: Do not create issues (true/false) + required: false + default: 'false' + +jobs: + discover: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.discover.outputs.matrix }} + has-agents: ${{ steps.discover.outputs.has-agents }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - id: discover + name: Build agent matrix + run: | + python - <<'PY' + import json, glob, os + selected = os.environ.get('GITHUB_EVENT_INPUTS_AGENT','') or os.environ.get('INPUTS_AGENT','') or os.environ.get('AGENT','') + pattern = 'apps/*/.foundry/eval-config.yaml' + pilots = [] + for p in glob.glob(pattern): + root = p.replace('/.foundry/eval-config.yaml','') + name = root.split('/')[-1] + pilots.append({'name': name, 'root': root}) + if selected: + pilots = [p for p in pilots if p['name'] == selected] + matrix = json.dumps({'include': pilots}, separators=(',',':')) + with open(os.environ['GITHUB_OUTPUT'],'a', encoding='utf-8') as out: + print(f'matrix={matrix}', file=out) + print(f"has-agents={str(bool(pilots)).lower()}", file=out) + PY + + evaluate: + needs: discover + if: needs.discover.outputs.has-agents == 'true' + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.discover.outputs.matrix) }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.13' + - name: Set up uv + uses: astral-sh/setup-uv@v5 + - name: Install evaluation runtime + run: | + uv pip install --system -e ./lib/src + - name: Run continuous evaluation monitor + id: run_eval + env: + AGENT_ROOT: ${{ matrix.root }} + AGENT_NAME: ${{ matrix.name }} + DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }} + PROJECT_ENDPOINT: ${{ secrets.FOUNDRY_PROJECT_ENDPOINT || '' }} + PROJECT_NAME: ${{ secrets.FOUNDRY_PROJECT_NAME || '' }} + run: | + TIMESTAMP=$(date -u +%Y%m%dT%H%M%SZ) + RESULTS_DIR=${{ matrix.root }}/.foundry/results + 
RESULT_PATH=${{ matrix.root }}/.foundry/results/run-${TIMESTAMP}.json + LOG_PATH=${{ matrix.root }}/.foundry/results/run-${TIMESTAMP}.log + mkdir -p "$RESULTS_DIR" + python scripts/ci/continuous_eval_monitor.py \ + --agent-root "${{ matrix.root }}" \ + --run-name "run-${TIMESTAMP}" \ + --write-result "$RESULT_PATH" \ + --write-log "$LOG_PATH" \ + --state-path "${{ matrix.root }}/.foundry/results/.drift_state.json" \ + $(if [ "${DRY_RUN}" = 'true' ]; then echo '--dry-run'; fi) \ + $(if [ -n "${{ github.repository }}" ]; then echo "--repo \"${{ github.repository }}\" --github-token \"${{ secrets.GITHUB_TOKEN }}\""; fi) || true + echo "result_path=$RESULT_PATH" >> $GITHUB_OUTPUT + echo "log_path=$LOG_PATH" >> $GITHUB_OUTPUT + - name: Upload evaluation artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: eval-${{ matrix.name }}-run-${{ github.run_id }} + path: | + ${{ matrix.root }}/.foundry/results/run-*.json + ${{ matrix.root }}/.foundry/results/run-*.log + if-no-files-found: warnname: agent-eval-continuous + permissions: contents: write diff --git a/apps/ecommerce-catalog-search/.foundry/results/.drift_state.json b/apps/ecommerce-catalog-search/.foundry/results/.drift_state.json new file mode 100644 index 00000000..75da663b --- /dev/null +++ b/apps/ecommerce-catalog-search/.foundry/results/.drift_state.json @@ -0,0 +1,6 @@ +{ + "ecommerce-catalog-search": { + "failure_count": 0, + "last_signal_at": null + } +} \ No newline at end of file diff --git a/apps/ecommerce-catalog-search/.foundry/results/baseline.json b/apps/ecommerce-catalog-search/.foundry/results/baseline.json new file mode 100644 index 00000000..93079d6f --- /dev/null +++ b/apps/ecommerce-catalog-search/.foundry/results/baseline.json @@ -0,0 +1,14 @@ +{ + "agent_name": "ecommerce-catalog-search", + "baseline_id": "ecommerce-catalog-search:baseline", + "created_at": "2026-06-09T16:58:46.976109+00:00", + "dataset_version": "seed", + "metrics": { + "case_count": 10.0, + 
"dataset_readiness": 1.0, + "expected_behavior_coverage": 1.0, + "ground_truth_coverage": 0.0, + "response_coverage": 0.0 + }, + "updated_at": "2026-06-09T16:58:46.976147+00:00" +} \ No newline at end of file diff --git a/apps/search-enrichment-agent/.foundry/results/.drift_state.json b/apps/search-enrichment-agent/.foundry/results/.drift_state.json new file mode 100644 index 00000000..9cc7489a --- /dev/null +++ b/apps/search-enrichment-agent/.foundry/results/.drift_state.json @@ -0,0 +1,6 @@ +{ + "search-enrichment-agent": { + "failure_count": 0, + "last_signal_at": null + } +} \ No newline at end of file diff --git a/apps/search-enrichment-agent/.foundry/results/baseline.json b/apps/search-enrichment-agent/.foundry/results/baseline.json new file mode 100644 index 00000000..b816bb61 --- /dev/null +++ b/apps/search-enrichment-agent/.foundry/results/baseline.json @@ -0,0 +1,14 @@ +{ + "agent_name": "search-enrichment-agent", + "baseline_id": "search-enrichment-agent:baseline", + "created_at": "2026-06-09T16:58:59.722108+00:00", + "dataset_version": "seed", + "metrics": { + "case_count": 10.0, + "dataset_readiness": 1.0, + "expected_behavior_coverage": 1.0, + "ground_truth_coverage": 0.0, + "response_coverage": 0.0 + }, + "updated_at": "2026-06-09T16:58:59.722136+00:00" +} \ No newline at end of file diff --git a/apps/truth-enrichment/.foundry/results/.drift_state.json b/apps/truth-enrichment/.foundry/results/.drift_state.json new file mode 100644 index 00000000..3cbe4798 --- /dev/null +++ b/apps/truth-enrichment/.foundry/results/.drift_state.json @@ -0,0 +1,6 @@ +{ + "truth-enrichment": { + "failure_count": 0, + "last_signal_at": null + } +} \ No newline at end of file diff --git a/apps/truth-enrichment/.foundry/results/baseline.json b/apps/truth-enrichment/.foundry/results/baseline.json new file mode 100644 index 00000000..5c68a4ef --- /dev/null +++ b/apps/truth-enrichment/.foundry/results/baseline.json @@ -0,0 +1,14 @@ +{ + "agent_name": "truth-enrichment", + 
"baseline_id": "truth-enrichment:baseline", + "created_at": "2026-06-09T16:59:07.446660+00:00", + "dataset_version": "seed", + "metrics": { + "case_count": 10.0, + "dataset_readiness": 1.0, + "expected_behavior_coverage": 1.0, + "ground_truth_coverage": 0.0, + "response_coverage": 0.0 + }, + "updated_at": "2026-06-09T16:59:07.446717+00:00" +} \ No newline at end of file diff --git a/docs/architecture/adrs/adr-017-deployment-strategy.md b/docs/architecture/adrs/adr-017-deployment-strategy.md index 8c61f6e1..265ab187 100644 --- a/docs/architecture/adrs/adr-017-deployment-strategy.md +++ b/docs/architecture/adrs/adr-017-deployment-strategy.md @@ -187,20 +187,17 @@ ADR-028 adds evaluation evidence to PR and deployment governance without changin PR reviewers use evaluation artifacts as architecture and quality evidence when prompts, datasets, routing, or evaluation framework code changes. Deployment workflows remain governed by the azd + Flux path in this ADR; evaluation evidence can block a PR by human review policy, but it does not independently deploy, roll back, rename workflows, or bypass `lint` / `test` branch-protection baselines. -### Evaluation Workflow Integration (Amended: 2026-04 / 2026-06) - -ADR-028 adds evaluation evidence to PR and deployment governance without changing the deployment source of truth. The existing advisory workflow is `.github/workflows/eval-advisory.yml` (`agent-eval-advisory`): it discovers pilot eval scope, runs `scripts/ci/run_agent_evaluation.py` for changed pilot agents, writes normalized `.foundry-results/*.json`, publishes job summaries, and uploads evaluation artifacts. +### Evaluation Workflow Integration (Amended: 2026-04) -`agent-eval-advisory` remains intentionally advisory and non-required. It must remain outside required branch-protection checks until `docs/governance/README.md` is explicitly revised to promote it. 
In addition to the advisory workflow, the repository now includes a scheduled continuous monitoring workflow `.github/workflows/eval-continuous.yml` that runs daily (default `0 6 * * *`) to detect quality drift across agent pilots and other agents that include `.foundry/eval-config.yaml`. +ADR-028 (see ADR-028) integrates evaluation evidence into PR and deployment governance while preserving the azd + Flux deployment source of truth. The repository includes an advisory matrix workflow `.github/workflows/eval-advisory.yml` (named `agent-eval-advisory`) — this workflow runs evaluation for changed pilot agents, publishes summaries, and uploads artifacts for reviewer evidence. Separately, a scheduled advisory workflow `.github/workflows/eval-continuous.yml` runs daily (default `0 6 * * *`) to detect quality drift across agents that include `.foundry/eval-config.yaml`. -Key points about `agent-eval-continuous`: +Key controls for evaluation workflows: -- Runs on a daily cron and via `workflow_dispatch` with an `agent` selector and `dry_run` mode. -- Discovers agents by scanning `apps/*/.foundry/eval-config.yaml` and runs `scripts/ci/run_agent_evaluation.py` for each discovered agent in a matrix (fail-fast: false). -- Persists run artifacts under each agent's `.foundry/results/` (e.g., `run-.json`, logs) and can update `baseline.json` when a run passes with no drift. -- When drift is detected, the workflow will open a GitHub issue (labels `evaluation` and `drift:`) unless `dry_run` is true; the workflow guards against duplicate open issues by searching existing issues before creating a new one. +- Both `agent-eval-advisory` and `agent-eval-continuous` are advisory and non-required by default. They must remain outside required branch-protection checks unless `docs/governance/README.md` is explicitly updated to promote them. 
+- `agent-eval-continuous` discovers agents by scanning `apps/*/.foundry/eval-config.yaml` and runs the evaluation monitor in a matrix (fail-fast: false). It writes run artifacts to per-agent `.foundry/results/` directories and uploads run artifacts as workflow artifacts. +- When drift is detected, the continuous workflow files an issue (labels `evaluation` and `drift:`) unless `dry_run` is set. The workflow guards against duplicate open issues by searching existing open issues for a stable fingerprint. -The scheduled workflow is advisory monitoring only — it records evidence and notifies operators of drift. It does not perform automatic remediation, rollbacks, or code changes. Deployment workflows remain governed by the azd + Flux path in this ADR; evaluation evidence can inform human-driven actions but cannot autonomously change deployment state or bypass `lint` / `test` checks. +These workflows are monitoring and advisory only: they do not perform automatic remediation, rollbacks, or code changes. Deployment governance continues to be enforced by azd + Flux and the `lint`/`test` baseline described in `docs/governance/README.md`. ## Consequences ### Positive diff --git a/scripts/ci/continuous_eval_monitor.py b/scripts/ci/continuous_eval_monitor.py index e74423a1..404a1bf2 100644 --- a/scripts/ci/continuous_eval_monitor.py +++ b/scripts/ci/continuous_eval_monitor.py @@ -1,21 +1,26 @@ -"""Continuous evaluation monitor: runs configured evaluation, detects drift, persists results, and files issues. - -This is a lightweight orchestration wrapper around `ConfiguredEvaluationRunner` and -`DriftDetector`. It is intended for GitHub Actions and local dry-runs. +"""CI helper: run configured evaluation, detect drift, persist result, open issues (optional). + +This script is an orchestration wrapper around the evaluation helpers in +`holiday_peak_lib.evaluation`. 
It is intentionally thin: it runs a configured +evaluation, emits a normalized `EvaluationResultEvent` payload, and (optionally) +persists the run artifact and files a GitHub issue when drift is detected. +It avoids mutating private members of `DriftDetector`; consecutive failure +windows across runs are not seeded by this helper (callers may choose to +provide an explicit state path and a separate orchestrator to manage +cross-run windows). """ from __future__ import annotations import argparse import asyncio import json -import os -import sys from datetime import datetime, timezone from pathlib import Path -from typing import Any +from typing import Any, Dict, Optional REPO_ROOT = Path(__file__).resolve().parents[2] LIB_SRC = REPO_ROOT / "lib" / "src" +import sys if str(LIB_SRC) not in sys.path: sys.path.insert(0, str(LIB_SRC)) @@ -34,9 +39,13 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--agent-root", required=True) parser.add_argument("--run-name", default="continuous") parser.add_argument("--write-result", help="path to write result json") + parser.add_argument("--write-log", help="path to write run log") + parser.add_argument("--state-path", help="path to persist monitoring state (optional)") parser.add_argument("--repo", help="owner/repo for issue creation") parser.add_argument("--github-token", help="github token for issue creation") parser.add_argument("--create-issue", action="store_true") + parser.add_argument("--dry-run", action="store_true") + parser.add_argument("--update-baseline", action="store_true") return parser.parse_args() @@ -48,63 +57,62 @@ async def run_monitor(args: argparse.Namespace) -> int: result = await runner.run(run_name=str(args.run_name)) baseline = loader.load_baseline(runner.config) - # Load persisted drift state so consecutive failure windows span runs - state_path = (agent_root / ".foundry" / "results" / ".drift_state.json") - persisted: dict[str, Any] = {} - if state_path.exists(): - try: - persisted = 
json.loads(state_path.read_text(encoding="utf-8")) - except Exception: - persisted = {} detector = DriftDetector(runner.config) - # Seed detector internal counters from persisted state if present - try: - prior = persisted.get(runner.config.agent_name, {}) - if prior: - detector._failure_counts[runner.config.agent_name] = int(prior.get("failure_count", 0)) - last = prior.get("last_signal_at") - if last: - from datetime import datetime - - detector._last_signal_at[runner.config.agent_name] = datetime.fromisoformat(last) - except Exception: - # Best-effort only; do not fail evaluation on persistence errors - pass - + # Do not mutate private Detector internals. We run detection for this run + # and emit the report. Cross-run consecutive windows are out-of-band for + # this helper unless a caller provides an external orchestrator. drift_report = detector.detect(result, baseline=baseline, run_name=str(args.run_name)) + # Build event payload event = EvaluationResultEvent( agent_name=runner.config.agent_name, run_name=str(args.run_name), - backend=result.backend, - status=result.status, - metrics=_float_metrics(result.metrics), - eval_score=result.score, - eval_baseline_id=result.baseline_id, - baselineSource=result.baseline_source, + backend=getattr(result, 'backend', None), + status=getattr(result, 'status', None), + metrics=_float_metrics(getattr(result, 'metrics', {})), + eval_score=getattr(result, 'score', None), + eval_baseline_id=getattr(result, 'baseline_id', None), + baselineSource=getattr(result, 'baseline_source', None), drift_detected=drift_report is not None, drift_report=drift_report, - details=result.details, + details=getattr(result, 'details', None), ) - # pydantic BaseModel.model_dump returns a dict; normalize to JSON-serializable dict + # Serialize payload try: payload = event.model_dump(mode="json", by_alias=True) if hasattr(event, "model_dump") else event.model_dump() except Exception: - # fallback: use .model_dump() or dict(access) try: payload = 
event.model_dump() except Exception: payload = dict(event.__dict__) - # Ensure write path + + # Write result if requested if args.write_result: result_path = Path(args.write_result) result_path.parent.mkdir(parents=True, exist_ok=True) result_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") - # If drift detected, attempt to create a GitHub issue with dedupe guard - if drift_report is not None and args.create_issue and args.repo and args.github_token: + # Persist a small state file if a state path is provided by the caller. + if args.state_path: + try: + state_p = Path(args.state_path) + state = {} + if state_p.exists(): + state = json.loads(state_p.read_text(encoding="utf-8")) + state[runner.config.agent_name] = { + "consecutive_failures": getattr(drift_report, 'consecutive_failures', 0) if drift_report else 0, + "last_signal_at": datetime.now(timezone.utc).isoformat() if drift_report else None, + } + state_p.parent.mkdir(parents=True, exist_ok=True) + state_p.write_text(json.dumps(state, indent=2, sort_keys=True), encoding="utf-8") + except Exception: + # non-fatal; do not fail the run if state persistence fails + pass + + # Issue creation (guarded by dry-run and presence of repo/token) + if drift_report is not None and args.create_issue and args.repo and args.github_token and not args.dry_run: create_deduped_issue( repo=args.repo, token=args.github_token, @@ -113,94 +121,96 @@ async def run_monitor(args: argparse.Namespace) -> int: result_path=str(args.write_result) if args.write_result else None, ) - # Only update baseline when no breach and no baseline path mutation risk - if drift_report is None and runner.config.baseline_path: - baseline_path = loader.resolve_path(runner.config.baseline_path) - # write baseline atomically - new_baseline = { - "agent_name": runner.config.agent_name, - "metrics": _float_metrics(result.metrics), - "baseline_id": runner.config.resolved_baseline_id, - "dataset_version": getattr(result, 'details', 
{}).get('dataset_version','seed'), - "created_at": datetime.now(timezone.utc).isoformat(), - "updated_at": datetime.now(timezone.utc).isoformat(), - } - tmp = baseline_path.with_suffix('.tmp') - tmp.write_text(json.dumps(new_baseline, indent=2, sort_keys=True), encoding='utf-8') - tmp.replace(baseline_path) + # Optionally update baseline when requested and no drift + if drift_report is None and args.update_baseline and runner.config.baseline_path: + try: + baseline_path = loader.resolve_path(runner.config.baseline_path) + new_baseline = { + "agent_name": runner.config.agent_name, + "metrics": _float_metrics(getattr(result, 'metrics', {})), + "baseline_id": runner.config.resolved_baseline_id, + "dataset_version": getattr(result, 'details', {}).get('dataset_version', 'seed'), + "created_at": datetime.now(timezone.utc).isoformat(), + "updated_at": datetime.now(timezone.utc).isoformat(), + } + tmp = baseline_path.with_suffix('.tmp') + tmp.write_text(json.dumps(new_baseline, indent=2, sort_keys=True), encoding='utf-8') + tmp.replace(baseline_path) + except Exception: + pass return 0 - # Persist updated detector state so next run can continue the consecutive window - try: - stored = persisted - stored[runner.config.agent_name] = { - "failure_count": int(detector._failure_counts.get(runner.config.agent_name, 0)), - "last_signal_at": ( - detector._last_signal_at.get(runner.config.agent_name).isoformat() - if detector._last_signal_at.get(runner.config.agent_name) - else None - ), - } - state_path.parent.mkdir(parents=True, exist_ok=True) - state_path.write_text(json.dumps(stored, indent=2, sort_keys=True), encoding="utf-8") - except Exception: - # non-fatal - pass - -def _float_metrics(metrics: dict[str, Any]) -> dict[str, float]: - normalized: dict[str, float] = {} +def _float_metrics(metrics: Dict[str, Any]) -> Dict[str, float]: + normalized: Dict[str, float] = {} for key, value in metrics.items(): try: - normalized[key] = float(value) + v = float(value) + # filter NaN / 
Inf values which are not JSON-friendly for our metrics + import math + + if not math.isfinite(v): + continue + normalized[key] = v except (TypeError, ValueError): continue return normalized -def create_deduped_issue(*, repo: str, token: str, agent: str, report, result_path: str | None) -> None: - """Create a GitHub issue for a drift report if none exists for the same agent/severity/fingerprint.""" +def create_deduped_issue(*, repo: str, token: str, agent: str, report, result_path: Optional[str] = None) -> None: + """Create a GitHub issue for a drift report if none exists for the same agent/severity/fingerprint. + + This function is a small wrapper over the GitHub REST API. It performs a + search for existing open issues matching a fingerprint and avoids creating + duplicates. + """ owner, name = repo.split("/") session = requests.Session() session.headers.update({"Authorization": f"token {token}", "Accept": "application/vnd.github.v3+json"}) - severity = report.severity.value + severity = getattr(report.severity, 'value', str(getattr(report.severity, 'name', report.severity))) fingerprint = _report_fingerprint(report) title = f"[DRIFT:{severity.upper()}] {agent} evaluation drift" body_lines = [ f"Agent: {agent}", f"Severity: {severity}", - f"Breached: {', '.join(report.breached_thresholds)}", - f"Consecutive failures: {report.consecutive_failures}", + f"Breached: {', '.join(getattr(report, 'breached_thresholds', []))}", + f"Consecutive failures: {getattr(report, 'consecutive_failures', 0)}", f"Fingerprint: {fingerprint}", ] if result_path: body_lines.append(f"Result artifact: {result_path}") body = "\n\n".join(body_lines) - # Search for existing open issue with same fingerprint label in body + # Search for existing open issue by fingerprint search_q = f'repo:{repo} state:open in:title "DRIFT" "Fingerprint: {fingerprint}"' search_url = f'https://api.github.com/search/issues?q={requests.utils.quote(search_q)}' - r = session.get(search_url) - if r.ok: - data = 
r.json() - if data.get('total_count', 0) > 0: - print(f"Found existing issue for fingerprint {fingerprint}; skipping creation") - return - # Create issue + try: + r = session.get(search_url) + if r.ok: + data = r.json() + if data.get('total_count', 0) > 0: + print(f"Found existing issue for fingerprint {fingerprint}; skipping creation") + return + except Exception: + # network/search failures should not raise + pass + create_url = f'https://api.github.com/repos/{repo}/issues' payload = {"title": title, "body": body, "labels": [f"drift:{severity}"]} - r = session.post(create_url, json=payload) - if not r.ok: - print(f"Failed to create issue: {r.status_code} {r.text}") - else: - print(f"Created issue: {r.json().get('html_url')}") + try: + r = session.post(create_url, json=payload) + if not r.ok: + print(f"Failed to create issue: {r.status_code} {r.text}") + else: + print(f"Created issue: {r.json().get('html_url')}") + except Exception: + print("Failed to create issue due to network error") def _report_fingerprint(report) -> str: - # Simple fingerprint over breached keys and magnitude - items = sorted(report.drift_metrics.items()) + items = sorted(getattr(report, 'drift_metrics', {}).items()) return "|".join(f"{k}:{round(v,4)}" for k, v in items) diff --git a/tests/ci/test_continuous_eval_monitor.py b/tests/ci/test_continuous_eval_monitor.py new file mode 100644 index 00000000..f288577d --- /dev/null +++ b/tests/ci/test_continuous_eval_monitor.py @@ -0,0 +1,56 @@ +import asyncio +import json +from pathlib import Path +import tempfile + +import pytest + +from scripts.ci.continuous_eval_monitor import _float_metrics, _report_fingerprint, create_deduped_issue + + +class DummyReport: + def __init__(self, drift_metrics, breached_thresholds=None, severity='warning', consecutive_failures=3): + self.drift_metrics = drift_metrics + self.breached_thresholds = breached_thresholds or [] + self.severity = type('S', (), {'value': severity}) + self.consecutive_failures = 
consecutive_failures + + +def test_float_metrics(): + src = {'a': '1.2', 'b': 3, 'c': 'nan', 'd': None} + out = _float_metrics(src) + assert out['a'] == 1.2 + assert out['b'] == 3.0 + assert 'c' not in out + assert 'd' not in out + + +def test_report_fingerprint(): + r = DummyReport({'m1': 0.123456, 'm2': -0.5}) + fp = _report_fingerprint(r) + assert 'm1' in fp and 'm2' in fp + + +def test_create_deduped_issue_dry(monkeypatch, tmp_path): + # Verify that network failures do not raise and function returns gracefully + def fake_get(url): + class R: + ok = False + def json(self): + return {} + return R() + + def fake_post(url, json=None): + class R: + ok = False + text = 'error' + status_code = 500 + def json(self): + return {} + return R() + + monkeypatch.setattr('requests.Session.get', lambda self, url: fake_get(url)) + monkeypatch.setattr('requests.Session.post', lambda self, url, json=None: fake_post(url, json=json)) + + # Should not raise + create_deduped_issue(repo='owner/repo', token='x', agent='a', report=DummyReport({'m': 1.0}), result_path=str(tmp_path/'r.json')) \ No newline at end of file From 287283ada2a2d146514e047b55e9f9f9748a1cf2 Mon Sep 17 00:00:00 2001 From: Ricardo Cataldi Date: Tue, 9 Jun 2026 14:15:31 -0300 Subject: [PATCH 4/4] ci(evaluation): clean continuous monitoring workflow --- .github/workflows/eval-continuous.yml | 416 +++--------------- .../.foundry/results/.drift_state.json | 6 - .../.foundry/results/baseline.json | 14 - .../.foundry/results/.drift_state.json | 6 - .../.foundry/results/baseline.json | 14 - .../.foundry/results/.drift_state.json | 6 - .../.foundry/results/baseline.json | 14 - .../adrs/adr-017-deployment-strategy.md | 16 +- scripts/ci/continuous_eval_monitor.py | 407 ++++++++++------- tests/ci/test_continuous_eval_monitor.py | 146 ++++-- 10 files changed, 412 insertions(+), 633 deletions(-) delete mode 100644 apps/ecommerce-catalog-search/.foundry/results/.drift_state.json delete mode 100644 
apps/ecommerce-catalog-search/.foundry/results/baseline.json delete mode 100644 apps/search-enrichment-agent/.foundry/results/.drift_state.json delete mode 100644 apps/search-enrichment-agent/.foundry/results/baseline.json delete mode 100644 apps/truth-enrichment/.foundry/results/.drift_state.json delete mode 100644 apps/truth-enrichment/.foundry/results/baseline.json diff --git a/.github/workflows/eval-continuous.yml b/.github/workflows/eval-continuous.yml index 4c7812f6..c0d29242 100644 --- a/.github/workflows/eval-continuous.yml +++ b/.github/workflows/eval-continuous.yml @@ -4,110 +4,9 @@ permissions: contents: read issues: write -on: - schedule: - - cron: '0 6 * * *' - workflow_dispatch: - inputs: - agent: - description: Optional single agent to run (all or agent name) - required: false - default: '' - dry_run: - description: Do not create issues (true/false) - required: false - default: 'false' - -jobs: - discover: - runs-on: ubuntu-latest - outputs: - matrix: ${{ steps.discover.outputs.matrix }} - has-agents: ${{ steps.discover.outputs.has-agents }} - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - id: discover - name: Build agent matrix - run: | - python - <<'PY' - import json, glob, os - selected = os.environ.get('GITHUB_EVENT_INPUTS_AGENT','') or os.environ.get('INPUTS_AGENT','') or os.environ.get('AGENT','') - pattern = 'apps/*/.foundry/eval-config.yaml' - pilots = [] - for p in glob.glob(pattern): - root = p.replace('/.foundry/eval-config.yaml','') - name = root.split('/')[-1] - pilots.append({'name': name, 'root': root}) - if selected: - pilots = [p for p in pilots if p['name'] == selected] - matrix = json.dumps({'include': pilots}, separators=(',',':')) - with open(os.environ['GITHUB_OUTPUT'],'a', encoding='utf-8') as out: - print(f'matrix={matrix}', file=out) - print(f"has-agents={str(bool(pilots)).lower()}", file=out) - PY - - evaluate: - needs: discover - if: needs.discover.outputs.has-agents == 'true' - runs-on: ubuntu-latest - 
strategy: - fail-fast: false - matrix: ${{ fromJson(needs.discover.outputs.matrix) }} - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.13' - - name: Set up uv - uses: astral-sh/setup-uv@v5 - - name: Install evaluation runtime - run: | - uv pip install --system -e ./lib/src - - name: Run continuous evaluation monitor - id: run_eval - env: - AGENT_ROOT: ${{ matrix.root }} - AGENT_NAME: ${{ matrix.name }} - DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }} - PROJECT_ENDPOINT: ${{ secrets.FOUNDRY_PROJECT_ENDPOINT || '' }} - PROJECT_NAME: ${{ secrets.FOUNDRY_PROJECT_NAME || '' }} - run: | - TIMESTAMP=$(date -u +%Y%m%dT%H%M%SZ) - RESULTS_DIR=${{ matrix.root }}/.foundry/results - RESULT_PATH=${{ matrix.root }}/.foundry/results/run-${TIMESTAMP}.json - LOG_PATH=${{ matrix.root }}/.foundry/results/run-${TIMESTAMP}.log - mkdir -p "$RESULTS_DIR" - python scripts/ci/continuous_eval_monitor.py \ - --agent-root "${{ matrix.root }}" \ - --run-name "run-${TIMESTAMP}" \ - --write-result "$RESULT_PATH" \ - --write-log "$LOG_PATH" \ - --state-path "${{ matrix.root }}/.foundry/results/.drift_state.json" \ - $(if [ "${DRY_RUN}" = 'true' ]; then echo '--dry-run'; fi) \ - $(if [ -n "${{ github.repository }}" ]; then echo "--repo \"${{ github.repository }}\" --github-token \"${{ secrets.GITHUB_TOKEN }}\""; fi) || true - echo "result_path=$RESULT_PATH" >> $GITHUB_OUTPUT - echo "log_path=$LOG_PATH" >> $GITHUB_OUTPUT - - name: Upload evaluation artifacts - if: always() - uses: actions/upload-artifact@v4 - with: - name: eval-${{ matrix.name }}-run-${{ github.run_id }} - path: | - ${{ matrix.root }}/.foundry/results/run-*.json - ${{ matrix.root }}/.foundry/results/run-*.log - if-no-files-found: warnname: agent-eval-continuous - -permissions: - contents: write - concurrency: - group: ${{ github.workflow }}-daily - cancel-in-progress: true + group: ${{ github.workflow }}-${{ 
github.event.inputs.agent || 'all' }} + cancel-in-progress: false on: schedule: @@ -115,242 +14,14 @@ on: workflow_dispatch: inputs: agent: - description: Agent to evaluate (all or pilot) - type: choice - required: true - default: all - options: - - all - - ecommerce-catalog-search - - search-enrichment-agent - - truth-enrichment - dry_run: - description: Do not create issues or commit results + description: Optional agent name; leave empty for all discovered agents required: false - default: 'false' + default: '' + dry_run: + description: Run evaluations without creating drift issues type: boolean - -jobs: - discover-agents: - name: discover agents - runs-on: ubuntu-latest - outputs: - matrix: ${{ steps.discover.outputs.matrix }} - has-agents: ${{ steps.discover.outputs.has-agents }} - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Build agent matrix - id: discover - env: - SELECTED_AGENT: ${{ inputs.agent || 'all' }} - DRY_RUN: ${{ inputs.dry_run || 'false' }} - run: | - python - <<'PY' - import json, os, glob - - pilots = [ - "ecommerce-catalog-search", - "search-enrichment-agent", - "truth-enrichment", - ] - - # Discover all apps with eval-config.yaml under .foundry - roots = [] - for path in glob.glob('apps/*/.foundry/eval-config.yaml'): - agent = path.split('/')[1] - roots.append((agent, f'apps/{agent}')) - - selected = os.environ.get('SELECTED_AGENT', 'all') - dry = os.environ.get('DRY_RUN', 'false').lower() == 'true' - - if dry: - names = pilots - elif selected == 'all': - names = [a for a,_ in roots] - else: - names = [selected] - - include = [] - for name, root in roots: - if name in names: - include.append({"name": name, "root": root}) - - matrix = json.dumps({"include": include}, separators=(',', ':')) - with open(os.environ['GITHUB_OUTPUT'], 'a', encoding='utf-8') as out: - print(f"matrix={matrix}", file=out) - print(f"has-agents={str(bool(include)).lower()}", file=out) - PY - - evaluate: - name: continuous eval (${{ 
matrix.name }}) - if: needs.discover-agents.outputs.has-agents == 'true' - needs: discover-agents - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: ${{ fromJson(needs.discover-agents.outputs.matrix) }} - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - persist-credentials: true - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.13' - - name: Set up uv - uses: astral-sh/setup-uv@v5 - - name: Install evaluation runtime - run: | - uv pip install --system -e ./lib/src - - name: Run continuous evaluation - id: run_eval - env: - TIMESTAMP: ${{ github.run_id }} - AGENT_ROOT: ${{ matrix.root }} - AGENT_NAME: ${{ matrix.name }} - RESULTS_DIR: ${{ matrix.root }}/.foundry/results - RESULT_PATH: ${{ matrix.root }}/.foundry/results/run-${{ github.run_id }}.json - LOG_PATH: ${{ matrix.root }}/.foundry/results/run-${{ github.run_id }}.log - run: | - mkdir -p "$RESULTS_DIR" - python scripts/ci/run_agent_evaluation.py \ - --agent-root "$AGENT_ROOT" \ - --run-name "run-${TIMESTAMP}" \ - --write-result "$RESULT_PATH" \ - > "$LOG_PATH" || true - echo "result_path=$RESULT_PATH" >> $GITHUB_OUTPUT - echo "log_path=$LOG_PATH" >> $GITHUB_OUTPUT - - - name: Process evaluation result - id: process - uses: actions/github-script@v6 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - script: | - const fs = require('fs') - const path = process.env.RESULT_PATH - const agent = process.env.AGENT_NAME - const run_id = `run-${process.env.TIMESTAMP}` - const dry = process.env.INPUTS_DRY_RUN === 'true' || process.env.DRY_RUN === 'true' || process.env.INPUTS_DRY_RUN === 'True' - if (!fs.existsSync(path)) { - core = require('@actions/core') - core.info(`No result JSON at ${path}`) - return { drift: false } - } - const payload = JSON.parse(fs.readFileSync(path,'utf8')) - const drift = payload.drift_detected === true && payload.drift_report - - // If drift, check for existing open issue with same agent+severity - if (drift && !dry) { - 
const severity = payload.drift_report.severity - const title = `Evaluation drift: ${agent} (${severity})` - const search = `repo:${context.repo.owner}/${context.repo.repo}+is:open+in:title+"${title}"` - const existing = await github.rest.search.issuesAndPullRequests({ q: search }) - if (existing.data.total_count === 0) { - // Ensure labels exist - const ensureLabel = async (label, color) => { - try { - await github.rest.issues.getLabel({ owner: context.repo.owner, repo: context.repo.repo, name: label }) - } catch (e) { - await github.rest.issues.createLabel({ owner: context.repo.owner, repo: context.repo.repo, name: label, color }) - } - } - await ensureLabel('evaluation','0e8a16') - await ensureLabel(`drift:${severity}`,'b60205') - - const body = [] - body.push(`Agent: **${agent}**`) - body.push(`Severity: **${severity}**`) - body.push('') - body.push('Breached thresholds:') - body.push('') - for (const t of payload.drift_report.breached_thresholds) { - body.push(`- ${t}`) - } - body.push('') - body.push('Drift magnitude:') - for (const [k,v] of Object.entries(payload.drift_report.drift_metrics || {})) { - body.push(`- ${k}: ${v}`) - } - body.push('') - const resultsPath = `${process.env.AGENT_ROOT}/.foundry/results/run-${process.env.TIMESTAMP}.json` - body.push(`Evaluation result artifact: ${context.repo.html_url}/blob/${context.sha}/${resultsPath}`) - - await github.rest.issues.create({ owner: context.repo.owner, repo: context.repo.repo, title, body: body.join('\n'), labels: ['evaluation', `drift:${severity}`] }) - } else { - core = require('@actions/core') - core.info('Found existing open issue for this agent/severity; skipping create') - } - } - - return { drift: !!drift } - - - name: Commit results and update baseline - if: steps.process.outputs.drift != 'true' && github.event.inputs.dry_run != 'true' - run: | - set -euo pipefail - AGENT_DIR=${{ matrix.root }} - RESULTS_DIR=${{ matrix.root }}/.foundry/results - RESULT_FILE=${{ matrix.root 
}}/.foundry/results/run-${{ github.run_id }}.json - BASELINE_FILE=${{ matrix.root }}/.foundry/results/baseline.json - if [ ! -f "$RESULT_FILE" ]; then echo "No result file to commit"; exit 0; fi - # update baseline when no drift - python - <<'PY' - import json, sys, pathlib - p = pathlib.Path(r"""%s""" % "$RESULT_FILE") - if not p.exists(): - sys.exit(0) - payload = json.loads(p.read_text(encoding='utf-8')) - metrics = payload.get('metrics', {}) - baseline = { - 'agent_name': payload.get('agent_name'), - 'metrics': metrics, - 'baseline_id': payload.get('eval.baseline_id') or payload.get('eval_baseline_id'), - 'dataset_version': payload.get('details',{}).get('dataset_version','seed') - } - out_path = pathlib.Path(r"""%s""" % "$BASELINE_FILE") - out_path.parent.mkdir(parents=True, exist_ok=True) - out_path.write_text(json.dumps(baseline, indent=2, sort_keys=True), encoding='utf-8') - PY - - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - BRANCH=ci/eval-results/${{ matrix.name }}/run-${{ github.run_id }} - git checkout -b "$BRANCH" - git add "$RESULT_FILE" "$BASELINE_FILE" - git commit -m "ci(evaluation): record run results for ${{ matrix.name }} run-${{ github.run_id }}" - git push --set-upstream origin "$BRANCH" - - - name: Upload evaluation artifacts - if: always() - uses: actions/upload-artifact@v4 - with: - name: eval-${{ matrix.name }}-run-${{ github.run_id }} - path: | - ${{ matrix.root }}/.foundry/results/run-${{ github.run_id }}.json - ${{ matrix.root }}/.foundry/results/run-${{ github.run_id }}.log - if-no-files-found: warnname: agent-eval-continuous - -permissions: - contents: read - issues: write - -concurrency: - group: ${{ github.workflow }}-continuous - cancel-in-progress: false - -on: - schedule: - - cron: '0 6 * * *' # daily 06:00 UTC - workflow_dispatch: - inputs: - agent: - description: Optional single agent to run (dry-run) required: false - default: '' + default: false jobs: 
discover-agents: @@ -361,28 +32,29 @@ jobs: has-agents: ${{ steps.discover.outputs.has-agents }} steps: - uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: Build agent matrix id: discover env: SELECTED_AGENT: ${{ github.event.inputs.agent || '' }} run: | python - <<'PY' - import json, os, glob - pilots = [] - selected = os.environ.get('SELECTED_AGENT','').strip() - pattern = 'apps/*/.foundry/eval-config.yaml' - for path in glob.glob(pattern): - root = path.replace('/.foundry/eval-config.yaml','') - name = root.split('/')[-1] - pilots.append({'name': name, 'root': root}) - if selected: - pilots = [p for p in pilots if p['name'] == selected] - matrix = json.dumps({'include': pilots}, separators=(',',':')) - with open(os.environ['GITHUB_OUTPUT'],'a',encoding='utf-8') as out: - print(f'matrix={matrix}', file=out) - print(f"has-agents={str(bool(pilots)).lower()}", file=out) + import glob + import json + import os + + selected = os.environ.get("SELECTED_AGENT", "").strip() + discovered = [] + for config_path in sorted(glob.glob("apps/*/.foundry/eval-config.yaml")): + agent_root = config_path.replace("/.foundry/eval-config.yaml", "") + agent_name = agent_root.split("/")[-1] + if selected and agent_name != selected: + continue + discovered.append({"name": agent_name, "root": agent_root}) + + matrix = json.dumps({"include": discovered}, separators=(",", ":")) + with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as output: + print(f"matrix={matrix}", file=output) + print(f"has-agents={str(bool(discovered)).lower()}", file=output) PY monitor: @@ -406,17 +78,43 @@ jobs: uv pip install --system -e ./lib/src - name: Run continuous evaluation monitor env: + AGENT_ROOT: ${{ matrix.root }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - RESULT_DIR: ${{ matrix.root }}/.foundry/results - RUN_NAME: ${{ github.run_id }}-continuous-${{ matrix.name }} + DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }} PROJECT_ENDPOINT: ${{ secrets.FOUNDRY_PROJECT_ENDPOINT || '' }} 
PROJECT_NAME: ${{ secrets.FOUNDRY_PROJECT_NAME || '' }} + FOUNDRY_PROJECT_ENDPOINT: ${{ secrets.FOUNDRY_PROJECT_ENDPOINT || '' }} + FOUNDRY_PROJECT_NAME: ${{ secrets.FOUNDRY_PROJECT_NAME || '' }} run: | - mkdir -p "${{ matrix.root }}/.foundry/results" + set -euo pipefail + timestamp="$(date -u +%Y%m%dT%H%M%SZ)" + results_dir="$AGENT_ROOT/.foundry/results" + result_path="$results_dir/run-$timestamp.json" + log_path="$results_dir/run-$timestamp.log" + state_path="$results_dir/.drift_state.json" + mkdir -p "$results_dir" + + issue_args="--create-issue" + if [ "$DRY_RUN" = "true" ]; then + issue_args="--dry-run" + fi + python scripts/ci/continuous_eval_monitor.py \ - --agent-root "${{ matrix.root }}" \ - --run-name "$RUN_NAME" \ - --write-result "${{ matrix.root }}/.foundry/results/run-$(date -u +%Y%m%dT%H%M%SZ).json" \ + --agent-root "$AGENT_ROOT" \ + --run-name "continuous-$timestamp-${{ matrix.name }}" \ + --write-result "$result_path" \ + --write-log "$log_path" \ + --state-path "$state_path" \ --repo "${{ github.repository }}" \ --github-token "$GITHUB_TOKEN" \ - --create-issue + $issue_args + - name: Upload evaluation artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: eval-${{ matrix.name }}-${{ github.run_id }} + path: | + ${{ matrix.root }}/.foundry/results/run-*.json + ${{ matrix.root }}/.foundry/results/run-*.log + ${{ matrix.root }}/.foundry/results/.drift_state.json + if-no-files-found: warn \ No newline at end of file diff --git a/apps/ecommerce-catalog-search/.foundry/results/.drift_state.json b/apps/ecommerce-catalog-search/.foundry/results/.drift_state.json deleted file mode 100644 index 75da663b..00000000 --- a/apps/ecommerce-catalog-search/.foundry/results/.drift_state.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "ecommerce-catalog-search": { - "failure_count": 0, - "last_signal_at": null - } -} \ No newline at end of file diff --git a/apps/ecommerce-catalog-search/.foundry/results/baseline.json 
b/apps/ecommerce-catalog-search/.foundry/results/baseline.json deleted file mode 100644 index 93079d6f..00000000 --- a/apps/ecommerce-catalog-search/.foundry/results/baseline.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "agent_name": "ecommerce-catalog-search", - "baseline_id": "ecommerce-catalog-search:baseline", - "created_at": "2026-06-09T16:58:46.976109+00:00", - "dataset_version": "seed", - "metrics": { - "case_count": 10.0, - "dataset_readiness": 1.0, - "expected_behavior_coverage": 1.0, - "ground_truth_coverage": 0.0, - "response_coverage": 0.0 - }, - "updated_at": "2026-06-09T16:58:46.976147+00:00" -} \ No newline at end of file diff --git a/apps/search-enrichment-agent/.foundry/results/.drift_state.json b/apps/search-enrichment-agent/.foundry/results/.drift_state.json deleted file mode 100644 index 9cc7489a..00000000 --- a/apps/search-enrichment-agent/.foundry/results/.drift_state.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "search-enrichment-agent": { - "failure_count": 0, - "last_signal_at": null - } -} \ No newline at end of file diff --git a/apps/search-enrichment-agent/.foundry/results/baseline.json b/apps/search-enrichment-agent/.foundry/results/baseline.json deleted file mode 100644 index b816bb61..00000000 --- a/apps/search-enrichment-agent/.foundry/results/baseline.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "agent_name": "search-enrichment-agent", - "baseline_id": "search-enrichment-agent:baseline", - "created_at": "2026-06-09T16:58:59.722108+00:00", - "dataset_version": "seed", - "metrics": { - "case_count": 10.0, - "dataset_readiness": 1.0, - "expected_behavior_coverage": 1.0, - "ground_truth_coverage": 0.0, - "response_coverage": 0.0 - }, - "updated_at": "2026-06-09T16:58:59.722136+00:00" -} \ No newline at end of file diff --git a/apps/truth-enrichment/.foundry/results/.drift_state.json b/apps/truth-enrichment/.foundry/results/.drift_state.json deleted file mode 100644 index 3cbe4798..00000000 --- 
a/apps/truth-enrichment/.foundry/results/.drift_state.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "truth-enrichment": { - "failure_count": 0, - "last_signal_at": null - } -} \ No newline at end of file diff --git a/apps/truth-enrichment/.foundry/results/baseline.json b/apps/truth-enrichment/.foundry/results/baseline.json deleted file mode 100644 index 5c68a4ef..00000000 --- a/apps/truth-enrichment/.foundry/results/baseline.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "agent_name": "truth-enrichment", - "baseline_id": "truth-enrichment:baseline", - "created_at": "2026-06-09T16:59:07.446660+00:00", - "dataset_version": "seed", - "metrics": { - "case_count": 10.0, - "dataset_readiness": 1.0, - "expected_behavior_coverage": 1.0, - "ground_truth_coverage": 0.0, - "response_coverage": 0.0 - }, - "updated_at": "2026-06-09T16:59:07.446717+00:00" -} \ No newline at end of file diff --git a/docs/architecture/adrs/adr-017-deployment-strategy.md b/docs/architecture/adrs/adr-017-deployment-strategy.md index 265ab187..10a23879 100644 --- a/docs/architecture/adrs/adr-017-deployment-strategy.md +++ b/docs/architecture/adrs/adr-017-deployment-strategy.md @@ -181,23 +181,17 @@ Required repository secrets: **Status**: Accepted (Revised) ### Evaluation Workflow Integration (Amended: 2026-04) -ADR-028 adds evaluation evidence to PR and deployment governance without changing the deployment source of truth. The current evaluation workflow is `.github/workflows/eval-advisory.yml`, whose workflow name is `agent-eval-advisory`. It discovers the pilot evaluation scope, runs `scripts/ci/run_agent_evaluation.py` for changed pilot agents, writes normalized `.foundry-results/*.json`, publishes job summaries, and uploads evaluation artifacts. - -`agent-eval-advisory` is intentionally advisory and non-required. It must remain outside required branch-protection checks until `docs/governance/README.md` is explicitly revised to promote it. 
A new scheduled advisory workflow `eval-continuous.yml` exists to provide daily drift detection and alerting; it is advisory-only and does not block deployments. Deployment governance must reference the advisory workflows rather than treating them as automated gates. - -PR reviewers use evaluation artifacts as architecture and quality evidence when prompts, datasets, routing, or evaluation framework code changes. Deployment workflows remain governed by the azd + Flux path in this ADR; evaluation evidence can block a PR by human review policy, but it does not independently deploy, roll back, rename workflows, or bypass `lint` / `test` branch-protection baselines. - -### Evaluation Workflow Integration (Amended: 2026-04) - -ADR-028 (see ADR-028) integrates evaluation evidence into PR and deployment governance while preserving the azd + Flux deployment source of truth. The repository includes an advisory matrix workflow `.github/workflows/eval-advisory.yml` (named `agent-eval-advisory`) — this workflow runs evaluation for changed pilot agents, publishes summaries, and uploads artifacts for reviewer evidence. Separately, a scheduled advisory workflow `.github/workflows/eval-continuous.yml` runs daily (default `0 6 * * *`) to detect quality drift across agents that include `.foundry/eval-config.yaml`. +ADR-028 integrates evaluation evidence into PR and deployment governance while preserving the azd + Flux deployment source of truth. The repository includes an advisory matrix workflow `.github/workflows/eval-advisory.yml` (`agent-eval-advisory`) that runs evaluation for changed pilot agents, publishes summaries, and uploads artifacts for reviewer evidence. Separately, `.github/workflows/eval-continuous.yml` (`agent-eval-continuous`) runs daily by default at `0 6 * * *` UTC to detect quality drift across agents that include `.foundry/eval-config.yaml`. 
Key controls for evaluation workflows: - Both `agent-eval-advisory` and `agent-eval-continuous` are advisory and non-required by default. They must remain outside required branch-protection checks unless `docs/governance/README.md` is explicitly updated to promote them. -- `agent-eval-continuous` discovers agents by scanning `apps/*/.foundry/eval-config.yaml` and runs the evaluation monitor in a matrix (fail-fast: false). It writes run artifacts to per-agent `.foundry/results/` directories and uploads run artifacts as workflow artifacts. -- When drift is detected, the continuous workflow files an issue (labels `evaluation` and `drift:`) unless `dry_run` is set. The workflow guards against duplicate open issues by searching existing open issues for a stable fingerprint. +- `agent-eval-continuous` discovers agents by scanning `apps/*/.foundry/eval-config.yaml` and runs the evaluation monitor in a matrix with `fail-fast: false`. +- The continuous workflow writes run artifacts to per-agent `.foundry/results/` directories in the workflow workspace and uploads them as workflow artifacts; it does not commit result or baseline files back to the repository. +- When drift is detected, the continuous workflow files an issue with labels `evaluation` and `drift:<severity>` unless `dry_run` is set. The workflow guards against duplicate open issues by searching existing open issues for a stable drift fingerprint. These workflows are monitoring and advisory only: they do not perform automatic remediation, rollbacks, or code changes. Deployment governance continues to be enforced by azd + Flux and the `lint`/`test` baseline described in `docs/governance/README.md`. 
+ ## Consequences ### Positive diff --git a/scripts/ci/continuous_eval_monitor.py b/scripts/ci/continuous_eval_monitor.py index 404a1bf2..b0197c88 100644 --- a/scripts/ci/continuous_eval_monitor.py +++ b/scripts/ci/continuous_eval_monitor.py @@ -1,222 +1,305 @@ -"""CI helper: run configured evaluation, detect drift, persist result, open issues (optional). - -This script is an orchestration wrapper around the evaluation helpers in -`holiday_peak_lib.evaluation`. It is intentionally thin: it runs a configured -evaluation, emits a normalized `EvaluationResultEvent` payload, and (optionally) -persists the run artifact and files a GitHub issue when drift is detected. -It avoids mutating private members of `DriftDetector`; consecutive failure -windows across runs are not seeded by this helper (callers may choose to -provide an explicit state path and a separate orchestrator to manage -cross-run windows). -""" +"""Run continuous agent evaluation and file drift issues when needed.""" + from __future__ import annotations import argparse import asyncio import json +import math +import sys +import urllib.error +import urllib.parse +import urllib.request +from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any REPO_ROOT = Path(__file__).resolve().parents[2] LIB_SRC = REPO_ROOT / "lib" / "src" -import sys if str(LIB_SRC) not in sys.path: sys.path.insert(0, str(LIB_SRC)) -from holiday_peak_lib.evaluation import ( +from holiday_peak_lib.evaluation import ( # noqa: E402 ConfiguredEvaluationRunner, DatasetLoader, DriftDetector, EvaluationResultEvent, ) +from holiday_peak_lib.evaluation.models import DriftReport # noqa: E402 + -import requests +@dataclass(frozen=True) +class GitHubIssueClient: + """Small GitHub Issues REST client used by the CI monitor.""" + + repo: str + token: str + + def search_open_drift_issue(self, fingerprint: str) -> bool: + query = f'repo:{self.repo} 
state:open "Fingerprint: {fingerprint}"' + encoded_query = urllib.parse.quote(query) + payload = self._request_json("GET", f"/search/issues?q={encoded_query}") + return int(payload.get("total_count", 0)) > 0 + + def create_issue(self, *, title: str, body: str, labels: list[str]) -> str | None: + payload = self._request_json( + "POST", + f"/repos/{self.repo}/issues", + body={"title": title, "body": body, "labels": labels}, + ) + html_url = payload.get("html_url") + return str(html_url) if html_url else None + + def _request_json( + self, + method: str, + path: str, + *, + body: dict[str, Any] | None = None, + ) -> dict[str, Any]: + data = None + if body is not None: + data = json.dumps(body).encode("utf-8") + request = urllib.request.Request( + f"https://api.github.com{path}", + data=data, + method=method, + headers={ + "Accept": "application/vnd.github+json", + "Authorization": f"Bearer {self.token}", + "Content-Type": "application/json", + "X-GitHub-Api-Version": "2022-11-28", + }, + ) + with urllib.request.urlopen(request, timeout=30) as response: # noqa: S310 + return json.loads(response.read().decode("utf-8")) def parse_args() -> argparse.Namespace: + """Parse monitor command-line arguments.""" + parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--agent-root", required=True) - parser.add_argument("--run-name", default="continuous") - parser.add_argument("--write-result", help="path to write result json") - parser.add_argument("--write-log", help="path to write run log") - parser.add_argument("--state-path", help="path to persist monitoring state (optional)") - parser.add_argument("--repo", help="owner/repo for issue creation") - parser.add_argument("--github-token", help="github token for issue creation") - parser.add_argument("--create-issue", action="store_true") - parser.add_argument("--dry-run", action="store_true") - parser.add_argument("--update-baseline", action="store_true") + parser.add_argument("--agent-root", required=True, 
help="Path to the agent app root") + parser.add_argument("--run-name", default="continuous", help="Evaluation run name") + parser.add_argument("--write-result", help="Path to write normalized result JSON") + parser.add_argument("--write-log", help="Path to write monitor summary log") + parser.add_argument("--state-path", help="Optional drift-state JSON path") + parser.add_argument("--repo", help="GitHub owner/repo for issue creation") + parser.add_argument("--github-token", help="GitHub token for issue creation") + parser.add_argument("--create-issue", action="store_true", help="Create drift issues") + parser.add_argument("--dry-run", action="store_true", help="Suppress issue creation") + parser.add_argument( + "--update-baseline", + action="store_true", + help="Update configured baseline.json when no drift is detected", + ) return parser.parse_args() async def run_monitor(args: argparse.Namespace) -> int: + """Run one continuous evaluation monitor cycle.""" + agent_root = Path(args.agent_root).resolve() loader = DatasetLoader(agent_root / ".foundry") - runner = ConfiguredEvaluationRunner(loader=loader, prefer_foundry=False) - + runner = ConfiguredEvaluationRunner(loader=loader, prefer_foundry=True) result = await runner.run(run_name=str(args.run_name)) - baseline = loader.load_baseline(runner.config) - - detector = DriftDetector(runner.config) - # Do not mutate private Detector internals. We run detection for this run - # and emit the report. Cross-run consecutive windows are out-of-band for - # this helper unless a caller provides an external orchestrator. 
- drift_report = detector.detect(result, baseline=baseline, run_name=str(args.run_name)) - - # Build event payload - event = EvaluationResultEvent( - agent_name=runner.config.agent_name, + drift_report = DriftDetector(runner.config).detect( + result, + baseline=baseline, run_name=str(args.run_name), - backend=getattr(result, 'backend', None), - status=getattr(result, 'status', None), - metrics=_float_metrics(getattr(result, 'metrics', {})), - eval_score=getattr(result, 'score', None), - eval_baseline_id=getattr(result, 'baseline_id', None), - baselineSource=getattr(result, 'baseline_source', None), - drift_detected=drift_report is not None, - drift_report=drift_report, - details=getattr(result, 'details', None), ) + payload = _build_event_payload(runner.config.agent_name, args.run_name, result, drift_report) - # Serialize payload - try: - payload = event.model_dump(mode="json", by_alias=True) if hasattr(event, "model_dump") else event.model_dump() - except Exception: - try: - payload = event.model_dump() - except Exception: - payload = dict(event.__dict__) - - # Write result if requested if args.write_result: - result_path = Path(args.write_result) - result_path.parent.mkdir(parents=True, exist_ok=True) - result_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") - - # Persist a small state file if a state path is provided by the caller. 
+ _write_json(Path(args.write_result), payload) if args.state_path: - try: - state_p = Path(args.state_path) - state = {} - if state_p.exists(): - state = json.loads(state_p.read_text(encoding="utf-8")) - state[runner.config.agent_name] = { - "consecutive_failures": getattr(drift_report, 'consecutive_failures', 0) if drift_report else 0, - "last_signal_at": datetime.now(timezone.utc).isoformat() if drift_report else None, - } - state_p.parent.mkdir(parents=True, exist_ok=True) - state_p.write_text(json.dumps(state, indent=2, sort_keys=True), encoding="utf-8") - except Exception: - # non-fatal; do not fail the run if state persistence fails - pass - - # Issue creation (guarded by dry-run and presence of repo/token) - if drift_report is not None and args.create_issue and args.repo and args.github_token and not args.dry_run: - create_deduped_issue( - repo=args.repo, - token=args.github_token, - agent=runner.config.agent_name, - report=drift_report, - result_path=str(args.write_result) if args.write_result else None, - ) + _write_state(Path(args.state_path), runner.config.agent_name, drift_report) + if args.update_baseline and drift_report is None and runner.config.baseline_path: + _update_baseline(loader, runner.config, result) + + issue_url = None + if drift_report is not None and args.create_issue and not args.dry_run: + if not args.repo or not args.github_token: + print("Skipping drift issue creation: repo or GitHub token not provided") + else: + issue_url = create_deduped_issue( + repo=args.repo, + token=args.github_token, + agent_name=runner.config.agent_name, + report=drift_report, + result_path=args.write_result, + ) + + if args.write_log: + _write_log(Path(args.write_log), runner.config.agent_name, payload, issue_url) + print(json.dumps(payload, indent=2, sort_keys=True)) + return 0 - # Optionally update baseline when requested and no drift - if drift_report is None and args.update_baseline and runner.config.baseline_path: - try: - baseline_path = 
loader.resolve_path(runner.config.baseline_path) - new_baseline = { - "agent_name": runner.config.agent_name, - "metrics": _float_metrics(getattr(result, 'metrics', {})), - "baseline_id": runner.config.resolved_baseline_id, - "dataset_version": getattr(result, 'details', {}).get('dataset_version', 'seed'), - "created_at": datetime.now(timezone.utc).isoformat(), - "updated_at": datetime.now(timezone.utc).isoformat(), - } - tmp = baseline_path.with_suffix('.tmp') - tmp.write_text(json.dumps(new_baseline, indent=2, sort_keys=True), encoding='utf-8') - tmp.replace(baseline_path) - except Exception: - pass - return 0 +def _build_event_payload( + agent_name: str, + run_name: str, + result: Any, + drift_report: DriftReport | None, +) -> dict[str, Any]: + event = EvaluationResultEvent( + agent_name=agent_name, + run_name=run_name, + backend=result.backend, + status=result.status, + metrics=_float_metrics(result.metrics), + eval_score=result.score, + eval_baseline_id=result.baseline_id, + baseline_source=result.baseline_source, + drift_detected=drift_report is not None, + drift_report=drift_report, + details=result.details, + ) + return event.model_dump(mode="json", by_alias=True) -def _float_metrics(metrics: Dict[str, Any]) -> Dict[str, float]: - normalized: Dict[str, float] = {} +def _float_metrics(metrics: dict[str, Any]) -> dict[str, float]: + normalized: dict[str, float] = {} for key, value in metrics.items(): try: - v = float(value) - # filter NaN / Inf values which are not JSON-friendly for our metrics - import math - - if not math.isfinite(v): - continue - normalized[key] = v + numeric_value = float(value) except (TypeError, ValueError): continue + if math.isfinite(numeric_value): + normalized[key] = numeric_value return normalized -def create_deduped_issue(*, repo: str, token: str, agent: str, report, result_path: Optional[str] = None) -> None: - """Create a GitHub issue for a drift report if none exists for the same agent/severity/fingerprint. 
+def _write_json(path: Path, payload: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") + + +def _write_state(path: Path, agent_name: str, report: DriftReport | None) -> None: + state: dict[str, Any] = {} + if path.exists(): + try: + state = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + state = {} + state[agent_name] = { + "consecutive_failures": report.consecutive_failures if report else 0, + "last_signal_at": datetime.now(timezone.utc).isoformat() if report else None, + } + _write_json(path, state) - This function is a small wrapper over the GitHub REST API. It performs a - search for existing open issues matching a fingerprint and avoids creating - duplicates. - """ - owner, name = repo.split("/") - session = requests.Session() - session.headers.update({"Authorization": f"token {token}", "Accept": "application/vnd.github.v3+json"}) - severity = getattr(report.severity, 'value', str(getattr(report.severity, 'name', report.severity))) +def _update_baseline(loader: DatasetLoader, config: Any, result: Any) -> None: + baseline_path = loader.resolve_path(config.baseline_path) + existing_payload: dict[str, Any] = {} + if baseline_path.exists(): + try: + existing_payload = json.loads(baseline_path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + existing_payload = {} + now = datetime.now(timezone.utc).isoformat() + baseline_payload = { + "agent_name": config.agent_name, + "metrics": _float_metrics(result.metrics), + "baseline_id": config.resolved_baseline_id, + "dataset_version": result.details.get("dataset_version", "seed"), + "created_at": existing_payload.get("created_at", now), + "updated_at": now, + } + temporary_path = baseline_path.with_suffix(".tmp") + _write_json(temporary_path, baseline_payload) + temporary_path.replace(baseline_path) + + +def create_deduped_issue( + *, + repo: str, + token: str, + agent_name: 
str, + report: DriftReport, + result_path: str | None, + client: GitHubIssueClient | None = None, +) -> str | None: + """Create one drift issue per open fingerprint.""" + fingerprint = _report_fingerprint(report) - title = f"[DRIFT:{severity.upper()}] {agent} evaluation drift" - body_lines = [ - f"Agent: {agent}", - f"Severity: {severity}", - f"Breached: {', '.join(getattr(report, 'breached_thresholds', []))}", - f"Consecutive failures: {getattr(report, 'consecutive_failures', 0)}", - f"Fingerprint: {fingerprint}", + github_client = client or GitHubIssueClient(repo=repo, token=token) + try: + if github_client.search_open_drift_issue(fingerprint): + print(f"Found existing open drift issue for fingerprint {fingerprint}") + return None + severity = report.severity.value + issue_url = github_client.create_issue( + title=f"[DRIFT:{severity.upper()}] {agent_name} evaluation drift", + body=_issue_body(agent_name, report, fingerprint, result_path), + labels=["evaluation", f"drift:{severity}"], + ) + if issue_url: + print(f"Created drift issue: {issue_url}") + return issue_url + except (urllib.error.URLError, TimeoutError, OSError) as exc: + print(f"Failed to create drift issue: {exc}") + return None + + +def _issue_body( + agent_name: str, + report: DriftReport, + fingerprint: str, + result_path: str | None, +) -> str: + lines = [ + f"Agent: **{agent_name}**", + f"Severity: **{report.severity.value}**", + f"Baseline: `{report.baseline_id or 'n/a'}`", + f"Consecutive failures: `{report.consecutive_failures}`", + f"Fingerprint: `{fingerprint}`", + "", + "Breached metrics:", ] + lines.extend(f"- `{item}`" for item in report.breached_thresholds) + lines.extend(["", "Drift magnitude:"]) + lines.extend( + f"- `{key}`: `{value}`" for key, value in sorted(report.drift_metrics.items()) + ) if result_path: - body_lines.append(f"Result artifact: {result_path}") - body = "\n\n".join(body_lines) - - # Search for existing open issue by fingerprint - search_q = f'repo:{repo} state:open 
in:title "DRIFT" "Fingerprint: {fingerprint}"' - search_url = f'https://api.github.com/search/issues?q={requests.utils.quote(search_q)}' - try: - r = session.get(search_url) - if r.ok: - data = r.json() - if data.get('total_count', 0) > 0: - print(f"Found existing issue for fingerprint {fingerprint}; skipping creation") - return - except Exception: - # network/search failures should not raise - pass - - create_url = f'https://api.github.com/repos/{repo}/issues' - payload = {"title": title, "body": body, "labels": [f"drift:{severity}"]} - try: - r = session.post(create_url, json=payload) - if not r.ok: - print(f"Failed to create issue: {r.status_code} {r.text}") - else: - print(f"Created issue: {r.json().get('html_url')}") - except Exception: - print("Failed to create issue due to network error") + lines.extend(["", f"Evaluation result path: `{result_path}`"]) + return "\n".join(lines) -def _report_fingerprint(report) -> str: - items = sorted(getattr(report, 'drift_metrics', {}).items()) - return "|".join(f"{k}:{round(v,4)}" for k, v in items) +def _report_fingerprint(report: DriftReport) -> str: + parts = [ + *sorted(report.breached_thresholds), + *[f"{key}:{round(value, 4)}" for key, value in sorted(report.drift_metrics.items())], + ] + return "|".join(parts) or "no-drift-details" + + +def _write_log( + path: Path, + agent_name: str, + payload: dict[str, Any], + issue_url: str | None, +) -> None: + lines = [ + f"agent={agent_name}", + f"status={payload.get('status')}", + f"backend={payload.get('backend')}", + f"drift_detected={payload.get('drift_detected')}", + ] + if issue_url: + lines.append(f"issue_url={issue_url}") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("\n".join(lines) + "\n", encoding="utf-8") def main() -> int: + """Run the monitor CLI.""" + return asyncio.run(run_monitor(parse_args())) -if __name__ == '__main__': - raise SystemExit(main()) +if __name__ == "__main__": + raise SystemExit(main()) \ No newline at end of file diff 
--git a/tests/ci/test_continuous_eval_monitor.py b/tests/ci/test_continuous_eval_monitor.py index f288577d..6c9a2b2b 100644 --- a/tests/ci/test_continuous_eval_monitor.py +++ b/tests/ci/test_continuous_eval_monitor.py @@ -1,56 +1,120 @@ -import asyncio +"""Tests for the continuous evaluation monitor helper.""" + +from __future__ import annotations + import json from pathlib import Path -import tempfile -import pytest +from holiday_peak_lib.evaluation import EvaluationRunResult +from holiday_peak_lib.evaluation.models import DriftReport, EvalSeverity + +from scripts.ci.continuous_eval_monitor import ( + _build_event_payload, + _float_metrics, + _report_fingerprint, + _write_state, + create_deduped_issue, +) + + +class FakeGitHubIssueClient: + def __init__(self, *, existing: bool = False) -> None: + self.existing = existing + self.created: list[dict[str, object]] = [] + + def search_open_drift_issue(self, _fingerprint: str) -> bool: + return self.existing + + def create_issue(self, *, title: str, body: str, labels: list[str]) -> str: + self.created.append({"title": title, "body": body, "labels": labels}) + return "https://github.com/example/repo/issues/1" + + +def test_float_metrics_skips_non_finite_values() -> None: + source = {"score": "1.0", "count": 3, "nan": "nan", "bad": None} + + assert _float_metrics(source) == {"score": 1.0, "count": 3.0} + + +def test_event_payload_marks_no_drift() -> None: + result = EvaluationRunResult( + status="ok", + backend="local-fallback", + metrics={"dataset_readiness": 1.0}, + details={"case_count": 2}, + score=1.0, + baseline_id="catalog-search:baseline", + ) + + payload = _build_event_payload("catalog-search", "unit", result, None) + + assert payload["agent_name"] == "catalog-search" + assert payload["drift_detected"] is False + assert payload["eval.score"] == 1.0 + assert payload["metrics"] == {"dataset_readiness": 1.0} + + +def test_write_state_records_drift_signal(tmp_path: Path) -> None: + report = _drift_report() + 
state_path = tmp_path / ".drift_state.json" + + _write_state(state_path, "catalog-search", report) + + state = json.loads(state_path.read_text(encoding="utf-8")) + assert state["catalog-search"]["consecutive_failures"] == 3 + assert state["catalog-search"]["last_signal_at"] -from scripts.ci.continuous_eval_monitor import _float_metrics, _report_fingerprint, create_deduped_issue +def test_create_deduped_issue_creates_when_fingerprint_is_new() -> None: + report = _drift_report() + client = FakeGitHubIssueClient(existing=False) -class DummyReport: - def __init__(self, drift_metrics, breached_thresholds=None, severity='warning', consecutive_failures=3): - self.drift_metrics = drift_metrics - self.breached_thresholds = breached_thresholds or [] - self.severity = type('S', (), {'value': severity}) - self.consecutive_failures = consecutive_failures + issue_url = create_deduped_issue( + repo="owner/repo", + token="token", + agent_name="catalog-search", + report=report, + result_path="apps/catalog/.foundry/results/run.json", + client=client, # type: ignore[arg-type] + ) + assert issue_url == "https://github.com/example/repo/issues/1" + assert client.created + assert "[DRIFT:CRITICAL] catalog-search evaluation drift" == client.created[0]["title"] + assert client.created[0]["labels"] == ["evaluation", "drift:critical"] + assert "Fingerprint:" in str(client.created[0]["body"]) -def test_float_metrics(): - src = {'a': '1.2', 'b': 3, 'c': 'nan', 'd': None} - out = _float_metrics(src) - assert out['a'] == 1.2 - assert out['b'] == 3.0 - assert 'c' not in out - assert 'd' not in out +def test_create_deduped_issue_skips_existing_fingerprint() -> None: + report = _drift_report() + client = FakeGitHubIssueClient(existing=True) -def test_report_fingerprint(): - r = DummyReport({'m1': 0.123456, 'm2': -0.5}) - fp = _report_fingerprint(r) - assert 'm1' in fp and 'm2' in fp + issue_url = create_deduped_issue( + repo="owner/repo", + token="token", + agent_name="catalog-search", + 
report=report, + result_path=None, + client=client, # type: ignore[arg-type] + ) + assert issue_url is None + assert client.created == [] -def test_create_deduped_issue_dry(monkeypatch, tmp_path): - # Verify that network failures do not raise and function returns gracefully - def fake_get(url): - class R: - ok = False - def json(self): - return {} - return R() - def fake_post(url, json=None): - class R: - ok = False - text = 'error' - status_code = 500 - def json(self): - return {} - return R() +def test_report_fingerprint_is_stable() -> None: + assert _report_fingerprint(_drift_report()) == "quality|quality:-0.25" - monkeypatch.setattr('requests.Session.get', lambda self, url: fake_get(url)) - monkeypatch.setattr('requests.Session.post', lambda self, url, json=None: fake_post(url, json=json)) - # Should not raise - create_deduped_issue(repo='owner/repo', token='x', agent='a', report=DummyReport({'m': 1.0}), result_path=str(tmp_path/'r.json')) \ No newline at end of file +def _drift_report() -> DriftReport: + return DriftReport( + agent_name="catalog-search", + run_name="unit", + severity=EvalSeverity.CRITICAL, + breached_thresholds=["quality"], + drift_metrics={"quality": -0.25}, + current_metrics={"quality": 0.55}, + baseline_metrics={"quality": 0.8}, + baseline_id="catalog-search:baseline", + consecutive_failures=3, + ) \ No newline at end of file