From 6a53676d9e68f488d1a6eb69e18757135a301a86 Mon Sep 17 00:00:00 2001 From: "Kai (via Mike Darlington)" Date: Fri, 13 Feb 2026 21:12:33 +0000 Subject: [PATCH 1/3] feat: add E2E smoke tests to CI Start PromptForge with mock Supabase client via scripts/e2e_server.py, then exercise prompt CRUD, version lifecycle, subscriptions, and audit via scripts/e2e.sh. No external services required. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yml | 22 +++++++ scripts/e2e.sh | 125 +++++++++++++++++++++++++++++++++++++++ scripts/e2e_server.py | 54 +++++++++++++++++ 3 files changed, 201 insertions(+) create mode 100755 scripts/e2e.sh create mode 100644 scripts/e2e_server.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 13104ad..4b6bfb5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,3 +18,25 @@ jobs: - run: ruff check . - run: ruff format --check . - run: pytest -v + + e2e: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + - run: pip install -e ".[dev]" + - name: Start service + run: python scripts/e2e_server.py & + env: + SUPABASE_URL: "http://localhost:0" + SUPABASE_KEY: "dummy" + - name: Wait for health + run: | + for i in $(seq 1 30); do + curl -sf http://localhost:8083/health && exit 0 || sleep 1 + done + exit 1 + - name: Run E2E tests + run: bash scripts/e2e.sh diff --git a/scripts/e2e.sh b/scripts/e2e.sh new file mode 100755 index 0000000..fdce0f0 --- /dev/null +++ b/scripts/e2e.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +set -euo pipefail + +BASE="http://localhost:8083" +FAIL=0 + +pass() { echo " PASS: $1"; } +fail() { echo " FAIL: $1 — $2"; FAIL=1; } + +echo "=== PromptForge E2E Smoke Tests ===" + +# 1. Health +echo "--- Health ---" +HTTP=$(curl -s -o /tmp/e2e_body -w '%{http_code}' "$BASE/health") +if [ "$HTTP" = "200" ]; then + pass "GET /health → 200" +else + fail "GET /health" "expected 200, got $HTTP" +fi + +# 2. List prompts (empty) +echo "--- Prompts CRUD ---" +HTTP=$(curl -s -o /tmp/e2e_body -w '%{http_code}' "$BASE/api/v1/prompts") +if [ "$HTTP" = "200" ]; then + pass "GET /api/v1/prompts → 200" +else + fail "GET /api/v1/prompts" "expected 200, got $HTTP" +fi + +# 3. Create prompt +HTTP=$(curl -s -o /tmp/e2e_body -w '%{http_code}' -X POST \ + -H "Content-Type: application/json" \ + -d '{"slug":"e2e-test","name":"E2E Test Prompt","type":"persona","description":"Test prompt","tags":["e2e"]}' \ + "$BASE/api/v1/prompts") +if [ "$HTTP" = "201" ]; then + pass "POST /api/v1/prompts → 201" +else + fail "POST /api/v1/prompts" "expected 201, got $HTTP (body: $(cat /tmp/e2e_body))" +fi + +# 4. Get prompt by slug +HTTP=$(curl -s -o /tmp/e2e_body -w '%{http_code}' "$BASE/api/v1/prompts/e2e-test") +if [ "$HTTP" = "200" ]; then + pass "GET /api/v1/prompts/e2e-test → 200" +else + fail "GET /api/v1/prompts/e2e-test" "expected 200, got $HTTP" +fi + +# 5. Create version +echo "--- Versions ---" +HTTP=$(curl -s -o /tmp/e2e_body -w '%{http_code}' -X POST \ + -H "Content-Type: application/json" \ + -d '{"content":{"sections":[{"id":"identity","label":"Identity","content":"You are an E2E test agent."}]},"message":"v1","author":"e2e-runner"}' \ + "$BASE/api/v1/prompts/e2e-test/versions") +if [ "$HTTP" = "201" ]; then + pass "POST /api/v1/prompts/e2e-test/versions → 201" +else + fail "POST /api/v1/prompts/e2e-test/versions" "expected 201, got $HTTP (body: $(cat /tmp/e2e_body))" +fi + +# 6. Get latest version +HTTP=$(curl -s -o /tmp/e2e_body -w '%{http_code}' "$BASE/api/v1/prompts/e2e-test/versions/latest") +if [ "$HTTP" = "200" ]; then + pass "GET /api/v1/prompts/e2e-test/versions/latest → 200" +else + fail "GET versions/latest" "expected 200, got $HTTP" +fi + +# 7. List versions +HTTP=$(curl -s -o /tmp/e2e_body -w '%{http_code}' "$BASE/api/v1/prompts/e2e-test/versions") +if [ "$HTTP" = "200" ]; then + pass "GET /api/v1/prompts/e2e-test/versions → 200" +else + fail "GET versions" "expected 200, got $HTTP" +fi + +# 8. Subscribe +echo "--- Subscriptions ---" +HTTP=$(curl -s -o /tmp/e2e_body -w '%{http_code}' -X POST \ + -H "X-Agent-ID: e2e-agent" \ + "$BASE/api/v1/prompts/e2e-test/subscribe") +if [ "$HTTP" = "201" ]; then + pass "POST /api/v1/prompts/e2e-test/subscribe → $HTTP" +else + fail "POST subscribe" "expected 201, got $HTTP (body: $(cat /tmp/e2e_body))" +fi + +# 9. List subscribers +HTTP=$(curl -s -o /tmp/e2e_body -w '%{http_code}' "$BASE/api/v1/prompts/e2e-test/subscribers") +if [ "$HTTP" = "200" ]; then + pass "GET /api/v1/prompts/e2e-test/subscribers → 200" + if grep -q "e2e-agent" /tmp/e2e_body; then + pass "e2e-agent in subscriber list" + else + fail "subscriber check" "e2e-agent not found" + fi +else + fail "GET subscribers" "expected 200, got $HTTP" +fi + +# 10. Audit log +echo "--- Audit ---" +HTTP=$(curl -s -o /tmp/e2e_body -w '%{http_code}' "$BASE/api/v1/audit") +if [ "$HTTP" = "200" ]; then + pass "GET /api/v1/audit → 200" +else + fail "GET /api/v1/audit" "expected 200, got $HTTP" +fi + +# 11. Delete prompt (archive returns 204) +echo "--- Cleanup ---" +HTTP=$(curl -s -o /tmp/e2e_body -w '%{http_code}' -X DELETE "$BASE/api/v1/prompts/e2e-test") +if [ "$HTTP" = "204" ]; then + pass "DELETE /api/v1/prompts/e2e-test → $HTTP" +else + fail "DELETE prompt" "expected 204, got $HTTP (body: $(cat /tmp/e2e_body))" +fi + +echo "" +if [ "$FAIL" -eq 0 ]; then + echo "All PromptForge E2E tests passed." +else + echo "Some PromptForge E2E tests FAILED." + exit 1 +fi diff --git a/scripts/e2e_server.py b/scripts/e2e_server.py new file mode 100644 index 0000000..50b537a --- /dev/null +++ b/scripts/e2e_server.py @@ -0,0 +1,54 @@ +"""Start PromptForge with mock dependencies for E2E testing. + +Patches the lru_cache'd get_supabase_client so the lifespan +can initialise without real Supabase credentials, then overrides +all FastAPI Depends so that requests also hit the mock. +""" + +import sys +from pathlib import Path + +# Ensure the repo root is on sys.path so imports resolve +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +# ── Patch the cached client BEFORE importing `app` ────────────── +from prompt_forge.db import client as _db_mod # noqa: E402 +from tests.conftest import MockSupabaseClient # noqa: E402 + +mock_db = MockSupabaseClient() + +# Clear the lru_cache and replace the function so that the +# lifespan's direct call to get_supabase_client() returns our mock. +_db_mod.get_supabase_client.cache_clear() +_original_get_supabase_client = _db_mod.get_supabase_client +_db_mod.get_supabase_client = lambda: mock_db # type: ignore[assignment] + +# ── Now import the app (triggers module-level code) ───────────── +from prompt_forge.main import app # noqa: E402 +from prompt_forge.core.audit import AuditLogger, get_audit_logger # noqa: E402 +from prompt_forge.core.composer import CompositionEngine, get_composer # noqa: E402 +from prompt_forge.core.registry import PromptRegistry, get_registry # noqa: E402 +from prompt_forge.core.resolver import PromptResolver, get_resolver # noqa: E402 +from prompt_forge.core.vcs import VersionControl, get_vcs # noqa: E402 + +# ── Build service instances backed by the mock ────────────────── +registry = PromptRegistry(mock_db) +vcs = VersionControl(mock_db) +resolver = PromptResolver(mock_db) +composer = CompositionEngine(resolver, registry) +audit = AuditLogger(mock_db) + +# ── Override FastAPI dependencies ─────────────────────────────── +# Note: get_supabase_client is already monkey-patched at module level, +# so Depends(get_supabase_client) in route handlers will call our lambda +# and return mock_db without needing an override here. +app.dependency_overrides[get_registry] = lambda: registry +app.dependency_overrides[get_vcs] = lambda: vcs +app.dependency_overrides[get_resolver] = lambda: resolver +app.dependency_overrides[get_composer] = lambda: composer +app.dependency_overrides[get_audit_logger] = lambda: audit + +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app, host="0.0.0.0", port=8083) From c966467172a2d6cc4332c8bde4e4f01bbb32c2d0 Mon Sep 17 00:00:00 2001 From: "Kai (via Mike Darlington)" Date: Mon, 16 Feb 2026 01:32:04 +0000 Subject: [PATCH 2/3] feat: add prompt effectiveness tracking and feedback loops Adds the prompt_effectiveness table, effectiveness API endpoints (create, update, summary, model-tiers, discovery-accuracy, mission cost breakdown), NATS subscribers for token/correction/session events, and background analysers for verbose prompt detection and autonomy expansion candidates. Co-Authored-By: Claude Opus 4.6 --- prompt_forge/api/effectiveness.py | 326 ++++++++++++++++++ prompt_forge/api/models.py | 74 ++++ prompt_forge/api/router.py | 2 + prompt_forge/core/analyser.py | 131 +++++++ prompt_forge/core/autonomy.py | 119 +++++++ prompt_forge/core/subscribers.py | 188 ++++++++++ .../migrations/006_prompt_effectiveness.sql | 34 ++ prompt_forge/main.py | 48 ++- tests/test_analyser.py | 63 ++++ tests/test_autonomy.py | 76 ++++ tests/test_effectiveness.py | 218 ++++++++++++ 11 files changed, 1272 insertions(+), 7 deletions(-) create mode 100644 prompt_forge/api/effectiveness.py create mode 100644 prompt_forge/core/analyser.py create mode 100644 prompt_forge/core/autonomy.py create mode 100644 prompt_forge/core/subscribers.py create mode 100644 prompt_forge/db/migrations/006_prompt_effectiveness.sql create mode 100644 tests/test_analyser.py create mode 100644 tests/test_autonomy.py create mode 100644 tests/test_effectiveness.py diff --git a/prompt_forge/api/effectiveness.py b/prompt_forge/api/effectiveness.py new file mode 100644 index 0000000..93ba843 --- /dev/null +++ b/prompt_forge/api/effectiveness.py @@ -0,0 +1,326 @@ +"""Prompt effectiveness tracking endpoints.""" + +from __future__ import annotations + +from datetime import datetime, timedelta, timezone +from typing import Any +from uuid import UUID + +import structlog +from fastapi import APIRouter, Depends, HTTPException, Query + +from prompt_forge.api.models import ( + EffectivenessCreate, + EffectivenessResponse, + EffectivenessSummary, + EffectivenessUpdate, + ModelEffectivenessResponse, +) +from prompt_forge.db.client import SupabaseClient, get_supabase_client + +logger = structlog.get_logger() +router = APIRouter() + + +@router.post("/effectiveness", status_code=201, response_model=EffectivenessResponse) +async def create_effectiveness( + data: EffectivenessCreate, + db: SupabaseClient = Depends(get_supabase_client), +) -> EffectivenessResponse: + """Create an effectiveness tracking record at session spawn.""" + row = db.insert( + "prompt_effectiveness", + { + "session_uuid": data.session_uuid, + "prompt_id": str(data.prompt_id) if data.prompt_id else None, + "version_id": str(data.version_id) if data.version_id else None, + "agent_id": data.agent_id, + "model_id": data.model_id, + "model_tier": data.model_tier, + "briefing_hash": data.briefing_hash, + "mission_id": data.mission_id, + "task_id": data.task_id, + }, + ) + return _row_to_response(row) + + +@router.patch("/effectiveness/{session_uuid}", response_model=EffectivenessResponse) +async def update_effectiveness( + session_uuid: str, + data: EffectivenessUpdate, + db: SupabaseClient = Depends(get_supabase_client), +) -> EffectivenessResponse: + """Update effectiveness record with tokens/corrections/outcome.""" + rows = db.select("prompt_effectiveness", filters={"session_uuid": session_uuid}) + if not rows: + raise HTTPException(status_code=404, detail=f"No record for session {session_uuid}") + + updates = {k: v for k, v in data.model_dump().items() if v is not None} + if updates.get("completed_at"): + updates["completed_at"] = updates["completed_at"].isoformat() + if not updates: + return _row_to_response(rows[0]) + + row = db.update("prompt_effectiveness", rows[0]["id"], updates) + return _row_to_response(row) + + +@router.get("/effectiveness/summary", response_model=list[EffectivenessSummary]) +async def effectiveness_summary( + group_by: str = Query(default="version_id", pattern=r"^(version_id|model_id|agent_id|model_tier)$"), + prompt_id: UUID | None = None, + model_id: str | None = None, + agent_id: str | None = None, + days: int = Query(default=30, ge=1, le=365), + db: SupabaseClient = Depends(get_supabase_client), +) -> list[EffectivenessSummary]: + """Aggregated effectiveness stats, filterable by prompt/model/agent/time.""" + cutoff = (datetime.now(timezone.utc) - timedelta(days=days)).isoformat() + + rows = db.select("prompt_effectiveness") + filtered = [r for r in rows if r.get("created_at", "") >= cutoff] + if prompt_id: + filtered = [r for r in filtered if r.get("prompt_id") == str(prompt_id)] + if model_id: + filtered = [r for r in filtered if r.get("model_id") == model_id] + if agent_id: + filtered = [r for r in filtered if r.get("agent_id") == agent_id] + + groups: dict[str, list[dict]] = {} + for r in filtered: + key = r.get(group_by) or "unknown" + groups.setdefault(str(key), []).append(r) + + summaries = [] + for gval, records in groups.items(): + count = len(records) + tokens = [r.get("total_tokens") for r in records if r.get("total_tokens")] + costs = [r.get("cost_usd") for r in records if r.get("cost_usd")] + scores = [r.get("outcome_score") for r in records if r.get("outcome_score") is not None] + effs = [r.get("effectiveness") for r in records if r.get("effectiveness") is not None] + corrections = sum(r.get("correction_count", 0) for r in records) + + summaries.append( + EffectivenessSummary( + group_key=group_by, + group_value=gval, + session_count=count, + avg_tokens=sum(tokens) / len(tokens) if tokens else None, + avg_cost_usd=sum(costs) / len(costs) if costs else None, + avg_outcome_score=sum(scores) / len(scores) if scores else None, + avg_effectiveness=sum(effs) / len(effs) if effs else None, + total_corrections=corrections, + correction_rate=corrections / count if count > 0 else None, + ) + ) + return summaries + + +@router.get("/effectiveness/model-tiers", response_model=ModelEffectivenessResponse) +async def model_tier_effectiveness( + days: int = Query(default=30, ge=1, le=365), + db: SupabaseClient = Depends(get_supabase_client), +) -> ModelEffectivenessResponse: + """Correction rate and avg effectiveness per model tier (Dispatch consumes this).""" + cutoff = (datetime.now(timezone.utc) - timedelta(days=days)).isoformat() + rows = db.select("prompt_effectiveness") + filtered = [r for r in rows if r.get("created_at", "") >= cutoff] + + tiers: dict[str, list[dict]] = {} + for r in filtered: + tier = r.get("model_tier") or "unknown" + tiers.setdefault(tier, []).append(r) + + result = {} + for tier_name in ("economy", "standard", "premium"): + records = tiers.get(tier_name, []) + if not records: + continue + count = len(records) + corrections = sum(r.get("correction_count", 0) for r in records) + effs = [r.get("effectiveness") for r in records if r.get("effectiveness") is not None] + scores = [r.get("outcome_score") for r in records if r.get("outcome_score") is not None] + result[tier_name] = EffectivenessSummary( + group_key="model_tier", + group_value=tier_name, + session_count=count, + avg_tokens=None, + avg_cost_usd=None, + avg_outcome_score=sum(scores) / len(scores) if scores else None, + avg_effectiveness=sum(effs) / len(effs) if effs else None, + total_corrections=corrections, + correction_rate=corrections / count if count > 0 else None, + ) + + return ModelEffectivenessResponse(**result) + + +@router.get("/effectiveness/prompt/{slug}/versions", response_model=list[EffectivenessSummary]) +async def prompt_version_effectiveness( + slug: str, + days: int = Query(default=30, ge=1, le=365), + db: SupabaseClient = Depends(get_supabase_client), +) -> list[EffectivenessSummary]: + """Compare effectiveness across prompt versions for a given slug.""" + prompts = db.select("prompts", filters={"slug": slug}) + if not prompts: + raise HTTPException(status_code=404, detail=f"Prompt '{slug}' not found") + prompt_id = prompts[0]["id"] + + cutoff = (datetime.now(timezone.utc) - timedelta(days=days)).isoformat() + rows = db.select("prompt_effectiveness") + filtered = [ + r for r in rows + if r.get("prompt_id") == str(prompt_id) and r.get("created_at", "") >= cutoff + ] + + groups: dict[str, list[dict]] = {} + for r in filtered: + vid = str(r.get("version_id") or "unknown") + groups.setdefault(vid, []).append(r) + + summaries = [] + for vid, records in groups.items(): + count = len(records) + effs = [r.get("effectiveness") for r in records if r.get("effectiveness") is not None] + scores = [r.get("outcome_score") for r in records if r.get("outcome_score") is not None] + corrections = sum(r.get("correction_count", 0) for r in records) + summaries.append( + EffectivenessSummary( + group_key="version_id", + group_value=vid, + session_count=count, + avg_tokens=None, + avg_cost_usd=None, + avg_outcome_score=sum(scores) / len(scores) if scores else None, + avg_effectiveness=sum(effs) / len(effs) if effs else None, + total_corrections=corrections, + correction_rate=corrections / count if count > 0 else None, + ) + ) + return summaries + + +@router.get("/effectiveness/compression-candidates", response_model=list[dict[str, Any]]) +async def compression_candidates() -> list[dict[str, Any]]: + """Prompt versions flagged as verbose (>2x median tokens).""" + from prompt_forge.core.analyser import analyse_verbose_prompts + return await analyse_verbose_prompts() + + +@router.get("/effectiveness/autonomy-candidates", response_model=list[dict[str, Any]]) +async def autonomy_candidates() -> list[dict[str, Any]]: + """Agents where human intervention is low enough for autonomy expansion.""" + from prompt_forge.core.autonomy import analyse_autonomy_candidates + return await analyse_autonomy_candidates() + + +@router.get("/effectiveness/mission/{mission_id}", response_model=dict[str, Any]) +async def mission_cost_breakdown( + mission_id: str, + db: SupabaseClient = Depends(get_supabase_client), +) -> dict[str, Any]: + """Planning + execution + review cost breakdown for a mission.""" + rows = db.select("prompt_effectiveness") + mission_rows = [r for r in rows if r.get("mission_id") == mission_id] + + if not mission_rows: + raise HTTPException(status_code=404, detail=f"No records for mission {mission_id}") + + total_cost = sum(r.get("cost_usd", 0) or 0 for r in mission_rows) + total_tokens = sum(r.get("total_tokens", 0) or 0 for r in mission_rows) + total_corrections = sum(r.get("correction_count", 0) or 0 for r in mission_rows) + scores = [r["outcome_score"] for r in mission_rows if r.get("outcome_score") is not None] + + # Group by task_id to approximate stages. + by_task: dict[str, list[dict]] = {} + for r in mission_rows: + tid = r.get("task_id") or "unattributed" + by_task.setdefault(tid, []).append(r) + + stages = [] + for tid, records in by_task.items(): + stage_cost = sum(r.get("cost_usd", 0) or 0 for r in records) + stage_tokens = sum(r.get("total_tokens", 0) or 0 for r in records) + stages.append({ + "task_id": tid, + "cost_usd": stage_cost, + "total_tokens": stage_tokens, + "session_count": len(records), + }) + + return { + "mission_id": mission_id, + "total_cost_usd": total_cost, + "total_tokens": total_tokens, + "total_corrections": total_corrections, + "avg_outcome_score": sum(scores) / len(scores) if scores else None, + "session_count": len(mission_rows), + "stages": stages, + } + + +@router.get("/effectiveness/discovery-accuracy", response_model=list[dict[str, Any]]) +async def discovery_accuracy( + days: int = Query(default=30, ge=1, le=365), + db: SupabaseClient = Depends(get_supabase_client), +) -> list[dict[str, Any]]: + """Initial vs post-discovery score comparison.""" + cutoff = (datetime.now(timezone.utc) - timedelta(days=days)).isoformat() + rows = db.select("prompt_effectiveness") + filtered = [ + r for r in rows + if r.get("created_at", "") >= cutoff + and r.get("mission_id") + and r.get("outcome_score") is not None + ] + + missions: dict[str, list[dict]] = {} + for r in filtered: + mid = r["mission_id"] + missions.setdefault(mid, []).append(r) + + results = [] + for mid, records in missions.items(): + sorted_recs = sorted(records, key=lambda x: x.get("created_at", "")) + if len(sorted_recs) < 2: + continue + initial = sorted_recs[0].get("outcome_score", 0) or 0 + final = sorted_recs[-1].get("outcome_score", 0) or 0 + accuracy = 1 - abs(initial - final) / max(initial, 0.001) if initial > 0 else None + results.append({ + "mission_id": mid, + "initial_score": initial, + "final_score": final, + "discovery_accuracy": accuracy, + "session_count": len(sorted_recs), + }) + return results + + +def _row_to_response(row: dict) -> EffectivenessResponse: + """Convert a database row to an EffectivenessResponse.""" + return EffectivenessResponse( + id=row["id"], + prompt_id=row.get("prompt_id"), + version_id=row.get("version_id"), + session_uuid=row["session_uuid"], + mission_id=row.get("mission_id"), + task_id=row.get("task_id"), + agent_id=row["agent_id"], + model_id=row["model_id"], + model_tier=row.get("model_tier"), + briefing_hash=row.get("briefing_hash"), + input_tokens=row.get("input_tokens"), + output_tokens=row.get("output_tokens"), + total_tokens=row.get("total_tokens"), + cost_usd=row.get("cost_usd"), + correction_count=row.get("correction_count", 0), + human_interventions=row.get("human_interventions", 0), + outcome=row.get("outcome", "unknown"), + outcome_score=row.get("outcome_score"), + effectiveness=row.get("effectiveness"), + created_at=row["created_at"], + completed_at=row.get("completed_at"), + ) diff --git a/prompt_forge/api/models.py b/prompt_forge/api/models.py index 8dbfaba..6a4f4f5 100644 --- a/prompt_forge/api/models.py +++ b/prompt_forge/api/models.py @@ -272,3 +272,77 @@ class UsageStatsResponse(BaseModel): success_rate: float avg_latency_ms: float | None version_breakdown: dict[str, int] + + +# --- Effectiveness --- + + +class EffectivenessCreate(BaseModel): + """Create an effectiveness tracking record at session spawn.""" + session_uuid: str + prompt_id: UUID | None = None + version_id: UUID | None = None + agent_id: str + model_id: str + model_tier: str | None = None + briefing_hash: str | None = None + mission_id: str | None = None + task_id: str | None = None + + +class EffectivenessUpdate(BaseModel): + """Partial update for token/correction/outcome data.""" + input_tokens: int | None = None + output_tokens: int | None = None + total_tokens: int | None = None + cost_usd: float | None = None + correction_count: int | None = None + human_interventions: int | None = None + outcome: str | None = Field(default=None, pattern=r"^(success|failure|partial|unknown)$") + outcome_score: float | None = Field(default=None, ge=0.0, le=1.0) + completed_at: datetime | None = None + + +class EffectivenessResponse(BaseModel): + """Full effectiveness record.""" + id: UUID + prompt_id: UUID | None + version_id: UUID | None + session_uuid: str + mission_id: str | None + task_id: str | None + agent_id: str + model_id: str + model_tier: str | None + briefing_hash: str | None + input_tokens: int | None + output_tokens: int | None + total_tokens: int | None + cost_usd: float | None + correction_count: int + human_interventions: int + outcome: str + outcome_score: float | None + effectiveness: float | None + created_at: datetime + completed_at: datetime | None + + +class EffectivenessSummary(BaseModel): + """Aggregated effectiveness stats per prompt version or model.""" + group_key: str + group_value: str + session_count: int + avg_tokens: float | None + avg_cost_usd: float | None + avg_outcome_score: float | None + avg_effectiveness: float | None + total_corrections: int + correction_rate: float | None + + +class ModelEffectivenessResponse(BaseModel): + """Per-model-tier correction rates and avg effectiveness.""" + economy: EffectivenessSummary | None = None + standard: EffectivenessSummary | None = None + premium: EffectivenessSummary | None = None diff --git a/prompt_forge/api/router.py b/prompt_forge/api/router.py index d6c15e3..610e1a4 100644 --- a/prompt_forge/api/router.py +++ b/prompt_forge/api/router.py @@ -7,6 +7,7 @@ from prompt_forge.api.audit import router as audit_router from prompt_forge.api.branches import router as branches_router from prompt_forge.api.compose import router as compose_router +from prompt_forge.api.effectiveness import router as effectiveness_router from prompt_forge.api.prompts import router as prompts_router from prompt_forge.api.scan import router as scan_router from prompt_forge.api.subscriptions import router as subscriptions_router @@ -25,3 +26,4 @@ api_router.include_router(scan_router, tags=["scanning"]) api_router.include_router(audit_router, tags=["audit"]) api_router.include_router(architect_router, tags=["architect"]) +api_router.include_router(effectiveness_router, tags=["effectiveness"]) diff --git a/prompt_forge/core/analyser.py b/prompt_forge/core/analyser.py new file mode 100644 index 0000000..aaf70c8 --- /dev/null +++ b/prompt_forge/core/analyser.py @@ -0,0 +1,131 @@ +"""Prompt verbosity analyser — Feedback Loop 3. + +Periodically analyses prompt_effectiveness data to identify prompt versions +that consume disproportionately many tokens relative to peers with similar +outcome scores. Flags verbose prompts and publishes NATS alerts. +""" + +from __future__ import annotations + +import asyncio +from datetime import datetime, timedelta, timezone +from statistics import median + +import structlog + +from prompt_forge.db.client import get_supabase_client + +logger = structlog.get_logger() + + +async def analyse_verbose_prompts() -> list[dict]: + """Identify prompt versions using >2x median tokens with similar outcome. + + Returns a list of flagged version records with context. + """ + db = get_supabase_client() + cutoff = (datetime.now(timezone.utc) - timedelta(days=30)).isoformat() + + rows = db.select("prompt_effectiveness") + recent = [ + r for r in rows + if r.get("created_at", "") >= cutoff + and r.get("total_tokens") + and r.get("version_id") + ] + + if not recent: + return [] + + # Group by version_id. + versions: dict[str, list[dict]] = {} + for r in recent: + vid = str(r["version_id"]) + versions.setdefault(vid, []).append(r) + + # Compute per-version averages. + version_stats = [] + for vid, records in versions.items(): + tokens = [r["total_tokens"] for r in records] + scores = [r["outcome_score"] for r in records if r.get("outcome_score") is not None] + avg_tokens = sum(tokens) / len(tokens) + avg_score = sum(scores) / len(scores) if scores else None + version_stats.append({ + "version_id": vid, + "avg_tokens": avg_tokens, + "avg_score": avg_score, + "session_count": len(records), + "prompt_id": records[0].get("prompt_id"), + }) + + if len(version_stats) < 2: + return [] + + # Median token usage across all versions. + all_avg_tokens = [v["avg_tokens"] for v in version_stats] + median_tokens = median(all_avg_tokens) + + if median_tokens == 0: + return [] + + # Flag versions using >2x median tokens. + flagged = [] + for v in version_stats: + ratio = v["avg_tokens"] / median_tokens + if ratio > 2.0: + flagged.append({ + "version_id": v["version_id"], + "prompt_id": v["prompt_id"], + "avg_tokens": v["avg_tokens"], + "median_tokens": median_tokens, + "token_ratio": round(ratio, 2), + "avg_score": v["avg_score"], + "session_count": v["session_count"], + }) + + return flagged + + +async def publish_verbose_alerts(flagged: list[dict]) -> int: + """Publish swarm.prompt.verbose.detected for each flagged version.""" + if not flagged: + return 0 + + try: + from prompt_forge.core.events import get_event_publisher + publisher = get_event_publisher() + if not publisher._connected: + return 0 + except Exception: + return 0 + + published = 0 + for item in flagged: + ok = await publisher.publish( + event_type="prompt.verbose.detected", + subject="swarm.prompt.verbose.detected", + data=item, + ) + if ok: + published += 1 + + return published + + +async def run_analyser_loop() -> None: + """Background task: analyse verbose prompts every hour.""" + while True: + try: + await asyncio.sleep(3600) # 1 hour + flagged = await analyse_verbose_prompts() + if flagged: + published = await publish_verbose_alerts(flagged) + logger.info( + "analyser.verbose_detected", + flagged=len(flagged), + published=published, + ) + except asyncio.CancelledError: + break + except Exception as e: + logger.warning("analyser.error", error=str(e)) diff --git a/prompt_forge/core/autonomy.py b/prompt_forge/core/autonomy.py new file mode 100644 index 0000000..fe00da7 --- /dev/null +++ b/prompt_forge/core/autonomy.py @@ -0,0 +1,119 @@ +"""Autonomy expansion analyser — Feedback Loop 4. + +Analyses human intervention patterns to identify gates where agent +recommendations align with human decisions >90% of the time, flagging +them as candidates for increased autonomy. +""" + +from __future__ import annotations + +import asyncio +from datetime import datetime, timedelta, timezone + +import structlog + +from prompt_forge.db.client import get_supabase_client + +logger = structlog.get_logger() + + +async def analyse_autonomy_candidates() -> list[dict]: + """Identify agents/gate types where human interventions are low enough + to consider expanding autonomy. + + Uses a rolling 30-day window. An agent is a candidate if: + - At least 10 completed sessions + - human_interventions == 0 in >90% of sessions (agent recommendation accepted) + """ + db = get_supabase_client() + cutoff = (datetime.now(timezone.utc) - timedelta(days=30)).isoformat() + + rows = db.select("prompt_effectiveness") + recent = [ + r for r in rows + if r.get("created_at", "") >= cutoff + and r.get("completed_at") + and r.get("agent_id") + ] + + if not recent: + return [] + + # Group by agent_id. + agents: dict[str, list[dict]] = {} + for r in recent: + aid = r["agent_id"] + agents.setdefault(aid, []).append(r) + + candidates = [] + for aid, records in agents.items(): + if len(records) < 10: + continue + + total = len(records) + no_intervention = sum( + 1 for r in records + if (r.get("human_interventions") or 0) == 0 + ) + alignment_rate = no_intervention / total + + if alignment_rate >= 0.9: + avg_score = None + scores = [r["outcome_score"] for r in records if r.get("outcome_score") is not None] + if scores: + avg_score = sum(scores) / len(scores) + + candidates.append({ + "agent_id": aid, + "session_count": total, + "no_intervention_count": no_intervention, + "alignment_rate": round(alignment_rate, 4), + "avg_outcome_score": round(avg_score, 4) if avg_score is not None else None, + }) + + return candidates + + +async def publish_autonomy_alerts(candidates: list[dict]) -> int: + """Publish swarm.prompt.autonomy.candidate for each candidate.""" + if not candidates: + return 0 + + try: + from prompt_forge.core.events import get_event_publisher + publisher = get_event_publisher() + if not publisher._connected: + return 0 + except Exception: + return 0 + + published = 0 + for item in candidates: + ok = await publisher.publish( + event_type="prompt.autonomy.candidate", + subject="swarm.prompt.autonomy.candidate", + data=item, + ) + if ok: + published += 1 + + return published + + +async def run_autonomy_loop() -> None: + """Background task: analyse autonomy candidates every hour.""" + while True: + try: + await asyncio.sleep(3600) # 1 hour + candidates = await analyse_autonomy_candidates() + if candidates: + published = await publish_autonomy_alerts(candidates) + logger.info( + "autonomy.candidates_detected", + candidates=len(candidates), + published=published, + ) + except asyncio.CancelledError: + break + except Exception as e: + logger.warning("autonomy.error", error=str(e)) diff --git a/prompt_forge/core/subscribers.py b/prompt_forge/core/subscribers.py new file mode 100644 index 0000000..0b5897d --- /dev/null +++ b/prompt_forge/core/subscribers.py @@ -0,0 +1,188 @@ +"""NATS subscribers for effectiveness data collection. + +Listens to: +- swarm.usage.tokens — upserts token data by session_uuid +- swarm.dredd.correction — increments correction_count by session_uuid +- swarm.cc.session.completed — updates completion metadata +""" + +from __future__ import annotations + +import json +from datetime import datetime, timezone + +import structlog + +logger = structlog.get_logger() + +_nats_available = False +try: + import nats as nats_lib + _nats_available = True +except ImportError: + pass + + +class EffectivenessSubscriber: + """Subscribes to NATS events and updates prompt_effectiveness records.""" + + def __init__(self, nats_url: str = "nats://localhost:4222") -> None: + self.nats_url = nats_url + self._nc = None + self._subs = [] + self._connected = False + + async def connect(self) -> bool: + if not _nats_available: + logger.info("subscribers.nats_not_installed") + return False + try: + self._nc = await nats_lib.connect(self.nats_url) + self._connected = True + logger.info("subscribers.connected", url=self.nats_url) + return True + except Exception as e: + logger.warning("subscribers.connect_failed", error=str(e)) + return False + + async def start(self) -> None: + if not self._connected: + return + + sub1 = await self._nc.subscribe("swarm.usage.tokens", cb=self._handle_token_usage) + sub2 = await self._nc.subscribe("swarm.dredd.correction", cb=self._handle_correction) + sub3 = await self._nc.subscribe("swarm.cc.session.completed", cb=self._handle_session_completed) + self._subs = [sub1, sub2, sub3] + logger.info("subscribers.started", subjects=["swarm.usage.tokens", "swarm.dredd.correction", "swarm.cc.session.completed"]) + + async def stop(self) -> None: + for sub in self._subs: + try: + await sub.unsubscribe() + except Exception: + pass + if self._nc and self._connected: + try: + await self._nc.close() + except Exception: + pass + self._connected = False + logger.info("subscribers.stopped") + + async def _handle_token_usage(self, msg) -> None: + """Handle swarm.usage.tokens — upsert token data by session_uuid.""" + try: + payload = json.loads(msg.data.decode()) + data = payload.get("data", payload) + session_uuid = data.get("session_uuid") + if not session_uuid: + return + + from prompt_forge.db.client import get_supabase_client + db = get_supabase_client() + + rows = db.select("prompt_effectiveness", filters={"session_uuid": session_uuid}) + if rows: + updates = {} + if data.get("input_tokens"): + updates["input_tokens"] = data["input_tokens"] + if data.get("output_tokens"): + updates["output_tokens"] = data["output_tokens"] + if data.get("total_tokens"): + updates["total_tokens"] = data["total_tokens"] + if data.get("cost_usd"): + updates["cost_usd"] = data["cost_usd"] + if data.get("model_tier") and not rows[0].get("model_tier"): + updates["model_tier"] = data["model_tier"] + if updates: + db.update("prompt_effectiveness", rows[0]["id"], updates) + logger.debug("subscribers.tokens_updated", session=session_uuid) + else: + logger.debug("subscribers.tokens_no_record", session=session_uuid) + except Exception as e: + logger.warning("subscribers.token_usage_error", error=str(e)) + + async def _handle_correction(self, msg) -> None: + """Handle swarm.dredd.correction — increment correction_count.""" + try: + payload = json.loads(msg.data.decode()) + data = payload.get("data", payload) + session_ref = data.get("session_ref") + if not session_ref: + return + + from prompt_forge.db.client import get_supabase_client + db = get_supabase_client() + + rows = db.select("prompt_effectiveness", filters={"session_uuid": session_ref}) + if not rows: + return + + row = rows[0] + updates = {"correction_count": (row.get("correction_count", 0) or 0) + 1} + + correction_type = data.get("correction_type", "") + if correction_type == "rejected": + current_score = row.get("outcome_score") + if current_score is not None: + updates["outcome_score"] = max(0.0, current_score - 0.1) + else: + updates["outcome_score"] = 0.5 + elif correction_type == "confirmed": + current_score = row.get("outcome_score") + if current_score is not None: + updates["outcome_score"] = min(1.0, current_score + 0.05) + else: + updates["outcome_score"] = 0.8 + + db.update("prompt_effectiveness", row["id"], updates) + logger.debug("subscribers.correction_applied", session=session_ref, type=correction_type) + except Exception as e: + logger.warning("subscribers.correction_error", error=str(e)) + + async def _handle_session_completed(self, msg) -> None: + """Handle swarm.cc.session.completed — update completion metadata.""" + try: + payload = json.loads(msg.data.decode()) + data = payload.get("data", payload) + session_id = data.get("session_id") + if not session_id: + return + + from prompt_forge.db.client import get_supabase_client + db = get_supabase_client() + + rows = db.select("prompt_effectiveness", filters={"session_uuid": session_id}) + if not rows: + return + + row = rows[0] + updates = {"completed_at": datetime.now(timezone.utc).isoformat()} + + exit_code = data.get("exit_code") + if exit_code == 0 and not row.get("outcome_score"): + updates["outcome"] = "success" + updates["outcome_score"] = 0.7 + elif exit_code and exit_code != 0 and not row.get("outcome_score"): + updates["outcome"] = "failure" + updates["outcome_score"] = 0.2 + + if data.get("task_id") and not row.get("task_id"): + updates["task_id"] = data["task_id"] + + db.update("prompt_effectiveness", row["id"], updates) + logger.debug("subscribers.session_completed", session=session_id) + except Exception as e: + logger.warning("subscribers.session_completed_error", error=str(e)) + + +_subscriber: EffectivenessSubscriber | None = None + + +def get_effectiveness_subscriber() -> EffectivenessSubscriber: + import os + global _subscriber + if _subscriber is None: + nats_url = os.getenv("NATS_URL", "nats://localhost:4222") + _subscriber = EffectivenessSubscriber(nats_url) + return _subscriber diff --git a/prompt_forge/db/migrations/006_prompt_effectiveness.sql b/prompt_forge/db/migrations/006_prompt_effectiveness.sql new file mode 100644 index 0000000..fe0de0a --- /dev/null +++ b/prompt_forge/db/migrations/006_prompt_effectiveness.sql @@ -0,0 +1,34 @@ +CREATE TABLE prompt_effectiveness ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + prompt_id UUID REFERENCES prompts(id), + version_id UUID REFERENCES prompt_versions(id), + session_uuid TEXT NOT NULL, + mission_id TEXT, + task_id TEXT, + agent_id TEXT NOT NULL, + model_id TEXT NOT NULL, + model_tier TEXT, + briefing_hash TEXT, + input_tokens BIGINT, + output_tokens BIGINT, + total_tokens BIGINT, + cost_usd NUMERIC(10,6), + correction_count INT DEFAULT 0, + human_interventions INT DEFAULT 0, + outcome TEXT DEFAULT 'unknown', + outcome_score FLOAT, + effectiveness FLOAT GENERATED ALWAYS AS ( + CASE WHEN cost_usd > 0 AND outcome_score IS NOT NULL + THEN outcome_score / cost_usd ELSE NULL END + ) STORED, + created_at TIMESTAMPTZ DEFAULT NOW(), + completed_at TIMESTAMPTZ +); + +CREATE INDEX idx_effectiveness_session ON prompt_effectiveness(session_uuid); +CREATE INDEX idx_effectiveness_prompt ON prompt_effectiveness(prompt_id); +CREATE INDEX idx_effectiveness_version ON prompt_effectiveness(version_id); +CREATE INDEX idx_effectiveness_model ON prompt_effectiveness(model_id); +CREATE INDEX idx_effectiveness_model_tier ON prompt_effectiveness(model_tier); +CREATE INDEX idx_effectiveness_mission ON prompt_effectiveness(mission_id); +CREATE INDEX idx_effectiveness_created ON prompt_effectiveness(created_at DESC); diff --git a/prompt_forge/main.py b/prompt_forge/main.py index 0141a5c..4800366 100644 --- a/prompt_forge/main.py +++ b/prompt_forge/main.py @@ -18,6 +18,8 @@ logger = structlog.get_logger() _cleanup_task = None +_analyser_task = None +_autonomy_task = None async def subscription_ttl_cleanup(): @@ -60,18 +62,50 @@ async def lifespan(app: FastAPI): except Exception as e: logger.info("promptforge.nats_skipped", reason=str(e)) + # Initialize NATS effectiveness subscribers (optional) + try: + from prompt_forge.core.subscribers import get_effectiveness_subscriber + subscriber = get_effectiveness_subscriber() + if await subscriber.connect(): + await subscriber.start() + except Exception as e: + logger.info("promptforge.subscribers_skipped", reason=str(e)) + # Start TTL cleanup background task _cleanup_task = asyncio.create_task(subscription_ttl_cleanup()) + # Start analyser and autonomy background tasks + global _analyser_task, _autonomy_task + try: + from prompt_forge.core.analyser import run_analyser_loop + _analyser_task = asyncio.create_task(run_analyser_loop()) + except Exception as e: + logger.info("promptforge.analyser_skipped", reason=str(e)) + + try: + from prompt_forge.core.autonomy import run_autonomy_loop + _autonomy_task = asyncio.create_task(run_autonomy_loop()) + except Exception as e: + logger.info("promptforge.autonomy_skipped", reason=str(e)) + yield - # Cancel cleanup task - if _cleanup_task: - _cleanup_task.cancel() - try: - await _cleanup_task - except asyncio.CancelledError: - pass + # Cancel background tasks + for task in (_cleanup_task, _analyser_task, _autonomy_task): + if task: + task.cancel() + try: + await task + except asyncio.CancelledError: + pass + + # Disconnect NATS subscribers + try: + from prompt_forge.core.subscribers import get_effectiveness_subscriber + subscriber = get_effectiveness_subscriber() + await subscriber.stop() + except Exception: + pass # Disconnect NATS try: diff --git a/tests/test_analyser.py b/tests/test_analyser.py new file mode 100644 index 0000000..d551dbf --- /dev/null +++ b/tests/test_analyser.py @@ -0,0 +1,63 @@ +"""Tests for prompt verbosity analyser (Loop 3).""" + +import pytest + +from prompt_forge.core.analyser import analyse_verbose_prompts + + +class TestAnalyseVerbosePrompts: + def _seed(self, mock_db, version_tokens: dict[str, int]): + """Seed prompt_effectiveness with per-version average tokens.""" + for vid, avg_tokens in version_tokens.items(): + for i in range(5): + mock_db.insert( + "prompt_effectiveness", + { + "session_uuid": f"sess-{vid}-{i}", + "version_id": vid, + "prompt_id": "prompt-1", + "agent_id": "developer", + "model_id": "claude-sonnet-4-5-20250929", + "total_tokens": avg_tokens, + "outcome_score": 0.8, + "created_at": "2026-02-15T00:00:00Z", + }, + ) + + @pytest.mark.asyncio + async def test_flags_verbose_version(self, mock_db, monkeypatch): + monkeypatch.setattr( + "prompt_forge.core.analyser.get_supabase_client", lambda: mock_db + ) + # 3 versions: A=3000, B=4000 (median), C=20000 (>2x median=4000 → ratio 5.0) + self._seed(mock_db, {"version-a": 3000, "version-b": 4000, "version-c": 20000}) + flagged = await analyse_verbose_prompts() + assert len(flagged) == 1 + assert flagged[0]["version_id"] == "version-c" + assert flagged[0]["token_ratio"] > 2.0 + + @pytest.mark.asyncio + async def test_no_flags_when_similar(self, mock_db, monkeypatch): + monkeypatch.setattr( + "prompt_forge.core.analyser.get_supabase_client", lambda: mock_db + ) + self._seed(mock_db, {"version-a": 5000, "version-b": 6000, "version-c": 5500}) + flagged = await analyse_verbose_prompts() + assert len(flagged) == 0 + + @pytest.mark.asyncio + async def test_empty_data(self, mock_db, monkeypatch): + monkeypatch.setattr( + "prompt_forge.core.analyser.get_supabase_client", lambda: mock_db + ) + flagged = await analyse_verbose_prompts() + assert flagged == [] + + @pytest.mark.asyncio + async def test_single_version_no_flag(self, mock_db, monkeypatch): + monkeypatch.setattr( + "prompt_forge.core.analyser.get_supabase_client", lambda: mock_db + ) + self._seed(mock_db, {"version-a": 50000}) + flagged = await analyse_verbose_prompts() + assert flagged == [] diff --git a/tests/test_autonomy.py b/tests/test_autonomy.py new file mode 100644 index 0000000..a9e142c --- /dev/null +++ b/tests/test_autonomy.py @@ -0,0 +1,76 @@ +"""Tests for autonomy expansion analyser (Loop 4).""" + +import pytest + +from prompt_forge.core.autonomy import analyse_autonomy_candidates + + +class TestAnalyseAutonomyCandidates: + def _seed(self, mock_db, agent_id: str, count: int, interventions: int): + """Seed prompt_effectiveness with sessions for an agent.""" + for i in range(count): + mock_db.insert( + "prompt_effectiveness", + { + "session_uuid": f"sess-{agent_id}-{i}", + "agent_id": agent_id, + "model_id": "claude-sonnet-4-5-20250929", + "human_interventions": 1 if i < interventions else 0, + "outcome_score": 0.85, + "completed_at": "2026-02-15T12:00:00Z", + "created_at": "2026-02-15T00:00:00Z", + }, + ) + + @pytest.mark.asyncio + async def test_flags_high_alignment(self, mock_db, monkeypatch): + monkeypatch.setattr( + "prompt_forge.core.autonomy.get_supabase_client", lambda: mock_db + ) + # 20 sessions, 1 with intervention (95% alignment) + self._seed(mock_db, "developer", 20, 1) + candidates = await analyse_autonomy_candidates() + assert len(candidates) == 1 + assert candidates[0]["agent_id"] == "developer" + assert candidates[0]["alignment_rate"] >= 0.9 + + @pytest.mark.asyncio + async def test_no_flag_low_alignment(self, mock_db, monkeypatch): + monkeypatch.setattr( + "prompt_forge.core.autonomy.get_supabase_client", lambda: mock_db + ) + # 20 sessions, 5 with intervention (75% alignment) + self._seed(mock_db, "developer", 20, 5) + candidates = await analyse_autonomy_candidates() + assert len(candidates) == 0 + + @pytest.mark.asyncio + async def test_no_flag_insufficient_sessions(self, mock_db, monkeypatch): + monkeypatch.setattr( + "prompt_forge.core.autonomy.get_supabase_client", lambda: mock_db + ) + # Only 5 sessions (below threshold of 10) + self._seed(mock_db, "developer", 5, 0) + candidates = await analyse_autonomy_candidates() + assert len(candidates) == 0 + + @pytest.mark.asyncio + async def test_empty_data(self, mock_db, monkeypatch): + monkeypatch.setattr( + "prompt_forge.core.autonomy.get_supabase_client", lambda: mock_db + ) + candidates = await analyse_autonomy_candidates() + assert candidates == [] + + @pytest.mark.asyncio + async def test_multiple_agents(self, mock_db, monkeypatch): + monkeypatch.setattr( + "prompt_forge.core.autonomy.get_supabase_client", lambda: mock_db + ) + # developer: 95% alignment (candidate) + self._seed(mock_db, "developer", 20, 1) + # reviewer: 70% alignment (not candidate) + self._seed(mock_db, "reviewer", 20, 6) + candidates = await analyse_autonomy_candidates() + assert len(candidates) == 1 + assert candidates[0]["agent_id"] == "developer" diff --git a/tests/test_effectiveness.py b/tests/test_effectiveness.py new file mode 100644 index 0000000..761573b --- /dev/null +++ b/tests/test_effectiveness.py @@ -0,0 +1,218 @@ +"""Tests for prompt effectiveness tracking endpoints.""" + +from uuid import uuid4 + + +class TestEffectivenessCreate: + def test_create_effectiveness(self, client, mock_db): + resp = client.post( + "/api/v1/effectiveness", + json={ + "session_uuid": "sess-001", + "agent_id": "developer", + "model_id": "claude-sonnet-4-5-20250929", + "model_tier": "standard", + }, + ) + assert resp.status_code == 201 + data = resp.json() + assert data["session_uuid"] == "sess-001" + assert data["agent_id"] == "developer" + assert data["model_id"] == "claude-sonnet-4-5-20250929" + assert data["correction_count"] == 0 + assert data["outcome"] == "unknown" + + def test_create_with_all_fields(self, client, mock_db): + prompt_id = str(uuid4()) + version_id = str(uuid4()) + resp = client.post( + "/api/v1/effectiveness", + json={ + "session_uuid": "sess-002", + "prompt_id": prompt_id, + "version_id": version_id, + "agent_id": "reviewer", + "model_id": "claude-opus-4-6", + "model_tier": "premium", + "briefing_hash": "abc123", + "mission_id": "mission-1", + "task_id": "task-1", + }, + ) + assert resp.status_code == 201 + data = resp.json() + assert data["mission_id"] == "mission-1" + assert data["briefing_hash"] == "abc123" + + +class TestEffectivenessUpdate: + def _create(self, client): + resp = client.post( + "/api/v1/effectiveness", + json={ + "session_uuid": "sess-upd", + "agent_id": "developer", + "model_id": "claude-sonnet-4-5-20250929", + }, + ) + assert resp.status_code == 201 + return resp.json() + + def test_update_tokens(self, client, mock_db): + self._create(client) + resp = client.patch( + "/api/v1/effectiveness/sess-upd", + json={"input_tokens": 5000, "output_tokens": 2000, "total_tokens": 7000}, + ) + assert resp.status_code == 200 + data = resp.json() + assert data["input_tokens"] == 5000 + assert data["total_tokens"] == 7000 + + def test_update_outcome(self, client, mock_db): + self._create(client) + resp = client.patch( + "/api/v1/effectiveness/sess-upd", + json={"outcome": "success", "outcome_score": 0.85, "cost_usd": 0.05}, + ) + assert resp.status_code == 200 + data = resp.json() + assert data["outcome"] == "success" + assert data["outcome_score"] == 0.85 + + def test_update_not_found(self, client, mock_db): + resp = client.patch( + "/api/v1/effectiveness/nonexistent", + json={"outcome": "success"}, + ) + assert resp.status_code == 404 + + +class TestEffectivenessSummary: + def _seed(self, mock_db): + for i in range(5): + mock_db.insert( + "prompt_effectiveness", + { + "session_uuid": f"sess-{i}", + "agent_id": "developer" if i < 3 else "reviewer", + "model_id": "claude-sonnet-4-5-20250929", + "model_tier": "standard", + "total_tokens": 5000 + i * 1000, + "cost_usd": 0.03 + i * 0.01, + "outcome_score": 0.7 + i * 0.05, + "effectiveness": (0.7 + i * 0.05) / (0.03 + i * 0.01), + "correction_count": i % 2, + "created_at": "2026-02-15T00:00:00Z", + }, + ) + + def test_summary_by_agent(self, client, mock_db): + self._seed(mock_db) + resp = client.get("/api/v1/effectiveness/summary?group_by=agent_id") + assert resp.status_code == 200 + data = resp.json() + assert len(data) == 2 + agents = {d["group_value"] for d in data} + assert "developer" in agents + assert "reviewer" in agents + + def test_summary_by_model_tier(self, client, mock_db): + self._seed(mock_db) + resp = client.get("/api/v1/effectiveness/summary?group_by=model_tier") + assert resp.status_code == 200 + data = resp.json() + assert len(data) >= 1 + assert data[0]["group_key"] == "model_tier" + + +class TestModelTierEffectiveness: + def _seed_tiers(self, mock_db): + for tier, count in [("economy", 5), ("standard", 10), ("premium", 3)]: + for i in range(count): + mock_db.insert( + "prompt_effectiveness", + { + "session_uuid": f"sess-{tier}-{i}", + "agent_id": "developer", + "model_id": f"model-{tier}", + "model_tier": tier, + "correction_count": 1 if tier == "economy" and i < 3 else 0, + "outcome_score": 0.7 if tier == "economy" else 0.9, + "effectiveness": 10.0 if tier == "economy" else 15.0, + "created_at": "2026-02-15T00:00:00Z", + }, + ) + + def test_model_tiers(self, client, mock_db): + self._seed_tiers(mock_db) + resp = client.get("/api/v1/effectiveness/model-tiers") + assert resp.status_code == 200 + data = resp.json() + assert "economy" in data + assert "standard" in data + assert "premium" in data + assert data["economy"]["session_count"] == 5 + assert data["standard"]["session_count"] == 10 + + +class TestMissionCostBreakdown: + def _seed_mission(self, mock_db): + for i in range(4): + mock_db.insert( + "prompt_effectiveness", + { + "session_uuid": f"sess-m-{i}", + "agent_id": "developer", + "model_id": "claude-sonnet-4-5-20250929", + "mission_id": "mission-1", + "task_id": f"task-{i % 2}", + "total_tokens": 5000, + "cost_usd": 0.05, + "outcome_score": 0.8, + "correction_count": 0, + "created_at": "2026-02-15T00:00:00Z", + }, + ) + + def test_mission_breakdown(self, client, mock_db): + self._seed_mission(mock_db) + resp = client.get("/api/v1/effectiveness/mission/mission-1") + assert resp.status_code == 200 + data = resp.json() + assert data["mission_id"] == "mission-1" + assert data["total_cost_usd"] == 0.20 + assert data["total_tokens"] == 20000 + assert data["session_count"] == 4 + assert len(data["stages"]) == 2 + + def test_mission_not_found(self, client, mock_db): + resp = client.get("/api/v1/effectiveness/mission/nonexistent") + assert resp.status_code == 404 + + +class TestDiscoveryAccuracy: + def _seed_discovery(self, mock_db): + for i, score in enumerate([0.5, 0.6, 0.7, 0.85]): + mock_db.insert( + "prompt_effectiveness", + { + "session_uuid": f"sess-d-{i}", + "agent_id": "developer", + "model_id": "claude-sonnet-4-5-20250929", + "mission_id": "mission-da", + "outcome_score": score, + "created_at": f"2026-02-1{i+2}T00:00:00Z", + }, + ) + + def test_discovery_accuracy(self, client, mock_db): + self._seed_discovery(mock_db) + resp = client.get("/api/v1/effectiveness/discovery-accuracy") + assert resp.status_code == 200 + data = resp.json() + assert len(data) == 1 + assert data[0]["mission_id"] == "mission-da" + assert data[0]["initial_score"] == 0.5 + assert data[0]["final_score"] == 0.85 + assert data[0]["discovery_accuracy"] is not None From 4b614234b7f823ce121bd972052d70c06cef738f Mon Sep 17 00:00:00 2001 From: "Kai (via Mike Darlington)" Date: Mon, 16 Feb 2026 01:36:07 +0000 Subject: [PATCH 3/3] style: ruff format --- prompt_forge/api/effectiveness.py | 42 ++++++++++++++++++----------- prompt_forge/api/models.py | 5 ++++ prompt_forge/core/analyser.py | 44 +++++++++++++++++-------------- prompt_forge/core/autonomy.py | 29 ++++++++++---------- prompt_forge/core/subscribers.py | 18 ++++++++++--- prompt_forge/main.py | 4 +++ tests/test_analyser.py | 16 +++-------- tests/test_autonomy.py | 20 ++++---------- tests/test_effectiveness.py | 2 +- 9 files changed, 98 insertions(+), 82 deletions(-) diff --git a/prompt_forge/api/effectiveness.py b/prompt_forge/api/effectiveness.py index 93ba843..38bf7f8 100644 --- a/prompt_forge/api/effectiveness.py +++ b/prompt_forge/api/effectiveness.py @@ -68,7 +68,9 @@ async def update_effectiveness( @router.get("/effectiveness/summary", response_model=list[EffectivenessSummary]) async def effectiveness_summary( - group_by: str = Query(default="version_id", pattern=r"^(version_id|model_id|agent_id|model_tier)$"), + group_by: str = Query( + default="version_id", pattern=r"^(version_id|model_id|agent_id|model_tier)$" + ), prompt_id: UUID | None = None, model_id: str | None = None, agent_id: str | None = None, @@ -171,7 +173,8 @@ async def prompt_version_effectiveness( cutoff = (datetime.now(timezone.utc) - timedelta(days=days)).isoformat() rows = db.select("prompt_effectiveness") filtered = [ - r for r in rows + r + for r in rows if r.get("prompt_id") == str(prompt_id) and r.get("created_at", "") >= cutoff ] @@ -206,6 +209,7 @@ async def prompt_version_effectiveness( async def compression_candidates() -> list[dict[str, Any]]: """Prompt versions flagged as verbose (>2x median tokens).""" from prompt_forge.core.analyser import analyse_verbose_prompts + return await analyse_verbose_prompts() @@ -213,6 +217,7 @@ async def compression_candidates() -> list[dict[str, Any]]: async def autonomy_candidates() -> list[dict[str, Any]]: """Agents where human intervention is low enough for autonomy expansion.""" from prompt_forge.core.autonomy import analyse_autonomy_candidates + return await analyse_autonomy_candidates() @@ -243,12 +248,14 @@ async def mission_cost_breakdown( for tid, records in by_task.items(): stage_cost = sum(r.get("cost_usd", 0) or 0 for r in records) stage_tokens = sum(r.get("total_tokens", 0) or 0 for r in records) - stages.append({ - "task_id": tid, - "cost_usd": stage_cost, - "total_tokens": stage_tokens, - "session_count": len(records), - }) + stages.append( + { + "task_id": tid, + "cost_usd": stage_cost, + "total_tokens": stage_tokens, + "session_count": len(records), + } + ) return { "mission_id": mission_id, @@ -270,7 +277,8 @@ async def discovery_accuracy( cutoff = (datetime.now(timezone.utc) - timedelta(days=days)).isoformat() rows = db.select("prompt_effectiveness") filtered = [ - r for r in rows + r + for r in rows if r.get("created_at", "") >= cutoff and r.get("mission_id") and r.get("outcome_score") is not None @@ -289,13 +297,15 @@ async def discovery_accuracy( initial = sorted_recs[0].get("outcome_score", 0) or 0 final = sorted_recs[-1].get("outcome_score", 0) or 0 accuracy = 1 - abs(initial - final) / max(initial, 0.001) if initial > 0 else None - results.append({ - "mission_id": mid, - "initial_score": initial, - "final_score": final, - "discovery_accuracy": accuracy, - "session_count": len(sorted_recs), - }) + results.append( + { + "mission_id": mid, + "initial_score": initial, + "final_score": final, + "discovery_accuracy": accuracy, + "session_count": len(sorted_recs), + } + ) return results diff --git a/prompt_forge/api/models.py b/prompt_forge/api/models.py index 6a4f4f5..9b26177 100644 --- a/prompt_forge/api/models.py +++ b/prompt_forge/api/models.py @@ -279,6 +279,7 @@ class UsageStatsResponse(BaseModel): class EffectivenessCreate(BaseModel): """Create an effectiveness tracking record at session spawn.""" + session_uuid: str prompt_id: UUID | None = None version_id: UUID | None = None @@ -292,6 +293,7 @@ class EffectivenessCreate(BaseModel): class EffectivenessUpdate(BaseModel): """Partial update for token/correction/outcome data.""" + input_tokens: int | None = None output_tokens: int | None = None total_tokens: int | None = None @@ -305,6 +307,7 @@ class EffectivenessUpdate(BaseModel): class EffectivenessResponse(BaseModel): """Full effectiveness record.""" + id: UUID prompt_id: UUID | None version_id: UUID | None @@ -330,6 +333,7 @@ class EffectivenessResponse(BaseModel): class EffectivenessSummary(BaseModel): """Aggregated effectiveness stats per prompt version or model.""" + group_key: str group_value: str session_count: int @@ -343,6 +347,7 @@ class EffectivenessSummary(BaseModel): class ModelEffectivenessResponse(BaseModel): """Per-model-tier correction rates and avg effectiveness.""" + economy: EffectivenessSummary | None = None standard: EffectivenessSummary | None = None premium: EffectivenessSummary | None = None diff --git a/prompt_forge/core/analyser.py b/prompt_forge/core/analyser.py index aaf70c8..704afb4 100644 --- a/prompt_forge/core/analyser.py +++ b/prompt_forge/core/analyser.py @@ -28,10 +28,9 @@ async def analyse_verbose_prompts() -> list[dict]: rows = db.select("prompt_effectiveness") recent = [ - r for r in rows - if r.get("created_at", "") >= cutoff - and r.get("total_tokens") - and r.get("version_id") + r + for r in rows + if r.get("created_at", "") >= cutoff and r.get("total_tokens") and r.get("version_id") ] if not recent: @@ -50,13 +49,15 @@ async def analyse_verbose_prompts() -> list[dict]: scores = [r["outcome_score"] for r in records if r.get("outcome_score") is not None] avg_tokens = sum(tokens) / len(tokens) avg_score = sum(scores) / len(scores) if scores else None - version_stats.append({ - "version_id": vid, - "avg_tokens": avg_tokens, - "avg_score": avg_score, - "session_count": len(records), - "prompt_id": records[0].get("prompt_id"), - }) + version_stats.append( + { + "version_id": vid, + "avg_tokens": avg_tokens, + "avg_score": avg_score, + "session_count": len(records), + "prompt_id": records[0].get("prompt_id"), + } + ) if len(version_stats) < 2: return [] @@ -73,15 +74,17 @@ async def analyse_verbose_prompts() -> list[dict]: for v in version_stats: ratio = v["avg_tokens"] / median_tokens if ratio > 2.0: - flagged.append({ - "version_id": v["version_id"], - "prompt_id": v["prompt_id"], - "avg_tokens": v["avg_tokens"], - "median_tokens": median_tokens, - "token_ratio": round(ratio, 2), - "avg_score": v["avg_score"], - "session_count": v["session_count"], - }) + flagged.append( + { + "version_id": v["version_id"], + "prompt_id": v["prompt_id"], + "avg_tokens": v["avg_tokens"], + "median_tokens": median_tokens, + "token_ratio": round(ratio, 2), + "avg_score": v["avg_score"], + "session_count": v["session_count"], + } + ) return flagged @@ -93,6 +96,7 @@ async def publish_verbose_alerts(flagged: list[dict]) -> int: try: from prompt_forge.core.events import get_event_publisher + publisher = get_event_publisher() if not publisher._connected: return 0 diff --git a/prompt_forge/core/autonomy.py b/prompt_forge/core/autonomy.py index fe00da7..e2f125f 100644 --- a/prompt_forge/core/autonomy.py +++ b/prompt_forge/core/autonomy.py @@ -30,10 +30,9 @@ async def analyse_autonomy_candidates() -> list[dict]: rows = db.select("prompt_effectiveness") recent = [ - r for r in rows - if r.get("created_at", "") >= cutoff - and r.get("completed_at") - and r.get("agent_id") + r + for r in rows + if r.get("created_at", "") >= cutoff and r.get("completed_at") and r.get("agent_id") ] if not recent: @@ -51,10 +50,7 @@ async def analyse_autonomy_candidates() -> list[dict]: continue total = len(records) - no_intervention = sum( - 1 for r in records - if (r.get("human_interventions") or 0) == 0 - ) + no_intervention = sum(1 for r in records if (r.get("human_interventions") or 0) == 0) alignment_rate = no_intervention / total if alignment_rate >= 0.9: @@ -63,13 +59,15 @@ async def analyse_autonomy_candidates() -> list[dict]: if scores: avg_score = sum(scores) / len(scores) - candidates.append({ - "agent_id": aid, - "session_count": total, - "no_intervention_count": no_intervention, - "alignment_rate": round(alignment_rate, 4), - "avg_outcome_score": round(avg_score, 4) if avg_score is not None else None, - }) + candidates.append( + { + "agent_id": aid, + "session_count": total, + "no_intervention_count": no_intervention, + "alignment_rate": round(alignment_rate, 4), + "avg_outcome_score": round(avg_score, 4) if avg_score is not None else None, + } + ) return candidates @@ -81,6 +79,7 @@ async def publish_autonomy_alerts(candidates: list[dict]) -> int: try: from prompt_forge.core.events import get_event_publisher + publisher = get_event_publisher() if not publisher._connected: return 0 diff --git a/prompt_forge/core/subscribers.py b/prompt_forge/core/subscribers.py index 0b5897d..d4e2843 100644 --- a/prompt_forge/core/subscribers.py +++ b/prompt_forge/core/subscribers.py @@ -18,6 +18,7 @@ _nats_available = False try: import nats as nats_lib + _nats_available = True except ImportError: pass @@ -51,9 +52,14 @@ async def start(self) -> None: sub1 = await self._nc.subscribe("swarm.usage.tokens", cb=self._handle_token_usage) sub2 = await self._nc.subscribe("swarm.dredd.correction", cb=self._handle_correction) - sub3 = await self._nc.subscribe("swarm.cc.session.completed", cb=self._handle_session_completed) + sub3 = await self._nc.subscribe( + "swarm.cc.session.completed", cb=self._handle_session_completed + ) self._subs = [sub1, sub2, sub3] - logger.info("subscribers.started", subjects=["swarm.usage.tokens", "swarm.dredd.correction", "swarm.cc.session.completed"]) + logger.info( + "subscribers.started", + subjects=["swarm.usage.tokens", "swarm.dredd.correction", "swarm.cc.session.completed"], + ) async def stop(self) -> None: for sub in self._subs: @@ -79,6 +85,7 @@ async def _handle_token_usage(self, msg) -> None: return from prompt_forge.db.client import get_supabase_client + db = get_supabase_client() rows = db.select("prompt_effectiveness", filters={"session_uuid": session_uuid}) @@ -112,6 +119,7 @@ async def _handle_correction(self, msg) -> None: return from prompt_forge.db.client import get_supabase_client + db = get_supabase_client() rows = db.select("prompt_effectiveness", filters={"session_uuid": session_ref}) @@ -136,7 +144,9 @@ async def _handle_correction(self, msg) -> None: updates["outcome_score"] = 0.8 db.update("prompt_effectiveness", row["id"], updates) - logger.debug("subscribers.correction_applied", session=session_ref, type=correction_type) + logger.debug( + "subscribers.correction_applied", session=session_ref, type=correction_type + ) except Exception as e: logger.warning("subscribers.correction_error", error=str(e)) @@ -150,6 +160,7 @@ async def _handle_session_completed(self, msg) -> None: return from prompt_forge.db.client import get_supabase_client + db = get_supabase_client() rows = db.select("prompt_effectiveness", filters={"session_uuid": session_id}) @@ -181,6 +192,7 @@ async def _handle_session_completed(self, msg) -> None: def get_effectiveness_subscriber() -> EffectivenessSubscriber: import os + global _subscriber if _subscriber is None: nats_url = os.getenv("NATS_URL", "nats://localhost:4222") diff --git a/prompt_forge/main.py b/prompt_forge/main.py index 4800366..0da21b5 100644 --- a/prompt_forge/main.py +++ b/prompt_forge/main.py @@ -65,6 +65,7 @@ async def lifespan(app: FastAPI): # Initialize NATS effectiveness subscribers (optional) try: from prompt_forge.core.subscribers import get_effectiveness_subscriber + subscriber = get_effectiveness_subscriber() if await subscriber.connect(): await subscriber.start() @@ -78,12 +79,14 @@ async def lifespan(app: FastAPI): global _analyser_task, _autonomy_task try: from prompt_forge.core.analyser import run_analyser_loop + _analyser_task = asyncio.create_task(run_analyser_loop()) except Exception as e: logger.info("promptforge.analyser_skipped", reason=str(e)) try: from prompt_forge.core.autonomy import run_autonomy_loop + _autonomy_task = asyncio.create_task(run_autonomy_loop()) except Exception as e: logger.info("promptforge.autonomy_skipped", reason=str(e)) @@ -102,6 +105,7 @@ async def lifespan(app: FastAPI): # Disconnect NATS subscribers try: from prompt_forge.core.subscribers import get_effectiveness_subscriber + subscriber = get_effectiveness_subscriber() await subscriber.stop() except Exception: diff --git a/tests/test_analyser.py b/tests/test_analyser.py index d551dbf..63ab50e 100644 --- a/tests/test_analyser.py +++ b/tests/test_analyser.py @@ -26,9 +26,7 @@ def _seed(self, mock_db, version_tokens: dict[str, int]): @pytest.mark.asyncio async def test_flags_verbose_version(self, mock_db, monkeypatch): - monkeypatch.setattr( - "prompt_forge.core.analyser.get_supabase_client", lambda: mock_db - ) + monkeypatch.setattr("prompt_forge.core.analyser.get_supabase_client", lambda: mock_db) # 3 versions: A=3000, B=4000 (median), C=20000 (>2x median=4000 → ratio 5.0) self._seed(mock_db, {"version-a": 3000, "version-b": 4000, "version-c": 20000}) flagged = await analyse_verbose_prompts() @@ -38,26 +36,20 @@ async def test_flags_verbose_version(self, mock_db, monkeypatch): @pytest.mark.asyncio async def test_no_flags_when_similar(self, mock_db, monkeypatch): - monkeypatch.setattr( - "prompt_forge.core.analyser.get_supabase_client", lambda: mock_db - ) + monkeypatch.setattr("prompt_forge.core.analyser.get_supabase_client", lambda: mock_db) self._seed(mock_db, {"version-a": 5000, "version-b": 6000, "version-c": 5500}) flagged = await analyse_verbose_prompts() assert len(flagged) == 0 @pytest.mark.asyncio async def test_empty_data(self, mock_db, monkeypatch): - monkeypatch.setattr( - "prompt_forge.core.analyser.get_supabase_client", lambda: mock_db - ) + monkeypatch.setattr("prompt_forge.core.analyser.get_supabase_client", lambda: mock_db) flagged = await analyse_verbose_prompts() assert flagged == [] @pytest.mark.asyncio async def test_single_version_no_flag(self, mock_db, monkeypatch): - monkeypatch.setattr( - "prompt_forge.core.analyser.get_supabase_client", lambda: mock_db - ) + monkeypatch.setattr("prompt_forge.core.analyser.get_supabase_client", lambda: mock_db) self._seed(mock_db, {"version-a": 50000}) flagged = await analyse_verbose_prompts() assert flagged == [] diff --git a/tests/test_autonomy.py b/tests/test_autonomy.py index a9e142c..811df95 100644 --- a/tests/test_autonomy.py +++ b/tests/test_autonomy.py @@ -24,9 +24,7 @@ def _seed(self, mock_db, agent_id: str, count: int, interventions: int): @pytest.mark.asyncio async def test_flags_high_alignment(self, mock_db, monkeypatch): - monkeypatch.setattr( - "prompt_forge.core.autonomy.get_supabase_client", lambda: mock_db - ) + monkeypatch.setattr("prompt_forge.core.autonomy.get_supabase_client", lambda: mock_db) # 20 sessions, 1 with intervention (95% alignment) self._seed(mock_db, "developer", 20, 1) candidates = await analyse_autonomy_candidates() @@ -36,9 +34,7 @@ async def test_flags_high_alignment(self, mock_db, monkeypatch): @pytest.mark.asyncio async def test_no_flag_low_alignment(self, mock_db, monkeypatch): - monkeypatch.setattr( - "prompt_forge.core.autonomy.get_supabase_client", lambda: mock_db - ) + monkeypatch.setattr("prompt_forge.core.autonomy.get_supabase_client", lambda: mock_db) # 20 sessions, 5 with intervention (75% alignment) self._seed(mock_db, "developer", 20, 5) candidates = await analyse_autonomy_candidates() @@ -46,9 +42,7 @@ async def test_no_flag_low_alignment(self, mock_db, monkeypatch): @pytest.mark.asyncio async def test_no_flag_insufficient_sessions(self, mock_db, monkeypatch): - monkeypatch.setattr( - "prompt_forge.core.autonomy.get_supabase_client", lambda: mock_db - ) + monkeypatch.setattr("prompt_forge.core.autonomy.get_supabase_client", lambda: mock_db) # Only 5 sessions (below threshold of 10) self._seed(mock_db, "developer", 5, 0) candidates = await analyse_autonomy_candidates() @@ -56,17 +50,13 @@ async def test_no_flag_insufficient_sessions(self, mock_db, monkeypatch): @pytest.mark.asyncio async def test_empty_data(self, mock_db, monkeypatch): - monkeypatch.setattr( - "prompt_forge.core.autonomy.get_supabase_client", lambda: mock_db - ) + monkeypatch.setattr("prompt_forge.core.autonomy.get_supabase_client", lambda: mock_db) candidates = await analyse_autonomy_candidates() assert candidates == [] @pytest.mark.asyncio async def test_multiple_agents(self, mock_db, monkeypatch): - monkeypatch.setattr( - "prompt_forge.core.autonomy.get_supabase_client", lambda: mock_db - ) + monkeypatch.setattr("prompt_forge.core.autonomy.get_supabase_client", lambda: mock_db) # developer: 95% alignment (candidate) self._seed(mock_db, "developer", 20, 1) # reviewer: 70% alignment (not candidate) diff --git a/tests/test_effectiveness.py b/tests/test_effectiveness.py index 761573b..c308e94 100644 --- a/tests/test_effectiveness.py +++ b/tests/test_effectiveness.py @@ -202,7 +202,7 @@ def _seed_discovery(self, mock_db): "model_id": "claude-sonnet-4-5-20250929", "mission_id": "mission-da", "outcome_score": score, - "created_at": f"2026-02-1{i+2}T00:00:00Z", + "created_at": f"2026-02-1{i + 2}T00:00:00Z", }, )