From c160cc28261404508ac2cda723ff0b7fdf5a05a8 Mon Sep 17 00:00:00 2001
From: Leonardo Gonzalez <leonardo.gonzalez@trilogy.com>
Date: Sun, 29 Mar 2026 23:14:52 -0500
Subject: [PATCH 01/12] feat(ci): add CI checks for config validation,
 migrations, and collectors

Add comprehensive CI workflow with quality gates:
- Lint, format-check, type-check, and test jobs
- Config validation smoke tests
- Migration validation tests
- Collector validation tests
- Quality gate that blocks merges on failures

Add Makefile targets for validation:
- validate-config: run config validation tests
- validate-migrations: run migration tests
- validate-collectors: run collector tests
- validate-all: run all validation tests

Add tests/validation/ directory with:
- test_config_validation.py: validates all YAML configs
- test_migrations.py: validates migration system
- test_collectors.py: validates collector modules

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/ci.yml                      | 203 +++++++++++++++++
 Makefile                                      |  18 +-
 src/benchmark_core/db/models.py               |   3 +-
 src/benchmark_core/db/repositories.py         |  12 +-
 .../repositories/artifact_repository.py       |   4 +-
 .../repositories/request_repository.py        |   4 +-
 src/benchmark_core/services/health_service.py |   8 +-
 src/benchmark_core/services/rendering.py      |   1 +
 .../services/session_service.py               |   1 +
 src/benchmark_core/services_abc.py            |  11 +-
 src/cli/commands/artifact.py                  |   4 +-
 src/cli/commands/normalize.py                 |   6 +-
 src/cli/commands/render.py                    |  24 +-
 src/cli/commands/session.py                   |   5 +-
 src/collectors/litellm_collector.py           |   8 +-
 src/collectors/normalize_requests.py          |  74 +++---
 src/collectors/rollup_job.py                  |   8 +-
 src/reporting/comparison.py                   |   4 +-
 src/reporting/export_service.py               |  26 +--
 tests/unit/test_artifact_commands.py          |  11 +-
 tests/unit/test_collectors.py                 |  12 +-
 tests/unit/test_diagnostics_service.py        |   2 +
 tests/unit/test_export_commands.py            |  10 +-
 tests/unit/test_health_service.py             |   3 +
 tests/unit/test_reporting.py                  |   4 +
 tests/unit/test_repositories.py               |  25 +-
 tests/unit/test_session_commands.py           |   5 +-
 tests/validation/__init__.py                  |   7 +
 tests/validation/test_collectors.py           | 142 ++++++++++++
 tests/validation/test_config_validation.py    | 213 ++++++++++++++++++
 tests/validation/test_migrations.py           | 187 +++++++++++++++
 31 files changed, 927 insertions(+), 118 deletions(-)
 create mode 100644 .github/workflows/ci.yml
 create mode 100644 tests/validation/__init__.py
 create mode 100644 tests/validation/test_collectors.py
 create mode 100644 tests/validation/test_config_validation.py
 create mode 100644 tests/validation/test_migrations.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..dd06b28
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,203 @@
+name: ci
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ci-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  lint:
+    name: Lint
+    runs-on: ubuntu-24.04
+    timeout-minutes: 10
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+
+      - name: Install dependencies
+        run: uv pip install --system -e ".[dev]"
+
+      - name: Run linter
+        run: make lint
+
+  format-check:
+    name: Format Check
+    runs-on: ubuntu-24.04
+    timeout-minutes: 10
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+
+      - name: Install dependencies
+        run: uv pip install --system -e ".[dev]"
+
+      - name: Check formatting
+        run: make format-check
+
+  type-check:
+    name: Type Check
+    runs-on: ubuntu-24.04
+    timeout-minutes: 10
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+
+      - name: Install dependencies
+        run: uv pip install --system -e ".[dev]"
+
+      - name: Run type checker
+        run: make type-check
+
+  test:
+    name: Test
+    runs-on: ubuntu-24.04
+    timeout-minutes: 15
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+
+      - name: Install dependencies
+        run: uv pip install --system -e ".[dev]"
+
+      - name: Run tests
+        run: make test
+
+  config-validation:
+    name: Config Validation
+    runs-on: ubuntu-24.04
+    timeout-minutes: 10
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+
+      - name: Install dependencies
+        run: uv pip install --system -e ".[dev]"
+
+      - name: Run config validation tests
+        run: pytest tests/validation/test_config_validation.py -v
+
+  migration-check:
+    name: Migration Check
+    runs-on: ubuntu-24.04
+    timeout-minutes: 10
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+
+      - name: Install dependencies
+        run: uv pip install --system -e ".[dev]"
+
+      - name: Run migration tests
+        run: pytest tests/validation/test_migrations.py -v
+
+  collector-check:
+    name: Collector Check
+    runs-on: ubuntu-24.04
+    timeout-minutes: 10
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+
+      - name: Install dependencies
+        run: uv pip install --system -e ".[dev]"
+
+      - name: Run collector tests
+        run: pytest tests/validation/test_collectors.py -v
+
+  quality:
+    name: Quality Gate
+    runs-on: ubuntu-24.04
+    needs: [lint, format-check, type-check, test, config-validation, migration-check, collector-check]
+    if: always()
+    steps:
+      - name: Check all jobs passed
+        run: |
+          if [[ "${{ needs.lint.result }}" != "success" || \
+                "${{ needs.format-check.result }}" != "success" || \
+                "${{ needs.type-check.result }}" != "success" || \
+                "${{ needs.test.result }}" != "success" || \
+                "${{ needs.config-validation.result }}" != "success" || \
+                "${{ needs.migration-check.result }}" != "success" || \
+                "${{ needs.collector-check.result }}" != "success" ]]; then
+            echo "::error::One or more quality checks failed"
+            exit 1
+          fi
+          echo "✓ All quality checks passed"
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 62c9e6b..0698896 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: help install install-dev sync lint format format-check type-check test test-unit test-integration test-cov clean quality dev-setup dev-check
+.PHONY: help install install-dev sync lint format format-check type-check test test-unit test-integration test-cov clean quality dev-setup dev-check validate-config validate-migrations validate-collectors validate-all
 
 # Set PYTHONPATH for all targets (handle empty PYTHONPATH case)
 export PYTHONPATH := $(PWD)/src$(if $(PYTHONPATH),:$(PYTHONPATH),)
@@ -84,3 +84,19 @@ dev-check: ## Quick check before committing
 	@make lint
 	@make type-check
 	@make test-unit
+
+# Validation tests for CI
+validate-config: ## Run config validation tests
+	@echo "Running config validation tests..."
+	pytest tests/validation/test_config_validation.py -v
+
+validate-migrations: ## Run migration validation tests
+	@echo "Running migration validation tests..."
+	pytest tests/validation/test_migrations.py -v
+
+validate-collectors: ## Run collector validation tests
+	@echo "Running collector validation tests..."
+	pytest tests/validation/test_collectors.py -v
+
+validate-all: validate-config validate-migrations validate-collectors ## Run all validation tests
+	@echo "✓ All validation tests passed"
diff --git a/src/benchmark_core/db/models.py b/src/benchmark_core/db/models.py
index e752324..fafd6ec 100644
--- a/src/benchmark_core/db/models.py
+++ b/src/benchmark_core/db/models.py
@@ -347,8 +347,7 @@ class Artifact(Base):
     __table_args__ = (
         # Ensure at least one of session_id or experiment_id is provided
         CheckConstraint(
-            'session_id IS NOT NULL OR experiment_id IS NOT NULL',
-            name='ck_artifact_scope'
+            "session_id IS NOT NULL OR experiment_id IS NOT NULL", name="ck_artifact_scope"
         ),
     )
 
diff --git a/src/benchmark_core/db/repositories.py b/src/benchmark_core/db/repositories.py
index 26af54d..30254c2 100644
--- a/src/benchmark_core/db/repositories.py
+++ b/src/benchmark_core/db/repositories.py
@@ -413,11 +413,7 @@ async def update(self, credential: ProxyCredential) -> ProxyCredential:
         Returns:
             Updated credential
         """
-        orm = (
-            self._session.query(ProxyCredentialORM)
-            .filter_by(id=credential.credential_id)
-            .first()
-        )
+        orm = self._session.query(ProxyCredentialORM).filter_by(id=credential.credential_id).first()
         if orm is None:
             raise ValueError(f"Credential {credential.credential_id} not found")
 
@@ -438,11 +434,7 @@ async def revoke(self, session_id: UUID) -> ProxyCredential | None:
         Returns:
             Updated credential metadata or None if not found
         """
-        orm = (
-            self._session.query(ProxyCredentialORM)
-            .filter_by(session_id=session_id)
-            .first()
-        )
+        orm = self._session.query(ProxyCredentialORM).filter_by(session_id=session_id).first()
         if orm is None:
             return None
 
diff --git a/src/benchmark_core/repositories/artifact_repository.py b/src/benchmark_core/repositories/artifact_repository.py
index 9e34f2b..a41e56b 100644
--- a/src/benchmark_core/repositories/artifact_repository.py
+++ b/src/benchmark_core/repositories/artifact_repository.py
@@ -117,7 +117,9 @@ async def delete(self, id: UUID) -> bool:
         """
         return await super().delete(id)
 
-    async def list_by_session(self, session_id: UUID, limit: int = 100, offset: int = 0) -> list[ArtifactORM]:
+    async def list_by_session(
+        self, session_id: UUID, limit: int = 100, offset: int = 0
+    ) -> list[ArtifactORM]:
         """List all artifacts for a specific session.
 
         Args:
diff --git a/src/benchmark_core/repositories/request_repository.py b/src/benchmark_core/repositories/request_repository.py
index 604d94f..78bccbe 100644
--- a/src/benchmark_core/repositories/request_repository.py
+++ b/src/benchmark_core/repositories/request_repository.py
@@ -145,9 +145,7 @@ async def create_many_strict(self, requests: list[RequestORM]) -> list[RequestOR
             error_msg = str(e).lower()
 
             if "requests_request_id_key" in str(e) or "unique constraint" in error_msg:
-                raise DuplicateIdentifierError(
-                    "One or more requests already exist in batch"
-                ) from e
+                raise DuplicateIdentifierError("One or more requests already exist in batch") from e
 
             if "foreign key constraint failed" in error_msg:
                 raise ReferentialIntegrityError(
diff --git a/src/benchmark_core/services/health_service.py b/src/benchmark_core/services/health_service.py
index 1758a98..efd60d5 100644
--- a/src/benchmark_core/services/health_service.py
+++ b/src/benchmark_core/services/health_service.py
@@ -133,9 +133,11 @@ def check_litellm_proxy(self) -> HealthCheckResult:
             response = requests.get(health_url, timeout=5)
 
             if response.status_code == 200:
-                data = response.json() if response.headers.get("content-type", "").startswith(
-                    "application/json"
-                ) else {}
+                data = (
+                    response.json()
+                    if response.headers.get("content-type", "").startswith("application/json")
+                    else {}
+                )
 
                 return HealthCheckResult(
                     name="litellm_proxy",
diff --git a/src/benchmark_core/services/rendering.py b/src/benchmark_core/services/rendering.py
index 54a2794..80ab5f3 100644
--- a/src/benchmark_core/services/rendering.py
+++ b/src/benchmark_core/services/rendering.py
@@ -349,6 +349,7 @@ def _substitute_template(self, value: str, context: dict[str, str]) -> str:
         Raises:
             RenderingError: If a template variable is not found in context.
         """
+
         def replace_match(match: re.Match) -> str:
             var_name = match.group(1)
             if var_name not in context:
diff --git a/src/benchmark_core/services/session_service.py b/src/benchmark_core/services/session_service.py
index d9a5f2a..e7bde53 100644
--- a/src/benchmark_core/services/session_service.py
+++ b/src/benchmark_core/services/session_service.py
@@ -223,6 +223,7 @@ async def finalize_session(
         # Validate outcome_state if provided
         if outcome_state is not None:
             from benchmark_core.models import SessionOutcomeState
+
             if not isinstance(outcome_state, SessionOutcomeState):
                 raise SessionValidationError(
                     f"outcome_state must be a SessionOutcomeState, got {type(outcome_state).__name__}"
diff --git a/src/benchmark_core/services_abc.py b/src/benchmark_core/services_abc.py
index c5c6950..d032bbe 100644
--- a/src/benchmark_core/services_abc.py
+++ b/src/benchmark_core/services_abc.py
@@ -116,7 +116,9 @@ async def create_session(
                 await self._credential_repo.create(credential)
 
                 # Update session with credential reference
-                session = session.model_copy(update={"proxy_credential_alias": credential.key_alias})
+                session = session.model_copy(
+                    update={"proxy_credential_alias": credential.key_alias}
+                )
                 session = await self._session_repo.update(session)
             except Exception:
                 # Re-raise the exception to propagate the error
@@ -196,8 +198,7 @@ def __init__(
             ("https://", "http://localhost", "http://127.0.0.1")
         ):
             raise ValueError(
-                "LiteLLM URL must use HTTPS in production environments. "
-                f"Got: {litellm_base_url}"
+                f"LiteLLM URL must use HTTPS in production environments. Got: {litellm_base_url}"
             )
 
     def _generate_key_alias(
@@ -314,9 +315,7 @@ async def issue_credential(
         litellm_key_id = data.get("key_id")
 
         if not api_key:
-            raise RuntimeError(
-                f"LiteLLM API response missing 'key' field: {data}"
-            )
+            raise RuntimeError(f"LiteLLM API response missing 'key' field: {data}")
 
         # Create credential domain model
         credential = ProxyCredential(
diff --git a/src/cli/commands/artifact.py b/src/cli/commands/artifact.py
index 52833ce..fb7b46a 100644
--- a/src/cli/commands/artifact.py
+++ b/src/cli/commands/artifact.py
@@ -234,7 +234,9 @@ def remove(
 
             # Confirm deletion
             if not force:
-                confirm = console.input(f"Remove artifact '{art.name}' ({artifact_id[:8]})? [y/N]: ")
+                confirm = console.input(
+                    f"Remove artifact '{art.name}' ({artifact_id[:8]})? [y/N]: "
+                )
                 if confirm.lower() != "y":
                     console.print("[yellow]Cancelled[/yellow]")
                     raise typer.Exit(0)
diff --git a/src/cli/commands/normalize.py b/src/cli/commands/normalize.py
index df02102..92edfdc 100644
--- a/src/cli/commands/normalize.py
+++ b/src/cli/commands/normalize.py
@@ -92,7 +92,9 @@ def run_normalization(
         raise typer.Exit(1) from err
 
     if not litellm_key:
-        console.print("[yellow]Warning: No LiteLLM API key provided. Set LITELLM_API_KEY env var.[/yellow]")
+        console.print(
+            "[yellow]Warning: No LiteLLM API key provided. Set LITELLM_API_KEY env var.[/yellow]"
+        )
 
     async def _run() -> tuple[int, ReconciliationReport]:
         # Fetch raw requests from LiteLLM
@@ -233,7 +235,7 @@ def show_reconciliation_report(
         # Query for any requests with errors
         error_stmt = select(func.count()).where(
             RequestORM.session_id == UUID(session_id),
-            RequestORM.error == True  # noqa: E712
+            RequestORM.error == True,  # noqa: E712
         )
         error_count = db_session.execute(error_stmt).scalar() or 0
 
diff --git a/src/cli/commands/render.py b/src/cli/commands/render.py
index 7cd93cf..61074bf 100644
--- a/src/cli/commands/render.py
+++ b/src/cli/commands/render.py
@@ -67,7 +67,9 @@ def render_env(
         variant_config = loader.load_variant(variant_name)
         if variant_config is None:
             console.print(f"[red]Error: Variant '{variant_name}' not found[/red]")
-            console.print(f"[dim]Looking in: {configs_dir / 'variants' / f'{variant_name}.yaml'}[/dim]")
+            console.print(
+                f"[dim]Looking in: {configs_dir / 'variants' / f'{variant_name}.yaml'}[/dim]"
+            )
             raise typer.Exit(1)
 
         variant = Variant(**variant_config)
@@ -79,7 +81,9 @@ def render_env(
         profile_config = loader.load_harness_profile(profile_name)
         if profile_config is None:
             console.print(f"[red]Error: Harness profile '{profile_name}' not found[/red]")
-            console.print(f"[dim]Looking in: {configs_dir / 'harnesses' / f'{profile_name}.yaml'}[/dim]")
+            console.print(
+                f"[dim]Looking in: {configs_dir / 'harnesses' / f'{profile_name}.yaml'}[/dim]"
+            )
             raise typer.Exit(1)
 
         profile = HarnessProfile(**profile_config)
@@ -97,7 +101,9 @@ def render_env(
 
         # Validate format
         if output_format not in ("shell", "dotenv"):
-            console.print(f"[red]Error: Invalid format '{output_format}'. Use 'shell' or 'dotenv'.[/red]")
+            console.print(
+                f"[red]Error: Invalid format '{output_format}'. Use 'shell' or 'dotenv'.[/red]"
+            )
             raise typer.Exit(1)
 
         # Render environment snippet
@@ -231,7 +237,9 @@ def list_profiles(
                 console.print(f"  [green]{profile.name}[/green]")
                 console.print(f"    Protocol: {profile.protocol_surface}")
                 console.print(f"    Format: {profile.render_format}")
-                console.print(f"    Env vars: {profile.base_url_env}, {profile.api_key_env}, {profile.model_env}")
+                console.print(
+                    f"    Env vars: {profile.base_url_env}, {profile.api_key_env}, {profile.model_env}"
+                )
                 if profile.extra_env:
                     console.print(f"    Extra: {', '.join(profile.extra_env.keys())}")
                 console.print()
@@ -269,7 +277,9 @@ def check_compatibility(
         # Load harness profile
         profile_config = loader.load_harness_profile(variant.harness_profile)
         if profile_config is None:
-            console.print(f"[red]Error: Harness profile '{variant.harness_profile}' not found[/red]")
+            console.print(
+                f"[red]Error: Harness profile '{variant.harness_profile}' not found[/red]"
+            )
             raise typer.Exit(1)
 
         profile = HarnessProfile(**profile_config)
@@ -290,7 +300,9 @@ def check_compatibility(
             for error in errors:
                 console.print(f"  ✗ {error}")
             console.print()
-            console.print("[yellow]Recommendation: Fix variant configuration or use a compatible profile[/yellow]")
+            console.print(
+                "[yellow]Recommendation: Fix variant configuration or use a compatible profile[/yellow]"
+            )
             raise typer.Exit(1)
         else:
             console.print("[green]✓ Variant and profile are compatible[/green]")
diff --git a/src/cli/commands/session.py b/src/cli/commands/session.py
index 4ba93be..4cd350c 100644
--- a/src/cli/commands/session.py
+++ b/src/cli/commands/session.py
@@ -320,6 +320,7 @@ def finalize(
 
             # Finalize with status, outcome state, and end time
             from benchmark_core.models import SessionOutcomeState
+
             outcome_enum = SessionOutcomeState(outcome)
             updated = asyncio.run(
                 service.finalize_session(
@@ -413,9 +414,7 @@ def add_notes(
             new_notes = f"{session.notes}\n{notes}" if append and session.notes else notes
 
             # Update notes
-            updated = asyncio.run(
-                service.update_session_notes(sess_uuid, new_notes)
-            )
+            updated = asyncio.run(service.update_session_notes(sess_uuid, new_notes))
             if updated is None:
                 console.print(f"[red]Failed to update notes for session: {session_id}[/red]")
                 raise typer.Exit(1)
diff --git a/src/collectors/litellm_collector.py b/src/collectors/litellm_collector.py
index 486957f..4ab9368 100644
--- a/src/collectors/litellm_collector.py
+++ b/src/collectors/litellm_collector.py
@@ -195,7 +195,9 @@ async def _fetch_raw_requests(
                     return []
 
         except httpx.HTTPStatusError as e:
-            diagnostics.add_error(f"HTTP error fetching logs: {e.response.status_code} - {e.response.text}")
+            diagnostics.add_error(
+                f"HTTP error fetching logs: {e.response.status_code} - {e.response.text}"
+            )
             return []
         except httpx.RequestError as e:
             diagnostics.add_error(f"Request error fetching logs: {e}")
@@ -235,7 +237,9 @@ def normalize_request(
             return None
 
         # Extract timestamp (required)
-        timestamp_str = raw_data.get("startTime") or raw_data.get("timestamp") or raw_data.get("created_at")
+        timestamp_str = (
+            raw_data.get("startTime") or raw_data.get("timestamp") or raw_data.get("created_at")
+        )
         if not timestamp_str:
             if diagnostics:
                 diagnostics.record_missing_field("timestamp")
diff --git a/src/collectors/normalize_requests.py b/src/collectors/normalize_requests.py
index 96272bb..5d7a2cb 100644
--- a/src/collectors/normalize_requests.py
+++ b/src/collectors/normalize_requests.py
@@ -72,9 +72,7 @@ def add_unmapped(
         # Track error counts by category
         if error_message:
             error_category = self._categorize_error(error_message)
-            self.error_counts[error_category] = (
-                self.error_counts.get(error_category, 0) + 1
-            )
+            self.error_counts[error_category] = self.error_counts.get(error_category, 0) + 1
 
         # Store detailed diagnostics (limit to first 100 for memory efficiency)
         if len(self.unmapped_diagnostics) < 100:
@@ -145,12 +143,14 @@ def to_markdown(self) -> str:
         ]
 
         if self.missing_field_counts:
-            lines.extend([
-                "## Missing Field Counts",
-                "",
-                "| Field | Count |",
-                "|-------|-------|",
-            ])
+            lines.extend(
+                [
+                    "## Missing Field Counts",
+                    "",
+                    "| Field | Count |",
+                    "|-------|-------|",
+                ]
+            )
             for field, count in sorted(
                 self.missing_field_counts.items(), key=lambda x: x[1], reverse=True
             ):
@@ -158,12 +158,14 @@ def to_markdown(self) -> str:
             lines.append("")
 
         if self.error_counts:
-            lines.extend([
-                "## Error Categories",
-                "",
-                "| Category | Count |",
-                "|----------|-------|",
-            ])
+            lines.extend(
+                [
+                    "## Error Categories",
+                    "",
+                    "| Category | Count |",
+                    "|----------|-------|",
+                ]
+            )
             for category, count in sorted(
                 self.error_counts.items(), key=lambda x: x[1], reverse=True
             ):
@@ -171,16 +173,20 @@ def to_markdown(self) -> str:
             lines.append("")
 
         if self.unmapped_diagnostics:
-            lines.extend([
-                "## Sample Unmapped Rows (First 10)",
-                "",
-            ])
-            for i, diag in enumerate(self.unmapped_diagnostics[:10]):
-                lines.extend([
-                    f"### Row {diag.row_index or i + 1}",
+            lines.extend(
+                [
+                    "## Sample Unmapped Rows (First 10)",
                     "",
-                    f"- **Reason**: {diag.reason}",
-                ])
+                ]
+            )
+            for i, diag in enumerate(self.unmapped_diagnostics[:10]):
+                lines.extend(
+                    [
+                        f"### Row {diag.row_index or i + 1}",
+                        "",
+                        f"- **Reason**: {diag.reason}",
+                    ]
+                )
                 if diag.missing_fields:
                     lines.append(f"- **Missing Fields**: {', '.join(diag.missing_fields)}")
                 if diag.error_message:
@@ -202,9 +208,7 @@ def to_dict(self) -> dict[str, Any]:
             },
             "missing_field_counts": self.missing_field_counts,
             "error_counts": self.error_counts,
-            "unmapped_rows": [
-                diag.to_dict() for diag in self.unmapped_diagnostics[:50]
-            ],
+            "unmapped_rows": [diag.to_dict() for diag in self.unmapped_diagnostics[:50]],
         }
 
 
@@ -262,9 +266,7 @@ def normalize(
 
         # Extract timestamp (required)
         timestamp_str = (
-            raw_data.get("startTime")
-            or raw_data.get("timestamp")
-            or raw_data.get("created_at")
+            raw_data.get("startTime") or raw_data.get("timestamp") or raw_data.get("created_at")
         )
         if not timestamp_str:
             missing_fields.append("timestamp")
@@ -283,9 +285,7 @@ def normalize(
             if isinstance(timestamp_str, (int, float)):
                 timestamp = datetime.fromtimestamp(timestamp_str, tz=UTC)
             elif isinstance(timestamp_str, str):
-                timestamp = datetime.fromisoformat(
-                    timestamp_str.replace("Z", "+00:00")
-                )
+                timestamp = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00"))
             else:
                 return None, UnmappedRowDiagnostics(
                     raw_data=raw_data,
@@ -375,9 +375,7 @@ def _extract_ttft(self, raw_data: dict[str, Any]) -> float | None:
                 pass
         return None
 
-    def _extract_tokens(
-        self, raw_data: dict[str, Any]
-    ) -> tuple[int | None, int | None]:
+    def _extract_tokens(self, raw_data: dict[str, Any]) -> tuple[int | None, int | None]:
         """Extract prompt and completion token counts from raw data."""
         tokens_prompt = None
         tokens_completion = None
@@ -524,7 +522,9 @@ async def run(
                 # Data quality issues were already tracked during normalization above.
                 # Here we just need to report the infrastructure failure.
                 # Re-raise to let caller handle (e.g., transaction rollback, alerting)
-                raise RuntimeError(f"Bulk insert failed after normalizing {len(requests_to_ingest)} requests: {e}") from e
+                raise RuntimeError(
+                    f"Bulk insert failed after normalizing {len(requests_to_ingest)} requests: {e}"
+                ) from e
 
         return [], report
 
diff --git a/src/collectors/rollup_job.py b/src/collectors/rollup_job.py
index 41e967b..38b6be5 100644
--- a/src/collectors/rollup_job.py
+++ b/src/collectors/rollup_job.py
@@ -299,9 +299,11 @@ async def compute_session_metrics(
         if ttfts:
             sorted_ttft = sorted(ttfts)
             n_ttft = len(sorted_ttft)
-            median_ttft = sorted_ttft[n_ttft // 2] if n_ttft % 2 == 1 else (
-                sorted_ttft[n_ttft // 2 - 1] + sorted_ttft[n_ttft // 2]
-            ) / 2
+            median_ttft = (
+                sorted_ttft[n_ttft // 2]
+                if n_ttft % 2 == 1
+                else (sorted_ttft[n_ttft // 2 - 1] + sorted_ttft[n_ttft // 2]) / 2
+            )
 
             rollups.append(
                 MetricRollup(
diff --git a/src/reporting/comparison.py b/src/reporting/comparison.py
index 227f702..2b10bd6 100644
--- a/src/reporting/comparison.py
+++ b/src/reporting/comparison.py
@@ -462,7 +462,9 @@ async def compare_models(
         }.get(order_by, Variant.model_alias)
 
         if order_by in ["session_count", "avg_latency_ms", "error_rate"]:
-            stmt = stmt.order_by(order_column.desc().nulls_last(), Variant.provider.asc(), Variant.model_alias.asc())
+            stmt = stmt.order_by(
+                order_column.desc().nulls_last(), Variant.provider.asc(), Variant.model_alias.asc()
+            )
         else:
             stmt = stmt.order_by(order_column.asc(), Variant.provider.asc())
 
diff --git a/src/reporting/export_service.py b/src/reporting/export_service.py
index db5b881..b1a91fc 100644
--- a/src/reporting/export_service.py
+++ b/src/reporting/export_service.py
@@ -151,9 +151,7 @@ def export_session(
         # Include requests if requested
         if include_requests:
             requests = self._fetch_session_requests(session_id)
-            export["requests"] = [
-                self._format_request(req) for req in requests
-            ]
+            export["requests"] = [self._format_request(req) for req in requests]
 
             # Add summary statistics
             export["summary"] = self._calculate_session_summary(requests)
@@ -223,9 +221,7 @@ def export_experiment(
                 # Include requests for each session if requested
                 if include_requests:
                     requests = self._fetch_session_requests(session.id)
-                    session_export["requests"] = [
-                        self._format_request(req) for req in requests
-                    ]
+                    session_export["requests"] = [self._format_request(req) for req in requests]
                     session_export["request_count"] = len(requests)
 
             export["sessions"] = sessions_list
@@ -244,11 +240,7 @@ def _fetch_session_requests(self, session_id: UUID) -> list[Request]:
         Returns:
             List of Request objects.
         """
-        stmt = (
-            select(Request)
-            .where(Request.session_id == session_id)
-            .order_by(Request.timestamp)
-        )
+        stmt = select(Request).where(Request.session_id == session_id).order_by(Request.timestamp)
         result = self._db_session.execute(stmt).scalars().all()
         return list(result)
 
@@ -428,7 +420,14 @@ def to_csv(
             fieldnames = ExportService.REQUEST_EXPORT_FIELDS
         elif record_type == "sessions" and "sessions" in data:
             records = data["sessions"]
-            fieldnames = ["id", "variant_id", "status", "started_at", "ended_at", "duration_seconds"]
+            fieldnames = [
+                "id",
+                "variant_id",
+                "status",
+                "started_at",
+                "ended_at",
+                "duration_seconds",
+            ]
         else:
             # Fallback to session-level data for single session export
             if "session" in data:
@@ -476,8 +475,7 @@ def to_parquet(
             import pyarrow.parquet as pq
         except ImportError as e:
             raise ImportError(
-                "Parquet export requires pyarrow. "
-                "Install with: pip install pyarrow"
+                "Parquet export requires pyarrow. Install with: pip install pyarrow"
             ) from e
 
         output_path.parent.mkdir(parents=True, exist_ok=True)
diff --git a/tests/unit/test_artifact_commands.py b/tests/unit/test_artifact_commands.py
index 873510b..469facb 100644
--- a/tests/unit/test_artifact_commands.py
+++ b/tests/unit/test_artifact_commands.py
@@ -1,6 +1,5 @@
 """Tests for artifact CLI commands."""
 
-
 import pytest
 from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker
@@ -58,7 +57,6 @@ def mock_env_db_url(test_engine, monkeypatch):
     # Patch the session factory used by CLI commands
     from benchmark_core.db import session as db_session_module
 
-
     def mock_create_engine(url, **kwargs):
         return test_engine
 
@@ -66,7 +64,10 @@ def mock_create_engine(url, **kwargs):
 
     # Also patch in the artifact module
     from cli.commands import artifact as artifact_module
-    monkeypatch.setattr(artifact_module, "get_db_session", lambda: db_session_module.get_db_session(test_engine))
+
+    monkeypatch.setattr(
+        artifact_module, "get_db_session", lambda: db_session_module.get_db_session(test_engine)
+    )
 
     yield test_engine
 
@@ -173,7 +174,9 @@ def test_register_artifact_with_experiment(self, db_session, mock_env_db_url, ru
         assert artifact.experiment_id == experiment.id
         assert artifact.session_id is None
 
-    def test_register_artifact_requires_session_or_experiment(self, mock_env_db_url, runner, tmp_path):
+    def test_register_artifact_requires_session_or_experiment(
+        self, mock_env_db_url, runner, tmp_path
+    ):
         """Test that artifact registration requires session or experiment."""
         test_file = tmp_path / "test.txt"
         test_file.write_text("test")
diff --git a/tests/unit/test_collectors.py b/tests/unit/test_collectors.py
index c5b3c19..4915704 100644
--- a/tests/unit/test_collectors.py
+++ b/tests/unit/test_collectors.py
@@ -561,9 +561,7 @@ async def test_collect_requests_idempotent_insert() -> None:
     ]
     collector._fetch_raw_requests = AsyncMock(return_value=raw_requests)  # type: ignore
 
-    collected, diagnostics, watermark = await collector.collect_requests(
-        session_id=session_id
-    )
+    collected, diagnostics, watermark = await collector.collect_requests(session_id=session_id)
 
     # Verify repository.create_many was called
     mock_repo.create_many.assert_called_once()
@@ -611,7 +609,9 @@ async def test_fetch_raw_requests_watermark_respects_start_time() -> None:
     assert route.called
     request = route.calls.last.request
     # The actual start_time used should be max(start_time, watermark.last_timestamp)
-    assert "start_time=2025-03-27T12%3A00%3A00" in str(request.url) or "start_time=2025-03-27T12:00:00" in str(request.url)
+    assert "start_time=2025-03-27T12%3A00%3A00" in str(
+        request.url
+    ) or "start_time=2025-03-27T12:00:00" in str(request.url)
 
 
 @pytest.mark.asyncio
@@ -644,7 +644,9 @@ async def test_fetch_raw_requests_uses_start_time_when_no_watermark() -> None:
     # Verify request uses the provided start_time
     assert route.called
     request = route.calls.last.request
-    assert "start_time=2025-03-26T10%3A00%3A00" in str(request.url) or "start_time=2025-03-26T10:00:00" in str(request.url)
+    assert "start_time=2025-03-26T10%3A00%3A00" in str(
+        request.url
+    ) or "start_time=2025-03-26T10:00:00" in str(request.url)
 
 
 # =============================================================================
diff --git a/tests/unit/test_diagnostics_service.py b/tests/unit/test_diagnostics_service.py
index 96d2503..826acbf 100644
--- a/tests/unit/test_diagnostics_service.py
+++ b/tests/unit/test_diagnostics_service.py
@@ -170,6 +170,7 @@ def test_diagnose_services_litellm_success(self, diagnostics_service):
     def test_diagnose_services_litellm_connection_error(self, diagnostics_service):
         """Test LiteLLM service diagnostics with connection error."""
         import requests
+
         with patch("requests.get", side_effect=requests.exceptions.ConnectionError()):
             diag = diagnostics_service._diagnose_litellm()
 
@@ -195,6 +196,7 @@ def test_diagnose_services_prometheus_success(self, diagnostics_service):
     def test_diagnose_services_prometheus_connection_error(self, diagnostics_service):
         """Test Prometheus service diagnostics with connection error."""
         import requests
+
         with patch("requests.get", side_effect=requests.exceptions.ConnectionError()):
             diag = diagnostics_service._diagnose_prometheus()
 
diff --git a/tests/unit/test_export_commands.py b/tests/unit/test_export_commands.py
index 9b832e1..b76e22a 100644
--- a/tests/unit/test_export_commands.py
+++ b/tests/unit/test_export_commands.py
@@ -230,9 +230,7 @@ def test_export_session_with_artifact_registration(
         # Verify artifact was created
         # Need to refresh the session to see the artifact
         db_session.expire_all()
-        artifacts = db_session.query(Artifact).filter_by(
-            session_id=sample_data["session"].id
-        ).all()
+        artifacts = db_session.query(Artifact).filter_by(session_id=sample_data["session"].id).all()
         assert len(artifacts) >= 1
         assert any(a.artifact_type == "export" for a in artifacts)
 
@@ -399,9 +397,9 @@ def test_export_experiment_with_artifact_registration(
 
         # Verify artifact was created
         db_session.expire_all()
-        artifacts = db_session.query(Artifact).filter_by(
-            experiment_id=sample_data["experiment"].id
-        ).all()
+        artifacts = (
+            db_session.query(Artifact).filter_by(experiment_id=sample_data["experiment"].id).all()
+        )
         assert len(artifacts) >= 1
 
 
diff --git a/tests/unit/test_health_service.py b/tests/unit/test_health_service.py
index 8f8a81d..d43d0b3 100644
--- a/tests/unit/test_health_service.py
+++ b/tests/unit/test_health_service.py
@@ -80,6 +80,7 @@ def test_check_litellm_proxy_success(self, health_service):
     def test_check_litellm_proxy_connection_error(self, health_service):
         """Test LiteLLM proxy check with connection error."""
         import requests
+
         with patch("requests.get", side_effect=requests.exceptions.ConnectionError()):
             result = health_service.check_litellm_proxy()
 
@@ -90,6 +91,7 @@ def test_check_litellm_proxy_connection_error(self, health_service):
     def test_check_litellm_proxy_timeout(self, health_service):
         """Test LiteLLM proxy check with timeout."""
         import requests
+
         with patch("requests.get", side_effect=requests.exceptions.Timeout()):
             result = health_service.check_litellm_proxy()
 
@@ -121,6 +123,7 @@ def test_check_prometheus_success(self, health_service):
     def test_check_prometheus_connection_error(self, health_service):
         """Test Prometheus check with connection error."""
         import requests
+
         with patch("requests.get", side_effect=requests.exceptions.ConnectionError()):
             result = health_service.check_prometheus()
 
diff --git a/tests/unit/test_reporting.py b/tests/unit/test_reporting.py
index 55b865d..fe59f5a 100644
--- a/tests/unit/test_reporting.py
+++ b/tests/unit/test_reporting.py
@@ -200,6 +200,7 @@ def test_comparison_service_empty_sessions() -> None:
 
     # Test with empty session list
     import asyncio
+
     result = asyncio.run(service.compare_sessions([]))
 
     assert result == {"sessions": [], "summary": {}}
@@ -221,6 +222,7 @@ def test_comparison_service_missing_experiment() -> None:
     service = ComparisonService(db_session=mock_session)
 
     import asyncio
+
     result = asyncio.run(service.compare_variants(uuid4()))
 
     assert result == []
@@ -251,6 +253,7 @@ def test_comparison_service_variant_query_construction() -> None:
 
     exp_id = uuid4()
     import asyncio
+
     result = asyncio.run(service.compare_variants(exp_id, include_invalid=False))
 
     assert result == []
@@ -284,6 +287,7 @@ def test_comparison_service_with_include_invalid() -> None:
 
     # Call with include_invalid=True should not add outcome_state filter
     import asyncio
+
     _ = asyncio.run(service.compare_variants(uuid4(), include_invalid=True))
 
     # Get the SQL that was executed
diff --git a/tests/unit/test_repositories.py b/tests/unit/test_repositories.py
index 3557bf7..f05a54b 100644
--- a/tests/unit/test_repositories.py
+++ b/tests/unit/test_repositories.py
@@ -547,6 +547,7 @@ class TestProxyCredentialRepository:
     def credential_repo(self, db_session):
         """Create a credential repository."""
         from benchmark_core.db.repositories import ProxyCredentialRepository
+
         return ProxyCredentialRepository(db_session)
 
     @pytest.fixture
@@ -596,7 +597,9 @@ async def setup_session_with_credential(self, db_session):
 
         return session, experiment, variant
 
-    async def test_create_credential(self, credential_repo, db_session, setup_session_with_credential):
+    async def test_create_credential(
+        self, credential_repo, db_session, setup_session_with_credential
+    ):
         """Test creating a credential metadata record."""
         from datetime import UTC, datetime, timedelta
 
@@ -690,7 +693,9 @@ async def test_get_by_alias(self, credential_repo, db_session, setup_session_wit
         assert found.key_alias == "unique-test-alias-12345"
         assert found.litellm_key_id == "litellm-789"
 
-    async def test_revoke_credential(self, credential_repo, db_session, setup_session_with_credential):
+    async def test_revoke_credential(
+        self, credential_repo, db_session, setup_session_with_credential
+    ):
         """Test revoking a credential."""
         from datetime import UTC, datetime, timedelta
 
@@ -777,7 +782,9 @@ def setup_experiment_and_session(self, db_session):
         return experiment, session
 
     @pytest.mark.asyncio
-    async def test_create_artifact_with_session(self, db_session, artifact_repo, setup_experiment_and_session):
+    async def test_create_artifact_with_session(
+        self, db_session, artifact_repo, setup_experiment_and_session
+    ):
         """Test creating an artifact linked to a session."""
         experiment, session = setup_experiment_and_session
 
@@ -799,7 +806,9 @@ async def test_create_artifact_with_session(self, db_session, artifact_repo, set
         assert created.experiment_id is None
 
     @pytest.mark.asyncio
-    async def test_create_artifact_with_experiment(self, db_session, artifact_repo, setup_experiment_and_session):
+    async def test_create_artifact_with_experiment(
+        self, db_session, artifact_repo, setup_experiment_and_session
+    ):
         """Test creating an artifact linked to an experiment."""
         experiment, _ = setup_experiment_and_session
 
@@ -819,7 +828,9 @@ async def test_create_artifact_with_experiment(self, db_session, artifact_repo,
         assert created.session_id is None
 
     @pytest.mark.asyncio
-    async def test_get_artifact_by_id(self, db_session, artifact_repo, setup_experiment_and_session):
+    async def test_get_artifact_by_id(
+        self, db_session, artifact_repo, setup_experiment_and_session
+    ):
         """Test retrieving an artifact by ID."""
         experiment, session = setup_experiment_and_session
 
@@ -874,7 +885,9 @@ async def test_list_by_session(self, db_session, artifact_repo, setup_experiment
             assert result.session_id == session.id
 
     @pytest.mark.asyncio
-    async def test_list_by_experiment(self, db_session, artifact_repo, setup_experiment_and_session):
+    async def test_list_by_experiment(
+        self, db_session, artifact_repo, setup_experiment_and_session
+    ):
         """Test listing artifacts by experiment."""
         experiment, session = setup_experiment_and_session
 
diff --git a/tests/unit/test_session_commands.py b/tests/unit/test_session_commands.py
index d07ee88..17a404a 100644
--- a/tests/unit/test_session_commands.py
+++ b/tests/unit/test_session_commands.py
@@ -431,7 +431,9 @@ def test_finalize_session_with_custom_status(self, db_session, mock_env_db_url,
         db_session.add(session)
         db_session.commit()
 
-        result = runner.invoke(app, ["session", "finalize", str(session.id), "--outcome", "invalid"])
+        result = runner.invoke(
+            app, ["session", "finalize", str(session.id), "--outcome", "invalid"]
+        )
 
         assert result.exit_code == 0, f"Exit code: {result.exit_code}, Output: {result.output}"
 
@@ -540,7 +542,6 @@ def test_env_session_not_found(self, mock_env_db_url, runner):
         assert "Session not found" in result.output or "Exit" in result.output
 
 
-
 class TestSessionAddNotesCommand:
     """Tests for session add-notes CLI command."""
 
diff --git a/tests/validation/__init__.py b/tests/validation/__init__.py
new file mode 100644
index 0000000..b63eba7
--- /dev/null
+++ b/tests/validation/__init__.py
@@ -0,0 +1,7 @@
+"""Validation tests for CI pipeline.
+
+These tests verify:
+- Config file schemas are valid
+- Database migrations work correctly
+- Collector implementations are properly structured
+"""
diff --git a/tests/validation/test_collectors.py b/tests/validation/test_collectors.py
new file mode 100644
index 0000000..0787640
--- /dev/null
+++ b/tests/validation/test_collectors.py
@@ -0,0 +1,142 @@
+"""Collector validation smoke tests for CI.
+
+Validates collector implementations are properly structured and importable.
+This ensures collector regressions are caught automatically.
+"""
+
+from pathlib import Path
+
+import pytest
+
+
+class TestCollectorStructure:
+    """Test collector module structure."""
+
+    def test_collectors_directory_exists(self) -> None:
+        """Verify collectors directory exists."""
+        collectors_dir = Path(__file__).parent.parent.parent / "src" / "collectors"
+        assert collectors_dir.exists(), f"Collectors directory not found: {collectors_dir}"
+
+    def test_collectors_init_exists(self) -> None:
+        """Verify collectors __init__.py exists."""
+        init_file = Path(__file__).parent.parent.parent / "src" / "collectors" / "__init__.py"
+        assert init_file.exists(), f"Collectors __init__.py not found: {init_file}"
+
+
+class TestCollectorImports:
+    """Test that all collector modules can be imported."""
+
+    def test_import_litellm_collector(self) -> None:
+        """Test importing litellm_collector module."""
+        try:
+            from collectors import litellm_collector
+
+            assert litellm_collector is not None
+        except ImportError as e:
+            pytest.fail(f"Failed to import litellm_collector: {e}")
+
+    def test_import_prometheus_collector(self) -> None:
+        """Test importing prometheus_collector module."""
+        try:
+            from collectors import prometheus_collector
+
+            assert prometheus_collector is not None
+        except ImportError as e:
+            pytest.fail(f"Failed to import prometheus_collector: {e}")
+
+    def test_import_normalization(self) -> None:
+        """Test importing normalization module."""
+        try:
+            from collectors import normalization
+
+            assert normalization is not None
+        except ImportError as e:
+            pytest.fail(f"Failed to import normalization: {e}")
+
+    def test_import_normalize_requests(self) -> None:
+        """Test importing normalize_requests module."""
+        try:
+            from collectors import normalize_requests
+
+            assert normalize_requests is not None
+        except ImportError as e:
+            pytest.fail(f"Failed to import normalize_requests: {e}")
+
+    def test_import_metric_catalog(self) -> None:
+        """Test importing metric_catalog module."""
+        try:
+            from collectors import metric_catalog
+
+            assert metric_catalog is not None
+        except ImportError as e:
+            pytest.fail(f"Failed to import metric_catalog: {e}")
+
+    def test_import_rollup_job(self) -> None:
+        """Test importing rollup_job module."""
+        try:
+            from collectors import rollup_job
+
+            assert rollup_job is not None
+        except ImportError as e:
+            pytest.fail(f"Failed to import rollup_job: {e}")
+
+    def test_import_retention_cleanup(self) -> None:
+        """Test importing retention_cleanup module."""
+        try:
+            from collectors import retention_cleanup
+
+            assert retention_cleanup is not None
+        except ImportError as e:
+            pytest.fail(f"Failed to import retention_cleanup: {e}")
+
+
+class TestCollectorFunctionSignatures:
+    """Test that collector functions have expected signatures."""
+
+    def test_litellm_collector_has_collect_function(self) -> None:
+        """Test litellm_collector has collection function."""
+        from collectors import litellm_collector
+
+        # Check for expected functions/classes
+        # This will vary based on actual implementation
+        assert hasattr(litellm_collector, "__all__") or any(
+            callable(getattr(litellm_collector, name))
+            for name in dir(litellm_collector)
+            if not name.startswith("_")
+        )
+
+    def test_prometheus_collector_has_collect_function(self) -> None:
+        """Test prometheus_collector has collection function."""
+        from collectors import prometheus_collector
+
+        # Check for expected functions/classes
+        assert hasattr(prometheus_collector, "__all__") or any(
+            callable(getattr(prometheus_collector, name))
+            for name in dir(prometheus_collector)
+            if not name.startswith("_")
+        )
+
+
+class TestCollectorModuleDocstrings:
+    """Test that collector modules have proper documentation."""
+
+    def test_litellm_collector_has_docstring(self) -> None:
+        """Test litellm_collector module has docstring."""
+        from collectors import litellm_collector
+
+        assert litellm_collector.__doc__ is not None
+        assert len(litellm_collector.__doc__) > 0
+
+    def test_prometheus_collector_has_docstring(self) -> None:
+        """Test prometheus_collector module has docstring."""
+        from collectors import prometheus_collector
+
+        assert prometheus_collector.__doc__ is not None
+        assert len(prometheus_collector.__doc__) > 0
+
+    def test_normalization_has_docstring(self) -> None:
+        """Test normalization module has docstring."""
+        from collectors import normalization
+
+        assert normalization.__doc__ is not None
+        assert len(normalization.__doc__) > 0
diff --git a/tests/validation/test_config_validation.py b/tests/validation/test_config_validation.py
new file mode 100644
index 0000000..eaa24d6
--- /dev/null
+++ b/tests/validation/test_config_validation.py
@@ -0,0 +1,213 @@
+"""Config validation smoke tests for CI.
+
+Validates all YAML config files against their typed Pydantic schemas.
+This ensures config regressions are caught automatically.
+"""
+
+from pathlib import Path
+
+import pytest
+import yaml
+
+from benchmark_core.config import (
+    Experiment,
+    HarnessProfile,
+    ProviderConfig,
+    TaskCard,
+    Variant,
+)
+from benchmark_core.config_loader import ConfigLoader
+
+# Config directories
+CONFIG_ROOT = Path(__file__).parent.parent.parent / "configs"
+PROVIDERS_DIR = CONFIG_ROOT / "providers"
+HARNESSES_DIR = CONFIG_ROOT / "harnesses"
+VARIANTS_DIR = CONFIG_ROOT / "variants"
+EXPERIMENTS_DIR = CONFIG_ROOT / "experiments"
+TASK_CARDS_DIR = CONFIG_ROOT / "task-cards"
+
+
+class TestProviderConfigs:
+    """Validate all provider config files."""
+
+    @pytest.fixture
+    def provider_files(self) -> list[Path]:
+        """Get all provider config files."""
+        if not PROVIDERS_DIR.exists():
+            return []
+        return list(PROVIDERS_DIR.glob("*.yaml"))
+
+    def test_providers_directory_exists(self) -> None:
+        """Verify providers config directory exists."""
+        assert PROVIDERS_DIR.exists(), f"Providers config directory not found: {PROVIDERS_DIR}"
+
+    def test_all_provider_configs_valid(self, provider_files: list[Path]) -> None:
+        """Validate all provider config files parse and validate correctly."""
+        if not provider_files:
+            pytest.skip("No provider config files found")
+
+        for config_file in provider_files:
+            with open(config_file) as f:
+                data = yaml.safe_load(f)
+
+            # Validate against schema
+            try:
+                ProviderConfig(**data)
+            except Exception as e:
+                pytest.fail(f"Provider config {config_file.name} failed validation: {e}")
+
+
+class TestHarnessProfiles:
+    """Validate all harness profile config files."""
+
+    @pytest.fixture
+    def harness_files(self) -> list[Path]:
+        """Get all harness profile config files."""
+        if not HARNESSES_DIR.exists():
+            return []
+        return list(HARNESSES_DIR.glob("*.yaml"))
+
+    def test_harnesses_directory_exists(self) -> None:
+        """Verify harnesses config directory exists."""
+        assert HARNESSES_DIR.exists(), f"Harnesses config directory not found: {HARNESSES_DIR}"
+
+    def test_all_harness_configs_valid(self, harness_files: list[Path]) -> None:
+        """Validate all harness profile config files parse and validate correctly."""
+        if not harness_files:
+            pytest.skip("No harness config files found")
+
+        for config_file in harness_files:
+            with open(config_file) as f:
+                data = yaml.safe_load(f)
+
+            # Validate against schema
+            try:
+                HarnessProfile(**data)
+            except Exception as e:
+                pytest.fail(f"Harness profile {config_file.name} failed validation: {e}")
+
+
+class TestVariantConfigs:
+    """Validate all variant config files."""
+
+    @pytest.fixture
+    def variant_files(self) -> list[Path]:
+        """Get all variant config files."""
+        if not VARIANTS_DIR.exists():
+            return []
+        return list(VARIANTS_DIR.glob("*.yaml"))
+
+    def test_variants_directory_exists(self) -> None:
+        """Verify variants config directory exists."""
+        assert VARIANTS_DIR.exists(), f"Variants config directory not found: {VARIANTS_DIR}"
+
+    def test_all_variant_configs_valid(self, variant_files: list[Path]) -> None:
+        """Validate all variant config files parse and validate correctly."""
+        if not variant_files:
+            pytest.skip("No variant config files found")
+
+        for config_file in variant_files:
+            with open(config_file) as f:
+                data = yaml.safe_load(f)
+
+            # Validate against schema
+            try:
+                Variant(**data)
+            except Exception as e:
+                pytest.fail(f"Variant config {config_file.name} failed validation: {e}")
+
+
+class TestExperimentConfigs:
+    """Validate all experiment config files."""
+
+    @pytest.fixture
+    def experiment_files(self) -> list[Path]:
+        """Get all experiment config files."""
+        if not EXPERIMENTS_DIR.exists():
+            return []
+        return list(EXPERIMENTS_DIR.glob("*.yaml"))
+
+    def test_experiments_directory_exists(self) -> None:
+        """Verify experiments config directory exists."""
+        assert EXPERIMENTS_DIR.exists(), (
+            f"Experiments config directory not found: {EXPERIMENTS_DIR}"
+        )
+
+    def test_all_experiment_configs_valid(self, experiment_files: list[Path]) -> None:
+        """Validate all experiment config files parse and validate correctly."""
+        if not experiment_files:
+            pytest.skip("No experiment config files found")
+
+        for config_file in experiment_files:
+            with open(config_file) as f:
+                data = yaml.safe_load(f)
+
+            # Validate against schema
+            try:
+                Experiment(**data)
+            except Exception as e:
+                pytest.fail(f"Experiment config {config_file.name} failed validation: {e}")
+
+
+class TestTaskCardConfigs:
+    """Validate all task card config files."""
+
+    @pytest.fixture
+    def task_card_files(self) -> list[Path]:
+        """Get all task card config files."""
+        if not TASK_CARDS_DIR.exists():
+            return []
+        return list(TASK_CARDS_DIR.glob("*.yaml"))
+
+    def test_task_cards_directory_exists(self) -> None:
+        """Verify task cards config directory exists."""
+        assert TASK_CARDS_DIR.exists(), f"Task cards config directory not found: {TASK_CARDS_DIR}"
+
+    def test_all_task_card_configs_valid(self, task_card_files: list[Path]) -> None:
+        """Validate all task card config files parse and validate correctly."""
+        if not task_card_files:
+            pytest.skip("No task card config files found")
+
+        for config_file in task_card_files:
+            with open(config_file) as f:
+                data = yaml.safe_load(f)
+
+            # Validate against schema
+            try:
+                TaskCard(**data)
+            except Exception as e:
+                pytest.fail(f"Task card config {config_file.name} failed validation: {e}")
+
+
+class TestConfigLoaderIntegration:
+    """Test ConfigLoader can load all configs."""
+
+    def test_config_loader_providers(self) -> None:
+        """Test loading all provider configs."""
+        loader = ConfigLoader(PROVIDERS_DIR.parent)
+        if PROVIDERS_DIR.exists():
+            providers = loader.load_providers()
+            assert isinstance(providers, dict)
+            # Should have loaded at least one provider if files exist
+            if list(PROVIDERS_DIR.glob("*.yaml")):
+                assert len(providers) > 0, "No providers loaded from config directory"
+
+    def test_config_loader_harness_profiles(self) -> None:
+        """Test loading all harness profile configs."""
+        loader = ConfigLoader(HARNESSES_DIR.parent)
+        if HARNESSES_DIR.exists():
+            profiles = loader.load_harness_profiles()
+            assert isinstance(profiles, dict)
+            # Should have loaded at least one profile if files exist
+            if list(HARNESSES_DIR.glob("*.yaml")):
+                assert len(profiles) > 0, "No harness profiles loaded from config directory"
+
+    def test_config_loader_variants(self) -> None:
+        """Test loading all variant configs."""
+        loader = ConfigLoader(VARIANTS_DIR.parent)
+        if VARIANTS_DIR.exists():
+            variants = loader.load_variants()
+            assert isinstance(variants, dict)
+            # Should have loaded at least one variant if files exist
+            if list(VARIANTS_DIR.glob("*.yaml")):
+                assert len(variants) > 0, "No variants loaded from config directory"
diff --git a/tests/validation/test_migrations.py b/tests/validation/test_migrations.py
new file mode 100644
index 0000000..75dfe46
--- /dev/null
+++ b/tests/validation/test_migrations.py
@@ -0,0 +1,187 @@
+"""Migration smoke tests for CI.
+
+Validates database migrations work correctly in a clean environment.
+This ensures migration regressions are caught automatically.
+"""
+
+from pathlib import Path
+
+import pytest
+from sqlalchemy import create_engine, inspect
+from sqlalchemy.orm import Session
+
+from benchmark_core.db.models import (
+    HarnessProfile,
+    Provider,
+    ProviderModel,
+    Variant,
+)
+from benchmark_core.db.session import init_db
+
+
+class TestMigrations:
+    """Test database migrations."""
+
+    @pytest.fixture
+    def temp_db(self, tmp_path: Path):
+        """Create a temporary database for testing."""
+        db_file = tmp_path / "test.db"
+        database_url = f"sqlite:///{db_file}"
+
+        # Create engine with foreign key support
+        engine = create_engine(
+            database_url,
+            connect_args={"check_same_thread": False},
+        )
+
+        # Enable foreign keys for SQLite
+        from sqlalchemy import event
+
+        @event.listens_for(engine, "connect")
+        def _fk_pragma_on_connect(dbapi_conn, connection_record):
+            dbapi_conn.execute("PRAGMA foreign_keys=ON")
+
+        yield engine
+
+        engine.dispose()
+
+    def test_init_db_creates_all_tables(self, temp_db) -> None:
+        """Verify init_db creates all expected tables."""
+        init_db(temp_db)
+
+        inspector = inspect(temp_db)
+        tables = inspector.get_table_names()
+
+        expected_tables = [
+            "providers",
+            "provider_models",
+            "harness_profiles",
+            "variants",
+            "experiments",
+            "experiment_variants",
+            "task_cards",
+            "sessions",
+            "requests",
+            "rollups",
+            "artifacts",
+        ]
+
+        for table in expected_tables:
+            assert table in tables, f"Missing table: {table}"
+
+    def test_migration_files_exist(self) -> None:
+        """Verify migration files exist in the repository."""
+        migrations_dir = Path(__file__).parent.parent.parent / "migrations" / "versions"
+        assert migrations_dir.exists(), f"Migrations directory not found: {migrations_dir}"
+
+        migration_files = list(migrations_dir.glob("*.py"))
+        assert len(migration_files) > 0, "No migration files found"
+
+    def test_alembic_config_exists(self) -> None:
+        """Verify alembic.ini exists."""
+        alembic_ini = Path(__file__).parent.parent.parent / "alembic.ini"
+        assert alembic_ini.exists(), f"alembic.ini not found: {alembic_ini}"
+
+    def test_migrations_env_exists(self) -> None:
+        """Verify migrations/env.py exists."""
+        env_py = Path(__file__).parent.parent.parent / "migrations" / "env.py"
+        assert env_py.exists(), f"migrations/env.py not found: {env_py}"
+
+
+class TestDatabaseIntegrity:
+    """Test database referential integrity and relationships."""
+
+    @pytest.fixture
+    def temp_db_with_data(self, tmp_path: Path):
+        """Create a temporary database with sample data."""
+        db_file = tmp_path / "test.db"
+        database_url = f"sqlite:///{db_file}"
+
+        engine = create_engine(
+            database_url,
+            connect_args={"check_same_thread": False},
+        )
+
+        # Enable foreign keys for SQLite
+        from sqlalchemy import event
+
+        @event.listens_for(engine, "connect")
+        def _fk_pragma_on_connect(dbapi_conn, connection_record):
+            dbapi_conn.execute("PRAGMA foreign_keys=ON")
+
+        init_db(engine)
+
+        # Add sample data
+        with Session(engine) as session:
+            provider = Provider(
+                name="test-provider",
+                protocol_surface="openai_responses",
+                upstream_base_url_env="TEST_URL",
+                api_key_env="TEST_KEY",
+            )
+            session.add(provider)
+            session.flush()
+
+            model = ProviderModel(
+                provider_id=provider.id,
+                alias="test-model",
+                upstream_model="test-model-upstream",
+            )
+            session.add(model)
+
+            profile = HarnessProfile(
+                name="test-profile",
+                protocol_surface="openai_responses",
+                base_url_env="PROXY_URL",
+                api_key_env="PROXY_KEY",
+                model_env="PROXY_MODEL",
+            )
+            session.add(profile)
+
+            variant = Variant(
+                name="test-variant",
+                provider="test-provider",
+                model_alias="test-model",
+                harness_profile="test-profile",
+                benchmark_tags={"test": "true"},
+            )
+            session.add(variant)
+            session.commit()
+
+        yield engine
+
+        engine.dispose()
+
+    def test_provider_model_relationship(self, temp_db_with_data) -> None:
+        """Test Provider -> ProviderModel relationship."""
+        with Session(temp_db_with_data) as session:
+            provider = session.query(Provider).filter_by(name="test-provider").first()
+            assert provider is not None
+            assert len(provider.models) == 1
+            assert provider.models[0].alias == "test-model"
+
+    def test_model_provider_relationship(self, temp_db_with_data) -> None:
+        """Test ProviderModel -> Provider relationship."""
+        with Session(temp_db_with_data) as session:
+            model = session.query(ProviderModel).first()
+            assert model is not None
+            assert model.provider.name == "test-provider"
+
+
+class TestMigrationRollback:
+    """Test migration rollback functionality."""
+
+    def test_alembic_downgrade_available(self) -> None:
+        """Verify alembic downgrade commands are available in migrations."""
+        migrations_dir = Path(__file__).parent.parent.parent / "migrations" / "versions"
+
+        # Check that at least one migration file exists
+        migration_files = list(migrations_dir.glob("*.py"))
+        if not migration_files:
+            pytest.skip("No migration files to test")
+
+        # Each migration should have upgrade and downgrade functions
+        for migration_file in migration_files[:1]:  # Check first one
+            content = migration_file.read_text()
+            assert "def upgrade()" in content or "upgrade()" in content
+            assert "def downgrade()" in content or "downgrade()" in content

From a25499f64082e2a90cfdb1ef98a19e2ba7848a01 Mon Sep 17 00:00:00 2001
From: Leonardo Gonzalez <leonardo.gonzalez@trilogy.com>
Date: Sun, 29 Mar 2026 23:22:43 -0500
Subject: [PATCH 02/12] fix(ci): use Makefile targets in CI workflow and add
 EOF newline

- Change validation jobs to use 'make validate-*' commands
- Ensures consistency between CI and local commands
- Add newline at end of file

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/ci.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index dd06b28..4e950e2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -132,7 +132,7 @@ jobs:
         run: uv pip install --system -e ".[dev]"
 
       - name: Run config validation tests
-        run: pytest tests/validation/test_config_validation.py -v
+        run: make validate-config
 
   migration-check:
     name: Migration Check
@@ -156,7 +156,7 @@ jobs:
         run: uv pip install --system -e ".[dev]"
 
       - name: Run migration tests
-        run: pytest tests/validation/test_migrations.py -v
+        run: make validate-migrations
 
   collector-check:
     name: Collector Check
@@ -180,7 +180,7 @@ jobs:
         run: uv pip install --system -e ".[dev]"
 
       - name: Run collector tests
-        run: pytest tests/validation/test_collectors.py -v
+        run: make validate-collectors
 
   quality:
     name: Quality Gate
@@ -200,4 +200,4 @@ jobs:
             echo "::error::One or more quality checks failed"
             exit 1
           fi
-          echo "✓ All quality checks passed"
\ No newline at end of file
+          echo "✓ All quality checks passed"

From b59d20dc908aec238edea2f6e12f1da6620e3d59 Mon Sep 17 00:00:00 2001
From: Leonardo Gonzalez <leonardo.gonzalez@trilogy.com>
Date: Mon, 30 Mar 2026 01:11:03 -0500
Subject: [PATCH 03/12] Strengthen collector validation tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace weak generic callable checks with specific class imports
- Check for LiteLLMCollector and CollectionDiagnostics classes
- Check for PrometheusCollector class
- Addresses AI review feedback on weak tests

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-authored-by: openhands <openhands@all-hands.dev>
---
 tests/validation/test_collectors.py | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/tests/validation/test_collectors.py b/tests/validation/test_collectors.py
index 0787640..bae8999 100644
--- a/tests/validation/test_collectors.py
+++ b/tests/validation/test_collectors.py
@@ -95,26 +95,16 @@ class TestCollectorFunctionSignatures:
 
     def test_litellm_collector_has_collect_function(self) -> None:
         """Test litellm_collector has collection function."""
-        from collectors import litellm_collector
+        from collectors.litellm_collector import LiteLLMCollector, CollectionDiagnostics
 
-        # Check for expected functions/classes
-        # This will vary based on actual implementation
-        assert hasattr(litellm_collector, "__all__") or any(
-            callable(getattr(litellm_collector, name))
-            for name in dir(litellm_collector)
-            if not name.startswith("_")
-        )
+        assert LiteLLMCollector is not None
+        assert CollectionDiagnostics is not None
 
     def test_prometheus_collector_has_collect_function(self) -> None:
         """Test prometheus_collector has collection function."""
-        from collectors import prometheus_collector
+        from collectors.prometheus_collector import PrometheusCollector
 
-        # Check for expected functions/classes
-        assert hasattr(prometheus_collector, "__all__") or any(
-            callable(getattr(prometheus_collector, name))
-            for name in dir(prometheus_collector)
-            if not name.startswith("_")
-        )
+        assert PrometheusCollector is not None
 
 
 class TestCollectorModuleDocstrings:

From cc29d58492449dc90dddf3fedcb7f014eccee760 Mon Sep 17 00:00:00 2001
From: Leonardo Gonzalez <leonardo.gonzalez@trilogy.com>
Date: Mon, 30 Mar 2026 03:11:31 -0500
Subject: [PATCH 04/12] fix: resolve config validation and import ordering
 issues

- Fix variant config: rename openai-gpt-4o-cli to openai-gpt-5.4-cli
- Update provider models to match available models (gpt-5.4, gpt-5.4-mini)
- Fix test references to use new variant name
- Fix import ordering in test_collectors.py (move imports to top)

All validation tests now pass:
- Config Validation: 13 tests passed
- Migration Check: 7 tests passed
- Collector Check: 14 tests passed
---
 .../{openai-gpt-4o-cli.yaml => openai-gpt-5.4-cli.yaml}   | 6 +++---
 tests/unit/test_config.py                                 | 6 +++---
 tests/validation/test_collectors.py                       | 8 ++++----
 3 files changed, 10 insertions(+), 10 deletions(-)
 rename configs/variants/{openai-gpt-4o-cli.yaml => openai-gpt-5.4-cli.yaml} (75%)

diff --git a/configs/variants/openai-gpt-4o-cli.yaml b/configs/variants/openai-gpt-5.4-cli.yaml
similarity index 75%
rename from configs/variants/openai-gpt-4o-cli.yaml
rename to configs/variants/openai-gpt-5.4-cli.yaml
index be7379a..fa5610b 100644
--- a/configs/variants/openai-gpt-4o-cli.yaml
+++ b/configs/variants/openai-gpt-5.4-cli.yaml
@@ -1,12 +1,12 @@
-name: openai-gpt-4o-cli
+name: openai-gpt-5.4-cli
 provider: openai
 provider_route: openai-main
-model_alias: gpt-4o
+model_alias: gpt-5.4
 harness_profile: openai-cli
 harness_env_overrides:
   OPENAI_TIMEOUT: "120"
 benchmark_tags:
   harness: openai-cli
   provider: openai
-  model: gpt-4o
+  model: gpt-5.4
   config: default
diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py
index a438baf..22b1bc9 100644
--- a/tests/unit/test_config.py
+++ b/tests/unit/test_config.py
@@ -447,7 +447,7 @@ def test_load_all_configs(self) -> None:
 
         # Verify variants loaded
         assert "fireworks-kimi-k2-5-claude-code" in registry.variants
-        assert "openai-gpt-4o-cli" in registry.variants
+        assert "openai-gpt-5.4-cli" in registry.variants
 
         # Verify experiments loaded
         assert "fireworks-terminal-agents-comparison" in registry.experiments
@@ -492,8 +492,8 @@ def test_valid_protocol_compatibility(self) -> None:
         assert variant1.provider == "fireworks"
         assert variant1.harness_profile == "claude-code"
 
-        # OpenAI-GPT-4o-CLI: openai_responses + openai_responses ✓
-        variant2 = registry.variants["openai-gpt-4o-cli"]
+        # OpenAI-GPT-5.4-CLI: openai_responses + openai_responses ✓
+        variant2 = registry.variants["openai-gpt-5.4-cli"]
         assert variant2.provider == "openai"
         assert variant2.harness_profile == "openai-cli"
 
diff --git a/tests/validation/test_collectors.py b/tests/validation/test_collectors.py
index bae8999..376ad7e 100644
--- a/tests/validation/test_collectors.py
+++ b/tests/validation/test_collectors.py
@@ -8,6 +8,10 @@
 
 import pytest
 
+# Late imports tested separately
+from collectors.litellm_collector import CollectionDiagnostics, LiteLLMCollector
+from collectors.prometheus_collector import PrometheusCollector
+
 
 class TestCollectorStructure:
     """Test collector module structure."""
@@ -95,15 +99,11 @@ class TestCollectorFunctionSignatures:
 
     def test_litellm_collector_has_collect_function(self) -> None:
         """Test litellm_collector has collection function."""
-        from collectors.litellm_collector import LiteLLMCollector, CollectionDiagnostics
-
         assert LiteLLMCollector is not None
         assert CollectionDiagnostics is not None
 
     def test_prometheus_collector_has_collect_function(self) -> None:
         """Test prometheus_collector has collection function."""
-        from collectors.prometheus_collector import PrometheusCollector
-
         assert PrometheusCollector is not None
 
 

From aa8e6b8b8f12781b70bab15eb76fc1c5b6d31e89 Mon Sep 17 00:00:00 2001
From: Leonardo Gonzalez <leonardo.gonzalez@trilogy.com>
Date: Mon, 30 Mar 2026 03:16:39 -0500
Subject: [PATCH 05/12] fix(deps): add missing requests dependency

- Add requests>=2.31.0 to production dependencies
- Add types-requests>=2.31.0 to dev dependencies for mypy

Fixes CI test failures due to ModuleNotFoundError: No module named 'requests'
---
 pyproject.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index c4dcd35..3d3ca13 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
     "typer>=0.9.0",
     "rich>=13.0.0",
     "httpx>=0.25.0",
+    "requests>=2.31.0",
     "sqlalchemy>=2.0.0",
     "alembic>=1.12.0",
     "psycopg2-binary>=2.9.0",
@@ -35,6 +36,7 @@ dev = [
     "ruff>=0.1.0",
     "mypy>=1.7.0",
     "types-PyYAML>=6.0.0",
+    "types-requests>=2.31.0",
     "pytest>=7.4.0",
     "pytest-asyncio>=0.21.0",
     "pytest-cov>=4.1.0",

From 4309c71439f062ebb8c44d1d221f0ad0a33c15d6 Mon Sep 17 00:00:00 2001
From: Leonardo Gonzalez <leonardo.gonzalez@trilogy.com>
Date: Mon, 30 Mar 2026 03:19:57 -0500
Subject: [PATCH 06/12] fix: resolve lint errors across codebase

- Fix health_service.py: use enum.StrEnum instead of (str, enum.Enum)
- Fix collect.py: import ordering, remove unused import, use OSError
- Fix test_rendering.py: use specific ValidationError instead of blind Exception
- Fix test_rollup_repository.py: import ordering

All lint checks now pass.
---
 src/benchmark_core/services/health_service.py |  2 +-
 src/cli/commands/collect.py                   | 28 +++++++++++--------
 tests/unit/test_rendering.py                  |  9 +++---
 tests/unit/test_rollup_repository.py          |  8 +++---
 4 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/src/benchmark_core/services/health_service.py b/src/benchmark_core/services/health_service.py
index efd60d5..e40d9f0 100644
--- a/src/benchmark_core/services/health_service.py
+++ b/src/benchmark_core/services/health_service.py
@@ -10,7 +10,7 @@
 from benchmark_core.db.session import create_database_engine, get_database_url
 
 
-class HealthStatus(str, enum.Enum):
+class HealthStatus(enum.StrEnum):
     """Health check status values."""
 
     HEALTHY = "healthy"
diff --git a/src/cli/commands/collect.py b/src/cli/commands/collect.py
index 1e00453..56377d7 100644
--- a/src/cli/commands/collect.py
+++ b/src/cli/commands/collect.py
@@ -1,5 +1,6 @@
 """CLI commands for data collection from LiteLLM and Prometheus."""
 
+from datetime import UTC
 from typing import Annotated
 from uuid import UUID
 
@@ -12,7 +13,7 @@
 from benchmark_core.db.session import get_db_session
 from benchmark_core.repositories.request_repository import SQLRequestRepository
 from benchmark_core.repositories.rollup_repository import SQLRollupRepository
-from collectors.litellm_collector import CollectionDiagnostics, LiteLLMCollector
+from collectors.litellm_collector import CollectionDiagnostics
 from collectors.normalize_requests import RequestNormalizerJob
 from collectors.prometheus_collector import PrometheusCollector
 from collectors.rollup_job import RollupJob
@@ -134,7 +135,7 @@ async def _run_async() -> tuple[int, CollectionDiagnostics]:
             db_session.commit()
             return len(written), diagnostics
 
-        except (ValueError, IOError, httpx.HTTPError) as err:
+        except (OSError, ValueError, httpx.HTTPError) as err:
             db_session.rollback()
             console.print(f"[red]Error during collection: {err}[/red]")
             raise typer.Exit(1) from err
@@ -221,8 +222,7 @@ def collect_prometheus(
     including latency percentiles, throughput, and error rates.
     """
     import asyncio
-
-    from datetime import datetime, timezone, timedelta
+    from datetime import datetime, timedelta
 
     try:
         session_uuid = UUID(session_id)
@@ -232,9 +232,9 @@ def collect_prometheus(
 
     # Default time range if not provided
     if not start_time:
-        start_time = (datetime.now(timezone.utc) - timedelta(hours=1)).isoformat()
+        start_time = (datetime.now(UTC) - timedelta(hours=1)).isoformat()
     if not end_time:
-        end_time = datetime.now(timezone.utc).isoformat()
+        end_time = datetime.now(UTC).isoformat()
 
     async def _run_async() -> int:
         db_session: SQLAlchemySession = get_db_session()
@@ -257,7 +257,7 @@ async def _run_async() -> int:
             db_session.commit()
             return len(written)
 
-        except (ValueError, IOError, httpx.HTTPError) as err:
+        except (OSError, ValueError, httpx.HTTPError) as err:
             db_session.rollback()
             console.print(f"[red]Error during Prometheus collection: {err}[/red]")
             raise typer.Exit(1) from err
@@ -319,6 +319,7 @@ def compute_rollups(
     import asyncio
 
     from sqlalchemy import select
+
     from benchmark_core.db.models import Request as RequestORM
 
     try:
@@ -385,7 +386,7 @@ async def _run_async() -> tuple[int, int]:
 
             return request_rollup_count, session_rollup_count
 
-        except (ValueError, IOError) as err:
+        except (OSError, ValueError) as err:
             db_session.rollback()
             console.print(f"[red]Error during rollup computation: {err}[/red]")
             raise typer.Exit(1) from err
@@ -432,6 +433,7 @@ def compute_variant_rollups(
     import asyncio
 
     from sqlalchemy import select
+
     from benchmark_core.db.models import Session as SessionORM
 
     async def _run_async() -> int:
@@ -476,7 +478,7 @@ async def _run_async() -> int:
 
             return len(rollups)
 
-        except (ValueError, IOError) as err:
+        except (OSError, ValueError) as err:
             db_session.rollback()
             console.print(f"[red]Error during variant rollup computation: {err}[/red]")
             raise typer.Exit(1) from err
@@ -522,7 +524,9 @@ def compute_experiment_rollups(
     import asyncio
 
     from sqlalchemy import select
-    from benchmark_core.db.models import Session as SessionORM, Variant as VariantORM
+
+    from benchmark_core.db.models import Session as SessionORM
+    from benchmark_core.db.models import Variant as VariantORM
 
     async def _run_async() -> int:
         db_session: SQLAlchemySession = get_db_session()
@@ -534,7 +538,7 @@ async def _run_async() -> int:
             sessions = result.scalars().all()
 
             # Get unique variant IDs
-            variant_ids = list(set(s.variant_id for s in sessions if s.variant_id))
+            variant_ids = list({s.variant_id for s in sessions if s.variant_id})
 
             # Build variant data
             variants_data = []
@@ -566,7 +570,7 @@ async def _run_async() -> int:
 
             return len(rollups)
 
-        except (ValueError, IOError) as err:
+        except (OSError, ValueError) as err:
             db_session.rollback()
             console.print(f"[red]Error during experiment rollup computation: {err}[/red]")
             raise typer.Exit(1) from err
diff --git a/tests/unit/test_rendering.py b/tests/unit/test_rendering.py
index ce9cc8c..804b6c2 100644
--- a/tests/unit/test_rendering.py
+++ b/tests/unit/test_rendering.py
@@ -4,6 +4,7 @@
 """
 
 import pytest
+from pydantic import ValidationError
 
 from benchmark_core.config import HarnessProfile, Variant
 from benchmark_core.services.rendering import (
@@ -341,7 +342,7 @@ def test_missing_name_fails(
     ) -> None:
         """Profile without name fails validation at Pydantic level."""
         # Pydantic validates this at model construction time
-        with pytest.raises(Exception):  # Pydantic ValidationError
+        with pytest.raises(ValidationError):
             HarnessProfile(
                 name="",  # Empty name
                 protocol_surface="openai_responses",
@@ -356,7 +357,7 @@ def test_missing_required_env_vars_fails(
     ) -> None:
         """Profile without required env vars fails validation at Pydantic level."""
         # Pydantic validates this at model construction time
-        with pytest.raises(Exception):  # Pydantic ValidationError
+        with pytest.raises(ValidationError):
             HarnessProfile(
                 name="test-profile",
                 protocol_surface="openai_responses",
@@ -387,7 +388,7 @@ def test_invalid_protocol_surface_fails(
     ) -> None:
         """Profile with invalid protocol surface fails at Pydantic level."""
         # Pydantic validates Literal types at model construction time
-        with pytest.raises(Exception):  # Pydantic ValidationError
+        with pytest.raises(ValidationError):
             HarnessProfile(
                 name="test-profile",
                 protocol_surface="invalid_protocol",  # type: ignore
@@ -402,7 +403,7 @@ def test_invalid_render_format_fails(
     ) -> None:
         """Profile with invalid render format fails at Pydantic level."""
         # Pydantic validates Literal types at model construction time
-        with pytest.raises(Exception):  # Pydantic ValidationError
+        with pytest.raises(ValidationError):
             HarnessProfile(
                 name="test-profile",
                 protocol_surface="openai_responses",
diff --git a/tests/unit/test_rollup_repository.py b/tests/unit/test_rollup_repository.py
index 58d5756..2474509 100644
--- a/tests/unit/test_rollup_repository.py
+++ b/tests/unit/test_rollup_repository.py
@@ -4,8 +4,6 @@
 from unittest.mock import MagicMock
 from uuid import uuid4
 
-import pytest
-
 from benchmark_core.models import MetricRollup
 from benchmark_core.repositories.rollup_repository import SQLRollupRepository
 
@@ -39,9 +37,10 @@ def test_to_orm_conversion(self) -> None:
 
     def test_to_domain_conversion(self) -> None:
         """Test ORM to domain model conversion."""
-        from benchmark_core.db.models import MetricRollup as MetricRollupORM
         import uuid
 
+        from benchmark_core.db.models import MetricRollup as MetricRollupORM
+
         orm = MetricRollupORM(
             id=uuid.uuid4(),
             dimension_type="request",
@@ -102,9 +101,10 @@ def test_create_many_with_rollups(self) -> None:
 
     def test_get_by_dimension_returns_domain_models(self) -> None:
         """Test get_by_dimension returns domain models."""
-        from benchmark_core.db.models import MetricRollup as MetricRollupORM
         import uuid
 
+        from benchmark_core.db.models import MetricRollup as MetricRollupORM
+
         # Create mock ORM objects with all required fields
         mock_orm1 = MetricRollupORM(
             id=uuid.uuid4(),

From 98b3f47e97cde95027d071146d8d083b827904b0 Mon Sep 17 00:00:00 2001
From: Leonardo Gonzalez <leonardo.gonzalez@trilogy.com>
Date: Mon, 30 Mar 2026 03:23:07 -0500
Subject: [PATCH 07/12] test: improve collector validation tests - replace
 tautological assertions with structural tests

---
 tests/validation/test_collectors.py | 43 +++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/tests/validation/test_collectors.py b/tests/validation/test_collectors.py
index 376ad7e..6603aa3 100644
--- a/tests/validation/test_collectors.py
+++ b/tests/validation/test_collectors.py
@@ -94,17 +94,38 @@ def test_import_retention_cleanup(self) -> None:
             pytest.fail(f"Failed to import retention_cleanup: {e}")
 
 
-class TestCollectorFunctionSignatures:
-    """Test that collector functions have expected signatures."""
-
-    def test_litellm_collector_has_collect_function(self) -> None:
-        """Test litellm_collector has collection function."""
-        assert LiteLLMCollector is not None
-        assert CollectionDiagnostics is not None
-
-    def test_prometheus_collector_has_collect_function(self) -> None:
-        """Test prometheus_collector has collection function."""
-        assert PrometheusCollector is not None
+class TestCollectorClassStructure:
+    """Test that collector classes have expected structure and methods."""
+
+    def test_litellm_collector_has_required_attributes(self) -> None:
+        """Test LiteLLMCollector has expected constructor parameters."""
+        # Verify the class can be inspected for init parameters
+        import inspect
+        sig = inspect.signature(LiteLLMCollector.__init__)
+        params = list(sig.parameters.keys())
+        assert 'self' in params, "LiteLLMCollector missing self parameter"
+        assert 'repository' in params, "LiteLLMCollector missing repository parameter"
+        assert 'api_key' in params, "LiteLLMCollector missing api_key parameter"
+
+    def test_litellm_collector_collection_diagnostics_structure(self) -> None:
+        """Test CollectionDiagnostics has expected fields."""
+        import dataclasses
+        assert dataclasses.is_dataclass(CollectionDiagnostics), "CollectionDiagnostics should be a dataclass"
+        fields = {f.name for f in dataclasses.fields(CollectionDiagnostics)}
+        # Verify actual field names from the implementation
+        assert 'total_raw_records' in fields, "CollectionDiagnostics missing total_raw_records"
+        assert 'errors' in fields, "CollectionDiagnostics missing errors"
+        assert 'normalized_count' in fields, "CollectionDiagnostics missing normalized_count"
+
+    def test_prometheus_collector_has_required_attributes(self) -> None:
+        """Test PrometheusCollector has expected constructor parameters."""
+        import inspect
+        sig = inspect.signature(PrometheusCollector.__init__)
+        params = list(sig.parameters.keys())
+        assert 'self' in params, "PrometheusCollector missing self parameter"
+        # PrometheusCollector uses different parameter names
+        assert 'base_url' in params, "PrometheusCollector missing base_url parameter"
+        assert 'session_id' in params, "PrometheusCollector missing session_id parameter"
 
 
 class TestCollectorModuleDocstrings:

From d1f889ea983c5072d1a93a5c64e5f40d472b34d4 Mon Sep 17 00:00:00 2001
From: Leonardo Gonzalez <leonardo.gonzalez@trilogy.com>
Date: Mon, 30 Mar 2026 03:46:24 -0500
Subject: [PATCH 08/12] style: fix formatting in migrations, scripts, and test
 files

---
 migrations/env.py                             |   4 +-
 ...8f3a7_initial_schema_providers_harness_.py | 358 ++++++++++--------
 ...n_notes_outcome_and_artifact_experiment.py |  45 ++-
 ...f2ee53388a2_add_proxy_credentials_table.py |   6 +-
 scripts/demo_collector.py                     |   3 +-
 scripts/demo_comparison.py                    |  36 +-
 scripts/demo_config_loading.py                |   8 +-
 scripts/demo_credential_service.py            |   4 +-
 scripts/demo_end_to_end.py                    |  23 +-
 scripts/demo_security.py                      |   6 +-
 scripts/init_benchmark_seed.py                |  44 ++-
 scripts/verify_db_schema.py                   |   2 +
 scripts/verify_normalization.py               |  20 +-
 tests/validation/test_collectors.py           |  25 +-
 14 files changed, 323 insertions(+), 261 deletions(-)

diff --git a/migrations/env.py b/migrations/env.py
index 76fda48..4ceb592 100644
--- a/migrations/env.py
+++ b/migrations/env.py
@@ -75,9 +75,7 @@ def run_migrations_online() -> None:
     )
 
     with connectable.connect() as connection:
-        context.configure(
-            connection=connection, target_metadata=target_metadata
-        )
+        context.configure(connection=connection, target_metadata=target_metadata)
 
         with context.begin_transaction():
             context.run_migrations()
diff --git a/migrations/versions/03e22a58f3a7_initial_schema_providers_harness_.py b/migrations/versions/03e22a58f3a7_initial_schema_providers_harness_.py
index f85ef30..e01d04d 100644
--- a/migrations/versions/03e22a58f3a7_initial_schema_providers_harness_.py
+++ b/migrations/versions/03e22a58f3a7_initial_schema_providers_harness_.py
@@ -5,13 +5,14 @@
 Create Date: 2026-03-26 07:36:03.280384
 
 """
+
 from collections.abc import Sequence
 
 import sqlalchemy as sa
 from alembic import op
 
 # revision identifiers, used by Alembic.
-revision: str = '03e22a58f3a7'
+revision: str = "03e22a58f3a7"
 down_revision: str | Sequence[str] | None = None
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
@@ -21,190 +22,211 @@ def upgrade() -> None:
     """Upgrade schema."""
     # ### commands auto generated by Alembic - please adjust! ###
     # Phase 1: Independent tables (no FKs)
-    op.create_table('experiments',
-    sa.Column('id', sa.Uuid(), nullable=False),
-    sa.Column('name', sa.String(length=255), nullable=False),
-    sa.Column('description', sa.Text(), nullable=False),
-    sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
-    sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('name')
+    op.create_table(
+        "experiments",
+        sa.Column("id", sa.Uuid(), nullable=False),
+        sa.Column("name", sa.String(length=255), nullable=False),
+        sa.Column("description", sa.Text(), nullable=False),
+        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+        sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
+        sa.PrimaryKeyConstraint("id"),
+        sa.UniqueConstraint("name"),
     )
-    op.create_table('harness_profiles',
-    sa.Column('id', sa.Uuid(), nullable=False),
-    sa.Column('name', sa.String(length=255), nullable=False),
-    sa.Column('protocol_surface', sa.String(length=50), nullable=False),
-    sa.Column('base_url_env', sa.String(length=255), nullable=False),
-    sa.Column('api_key_env', sa.String(length=255), nullable=False),
-    sa.Column('model_env', sa.String(length=255), nullable=False),
-    sa.Column('extra_env', sa.JSON(), nullable=False),
-    sa.Column('render_format', sa.String(length=20), nullable=False),
-    sa.Column('launch_checks', sa.JSON(), nullable=False),
-    sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
-    sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('name')
+    op.create_table(
+        "harness_profiles",
+        sa.Column("id", sa.Uuid(), nullable=False),
+        sa.Column("name", sa.String(length=255), nullable=False),
+        sa.Column("protocol_surface", sa.String(length=50), nullable=False),
+        sa.Column("base_url_env", sa.String(length=255), nullable=False),
+        sa.Column("api_key_env", sa.String(length=255), nullable=False),
+        sa.Column("model_env", sa.String(length=255), nullable=False),
+        sa.Column("extra_env", sa.JSON(), nullable=False),
+        sa.Column("render_format", sa.String(length=20), nullable=False),
+        sa.Column("launch_checks", sa.JSON(), nullable=False),
+        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+        sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
+        sa.PrimaryKeyConstraint("id"),
+        sa.UniqueConstraint("name"),
     )
-    op.create_table('providers',
-    sa.Column('id', sa.Uuid(), nullable=False),
-    sa.Column('name', sa.String(length=255), nullable=False),
-    sa.Column('route_name', sa.String(length=255), nullable=True),
-    sa.Column('protocol_surface', sa.String(length=50), nullable=False),
-    sa.Column('upstream_base_url_env', sa.String(length=255), nullable=False),
-    sa.Column('api_key_env', sa.String(length=255), nullable=False),
-    sa.Column('routing_defaults', sa.JSON(), nullable=False),
-    sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
-    sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('name')
+    op.create_table(
+        "providers",
+        sa.Column("id", sa.Uuid(), nullable=False),
+        sa.Column("name", sa.String(length=255), nullable=False),
+        sa.Column("route_name", sa.String(length=255), nullable=True),
+        sa.Column("protocol_surface", sa.String(length=50), nullable=False),
+        sa.Column("upstream_base_url_env", sa.String(length=255), nullable=False),
+        sa.Column("api_key_env", sa.String(length=255), nullable=False),
+        sa.Column("routing_defaults", sa.JSON(), nullable=False),
+        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+        sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
+        sa.PrimaryKeyConstraint("id"),
+        sa.UniqueConstraint("name"),
     )
-    op.create_table('rollups',
-    sa.Column('id', sa.Uuid(), nullable=False),
-    sa.Column('dimension_type', sa.String(length=50), nullable=False),
-    sa.Column('dimension_id', sa.String(length=255), nullable=False),
-    sa.Column('metric_name', sa.String(length=255), nullable=False),
-    sa.Column('metric_value', sa.Float(), nullable=False),
-    sa.Column('sample_count', sa.Integer(), nullable=False),
-    sa.Column('computed_at', sa.DateTime(timezone=True), nullable=False),
-    sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
-    sa.PrimaryKeyConstraint('id')
+    op.create_table(
+        "rollups",
+        sa.Column("id", sa.Uuid(), nullable=False),
+        sa.Column("dimension_type", sa.String(length=50), nullable=False),
+        sa.Column("dimension_id", sa.String(length=255), nullable=False),
+        sa.Column("metric_name", sa.String(length=255), nullable=False),
+        sa.Column("metric_value", sa.Float(), nullable=False),
+        sa.Column("sample_count", sa.Integer(), nullable=False),
+        sa.Column("computed_at", sa.DateTime(timezone=True), nullable=False),
+        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+        sa.PrimaryKeyConstraint("id"),
     )
-    op.create_index(op.f('ix_rollups_dimension_id'), 'rollups', ['dimension_id'], unique=False)
-    op.create_table('task_cards',
-    sa.Column('id', sa.Uuid(), nullable=False),
-    sa.Column('name', sa.String(length=255), nullable=False),
-    sa.Column('repo_path', sa.String(length=1024), nullable=True),
-    sa.Column('goal', sa.Text(), nullable=False),
-    sa.Column('starting_prompt', sa.Text(), nullable=False),
-    sa.Column('stop_condition', sa.Text(), nullable=False),
-    sa.Column('session_timebox_minutes', sa.Integer(), nullable=True),
-    sa.Column('notes', sa.JSON(), nullable=False),
-    sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
-    sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('name')
+    op.create_index(op.f("ix_rollups_dimension_id"), "rollups", ["dimension_id"], unique=False)
+    op.create_table(
+        "task_cards",
+        sa.Column("id", sa.Uuid(), nullable=False),
+        sa.Column("name", sa.String(length=255), nullable=False),
+        sa.Column("repo_path", sa.String(length=1024), nullable=True),
+        sa.Column("goal", sa.Text(), nullable=False),
+        sa.Column("starting_prompt", sa.Text(), nullable=False),
+        sa.Column("stop_condition", sa.Text(), nullable=False),
+        sa.Column("session_timebox_minutes", sa.Integer(), nullable=True),
+        sa.Column("notes", sa.JSON(), nullable=False),
+        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+        sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
+        sa.PrimaryKeyConstraint("id"),
+        sa.UniqueConstraint("name"),
     )
-    op.create_table('variants',
-    sa.Column('id', sa.Uuid(), nullable=False),
-    sa.Column('name', sa.String(length=255), nullable=False),
-    sa.Column('provider', sa.String(length=255), nullable=False),
-    sa.Column('provider_route', sa.String(length=255), nullable=True),
-    sa.Column('model_alias', sa.String(length=255), nullable=False),
-    sa.Column('harness_profile', sa.String(length=255), nullable=False),
-    sa.Column('harness_env_overrides', sa.JSON(), nullable=False),
-    sa.Column('benchmark_tags', sa.JSON(), nullable=False),
-    sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
-    sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('name')
+    op.create_table(
+        "variants",
+        sa.Column("id", sa.Uuid(), nullable=False),
+        sa.Column("name", sa.String(length=255), nullable=False),
+        sa.Column("provider", sa.String(length=255), nullable=False),
+        sa.Column("provider_route", sa.String(length=255), nullable=True),
+        sa.Column("model_alias", sa.String(length=255), nullable=False),
+        sa.Column("harness_profile", sa.String(length=255), nullable=False),
+        sa.Column("harness_env_overrides", sa.JSON(), nullable=False),
+        sa.Column("benchmark_tags", sa.JSON(), nullable=False),
+        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+        sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
+        sa.PrimaryKeyConstraint("id"),
+        sa.UniqueConstraint("name"),
     )
     # Phase 2: Sessions (depends on experiments, task_cards, variants)
-    op.create_table('sessions',
-    sa.Column('id', sa.Uuid(), nullable=False),
-    sa.Column('experiment_id', sa.Uuid(), nullable=False),
-    sa.Column('variant_id', sa.Uuid(), nullable=False),
-    sa.Column('task_card_id', sa.Uuid(), nullable=False),
-    sa.Column('harness_profile', sa.String(length=255), nullable=False),
-    sa.Column('repo_path', sa.String(length=1024), nullable=False),
-    sa.Column('git_branch', sa.String(length=255), nullable=False),
-    sa.Column('git_commit', sa.String(length=64), nullable=False),
-    sa.Column('git_dirty', sa.Boolean(), nullable=False),
-    sa.Column('operator_label', sa.String(length=255), nullable=True),
-    sa.Column('proxy_credential_id', sa.String(length=255), nullable=True),
-    sa.Column('started_at', sa.DateTime(timezone=True), nullable=False),
-    sa.Column('ended_at', sa.DateTime(timezone=True), nullable=True),
-    sa.Column('status', sa.String(length=50), nullable=False),
-    sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
-    sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
-    sa.ForeignKeyConstraint(['experiment_id'], ['experiments.id'], ondelete='RESTRICT'),
-    sa.ForeignKeyConstraint(['task_card_id'], ['task_cards.id'], ondelete='RESTRICT'),
-    sa.ForeignKeyConstraint(['variant_id'], ['variants.id'], ondelete='RESTRICT'),
-    sa.PrimaryKeyConstraint('id')
+    op.create_table(
+        "sessions",
+        sa.Column("id", sa.Uuid(), nullable=False),
+        sa.Column("experiment_id", sa.Uuid(), nullable=False),
+        sa.Column("variant_id", sa.Uuid(), nullable=False),
+        sa.Column("task_card_id", sa.Uuid(), nullable=False),
+        sa.Column("harness_profile", sa.String(length=255), nullable=False),
+        sa.Column("repo_path", sa.String(length=1024), nullable=False),
+        sa.Column("git_branch", sa.String(length=255), nullable=False),
+        sa.Column("git_commit", sa.String(length=64), nullable=False),
+        sa.Column("git_dirty", sa.Boolean(), nullable=False),
+        sa.Column("operator_label", sa.String(length=255), nullable=True),
+        sa.Column("proxy_credential_id", sa.String(length=255), nullable=True),
+        sa.Column("started_at", sa.DateTime(timezone=True), nullable=False),
+        sa.Column("ended_at", sa.DateTime(timezone=True), nullable=True),
+        sa.Column("status", sa.String(length=50), nullable=False),
+        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+        sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
+        sa.ForeignKeyConstraint(["experiment_id"], ["experiments.id"], ondelete="RESTRICT"),
+        sa.ForeignKeyConstraint(["task_card_id"], ["task_cards.id"], ondelete="RESTRICT"),
+        sa.ForeignKeyConstraint(["variant_id"], ["variants.id"], ondelete="RESTRICT"),
+        sa.PrimaryKeyConstraint("id"),
     )
-    op.create_index(op.f('ix_sessions_experiment_id'), 'sessions', ['experiment_id'], unique=False)
-    op.create_index(op.f('ix_sessions_task_card_id'), 'sessions', ['task_card_id'], unique=False)
-    op.create_index(op.f('ix_sessions_variant_id'), 'sessions', ['variant_id'], unique=False)
+    op.create_index(op.f("ix_sessions_experiment_id"), "sessions", ["experiment_id"], unique=False)
+    op.create_index(op.f("ix_sessions_task_card_id"), "sessions", ["task_card_id"], unique=False)
+    op.create_index(op.f("ix_sessions_variant_id"), "sessions", ["variant_id"], unique=False)
     # Phase 3: Tables with FKs to sessions and experiments
-    op.create_table('artifacts',
-    sa.Column('id', sa.Uuid(), nullable=False),
-    sa.Column('session_id', sa.Uuid(), nullable=False),
-    sa.Column('artifact_type', sa.String(length=100), nullable=False),
-    sa.Column('name', sa.String(length=255), nullable=False),
-    sa.Column('content_type', sa.String(length=100), nullable=False),
-    sa.Column('storage_path', sa.String(length=2048), nullable=False),
-    sa.Column('size_bytes', sa.Integer(), nullable=True),
-    sa.Column('artifact_metadata', sa.JSON(), nullable=False),
-    sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
-    sa.ForeignKeyConstraint(['session_id'], ['sessions.id'], ondelete='CASCADE'),
-    sa.PrimaryKeyConstraint('id')
+    op.create_table(
+        "artifacts",
+        sa.Column("id", sa.Uuid(), nullable=False),
+        sa.Column("session_id", sa.Uuid(), nullable=False),
+        sa.Column("artifact_type", sa.String(length=100), nullable=False),
+        sa.Column("name", sa.String(length=255), nullable=False),
+        sa.Column("content_type", sa.String(length=100), nullable=False),
+        sa.Column("storage_path", sa.String(length=2048), nullable=False),
+        sa.Column("size_bytes", sa.Integer(), nullable=True),
+        sa.Column("artifact_metadata", sa.JSON(), nullable=False),
+        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+        sa.ForeignKeyConstraint(["session_id"], ["sessions.id"], ondelete="CASCADE"),
+        sa.PrimaryKeyConstraint("id"),
+    )
+    op.create_index(op.f("ix_artifacts_session_id"), "artifacts", ["session_id"], unique=False)
+    op.create_table(
+        "experiment_variants",
+        sa.Column("id", sa.Uuid(), nullable=False),
+        sa.Column("experiment_id", sa.Uuid(), nullable=False),
+        sa.Column("variant_id", sa.Uuid(), nullable=False),
+        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+        sa.ForeignKeyConstraint(["experiment_id"], ["experiments.id"], ondelete="CASCADE"),
+        sa.ForeignKeyConstraint(["variant_id"], ["variants.id"], ondelete="CASCADE"),
+        sa.PrimaryKeyConstraint("id"),
+        sa.UniqueConstraint("experiment_id", "variant_id", name="uq_experiment_variant"),
+    )
+    op.create_index(
+        op.f("ix_experiment_variants_experiment_id"),
+        "experiment_variants",
+        ["experiment_id"],
+        unique=False,
     )
-    op.create_index(op.f('ix_artifacts_session_id'), 'artifacts', ['session_id'], unique=False)
-    op.create_table('experiment_variants',
-    sa.Column('id', sa.Uuid(), nullable=False),
-    sa.Column('experiment_id', sa.Uuid(), nullable=False),
-    sa.Column('variant_id', sa.Uuid(), nullable=False),
-    sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
-    sa.ForeignKeyConstraint(['experiment_id'], ['experiments.id'], ondelete='CASCADE'),
-    sa.ForeignKeyConstraint(['variant_id'], ['variants.id'], ondelete='CASCADE'),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('experiment_id', 'variant_id', name='uq_experiment_variant')
+    op.create_index(
+        op.f("ix_experiment_variants_variant_id"),
+        "experiment_variants",
+        ["variant_id"],
+        unique=False,
     )
-    op.create_index(op.f('ix_experiment_variants_experiment_id'), 'experiment_variants', ['experiment_id'], unique=False)
-    op.create_index(op.f('ix_experiment_variants_variant_id'), 'experiment_variants', ['variant_id'], unique=False)
-    op.create_table('provider_models',
-    sa.Column('id', sa.Uuid(), nullable=False),
-    sa.Column('provider_id', sa.Uuid(), nullable=False),
-    sa.Column('alias', sa.String(length=255), nullable=False),
-    sa.Column('upstream_model', sa.String(length=255), nullable=False),
-    sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
-    sa.ForeignKeyConstraint(['provider_id'], ['providers.id'], ondelete='CASCADE'),
-    sa.PrimaryKeyConstraint('id')
+    op.create_table(
+        "provider_models",
+        sa.Column("id", sa.Uuid(), nullable=False),
+        sa.Column("provider_id", sa.Uuid(), nullable=False),
+        sa.Column("alias", sa.String(length=255), nullable=False),
+        sa.Column("upstream_model", sa.String(length=255), nullable=False),
+        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+        sa.ForeignKeyConstraint(["provider_id"], ["providers.id"], ondelete="CASCADE"),
+        sa.PrimaryKeyConstraint("id"),
     )
-    op.create_table('requests',
-    sa.Column('id', sa.Uuid(), nullable=False),
-    sa.Column('request_id', sa.String(length=255), nullable=False),
-    sa.Column('session_id', sa.Uuid(), nullable=False),
-    sa.Column('provider', sa.String(length=255), nullable=False),
-    sa.Column('model', sa.String(length=255), nullable=False),
-    sa.Column('timestamp', sa.DateTime(timezone=True), nullable=False),
-    sa.Column('latency_ms', sa.Float(), nullable=True),
-    sa.Column('ttft_ms', sa.Float(), nullable=True),
-    sa.Column('tokens_prompt', sa.Integer(), nullable=True),
-    sa.Column('tokens_completion', sa.Integer(), nullable=True),
-    sa.Column('error', sa.Boolean(), nullable=False),
-    sa.Column('error_message', sa.Text(), nullable=True),
-    sa.Column('cache_hit', sa.Boolean(), nullable=True),
-    sa.Column('request_metadata', sa.JSON(), nullable=False),
-    sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
-    sa.ForeignKeyConstraint(['session_id'], ['sessions.id'], ondelete='CASCADE'),
-    sa.PrimaryKeyConstraint('id')
+    op.create_table(
+        "requests",
+        sa.Column("id", sa.Uuid(), nullable=False),
+        sa.Column("request_id", sa.String(length=255), nullable=False),
+        sa.Column("session_id", sa.Uuid(), nullable=False),
+        sa.Column("provider", sa.String(length=255), nullable=False),
+        sa.Column("model", sa.String(length=255), nullable=False),
+        sa.Column("timestamp", sa.DateTime(timezone=True), nullable=False),
+        sa.Column("latency_ms", sa.Float(), nullable=True),
+        sa.Column("ttft_ms", sa.Float(), nullable=True),
+        sa.Column("tokens_prompt", sa.Integer(), nullable=True),
+        sa.Column("tokens_completion", sa.Integer(), nullable=True),
+        sa.Column("error", sa.Boolean(), nullable=False),
+        sa.Column("error_message", sa.Text(), nullable=True),
+        sa.Column("cache_hit", sa.Boolean(), nullable=True),
+        sa.Column("request_metadata", sa.JSON(), nullable=False),
+        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+        sa.ForeignKeyConstraint(["session_id"], ["sessions.id"], ondelete="CASCADE"),
+        sa.PrimaryKeyConstraint("id"),
     )
-    op.create_index(op.f('ix_requests_request_id'), 'requests', ['request_id'], unique=False)
-    op.create_index(op.f('ix_requests_session_id'), 'requests', ['session_id'], unique=False)
+    op.create_index(op.f("ix_requests_request_id"), "requests", ["request_id"], unique=False)
+    op.create_index(op.f("ix_requests_session_id"), "requests", ["session_id"], unique=False)
     # ### end Alembic commands ###
 
 
 def downgrade() -> None:
     """Downgrade schema."""
     # ### commands auto generated by Alembic - please adjust! ###
-    op.drop_index(op.f('ix_requests_session_id'), table_name='requests')
-    op.drop_index(op.f('ix_requests_request_id'), table_name='requests')
-    op.drop_table('requests')
-    op.drop_table('provider_models')
-    op.drop_index(op.f('ix_experiment_variants_variant_id'), table_name='experiment_variants')
-    op.drop_index(op.f('ix_experiment_variants_experiment_id'), table_name='experiment_variants')
-    op.drop_table('experiment_variants')
-    op.drop_index(op.f('ix_artifacts_session_id'), table_name='artifacts')
-    op.drop_table('artifacts')
-    op.drop_table('variants')
-    op.drop_table('task_cards')
-    op.drop_index(op.f('ix_sessions_task_card_id'), table_name='sessions')
-    op.drop_index(op.f('ix_sessions_variant_id'), table_name='sessions')
-    op.drop_index(op.f('ix_sessions_experiment_id'), table_name='sessions')
-    op.drop_table('sessions')
-    op.drop_index(op.f('ix_rollups_dimension_id'), table_name='rollups')
-    op.drop_table('rollups')
-    op.drop_table('providers')
-    op.drop_table('harness_profiles')
-    op.drop_table('experiments')
+    op.drop_index(op.f("ix_requests_session_id"), table_name="requests")
+    op.drop_index(op.f("ix_requests_request_id"), table_name="requests")
+    op.drop_table("requests")
+    op.drop_table("provider_models")
+    op.drop_index(op.f("ix_experiment_variants_variant_id"), table_name="experiment_variants")
+    op.drop_index(op.f("ix_experiment_variants_experiment_id"), table_name="experiment_variants")
+    op.drop_table("experiment_variants")
+    op.drop_index(op.f("ix_artifacts_session_id"), table_name="artifacts")
+    op.drop_table("artifacts")
+    op.drop_table("variants")
+    op.drop_table("task_cards")
+    op.drop_index(op.f("ix_sessions_task_card_id"), table_name="sessions")
+    op.drop_index(op.f("ix_sessions_variant_id"), table_name="sessions")
+    op.drop_index(op.f("ix_sessions_experiment_id"), table_name="sessions")
+    op.drop_table("sessions")
+    op.drop_index(op.f("ix_rollups_dimension_id"), table_name="rollups")
+    op.drop_table("rollups")
+    op.drop_table("providers")
+    op.drop_table("harness_profiles")
+    op.drop_table("experiments")
     # ### end Alembic commands ###
diff --git a/migrations/versions/20260326_210252_add_session_notes_outcome_and_artifact_experiment.py b/migrations/versions/20260326_210252_add_session_notes_outcome_and_artifact_experiment.py
index bda2edb..abb7a36 100644
--- a/migrations/versions/20260326_210252_add_session_notes_outcome_and_artifact_experiment.py
+++ b/migrations/versions/20260326_210252_add_session_notes_outcome_and_artifact_experiment.py
@@ -5,14 +5,15 @@
 Create Date: 2025-03-26 21:02:52.000000
 
 """
+
 from collections.abc import Sequence
 
 import sqlalchemy as sa
 from alembic import op
 
 # revision identifiers, used by Alembic.
-revision: str = '520517cac40b'
-down_revision: str | Sequence[str] | None = '03e22a58f3a7'
+revision: str = "520517cac40b"
+down_revision: str | Sequence[str] | None = "03e22a58f3a7"
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 
@@ -20,43 +21,41 @@
 def upgrade() -> None:
     """Upgrade schema."""
     # ### Add session notes and outcome_state ###
-    op.add_column('sessions', sa.Column('notes', sa.Text(), nullable=True))
-    op.add_column('sessions', sa.Column('outcome_state', sa.String(length=50), nullable=True))
-    op.create_index('ix_sessions_outcome_state', 'sessions', ['outcome_state'])
+    op.add_column("sessions", sa.Column("notes", sa.Text(), nullable=True))
+    op.add_column("sessions", sa.Column("outcome_state", sa.String(length=50), nullable=True))
+    op.create_index("ix_sessions_outcome_state", "sessions", ["outcome_state"])
 
     # ### Add experiment_id to artifacts and make session_id nullable ###
-    op.add_column('artifacts', sa.Column('experiment_id', sa.Uuid(), nullable=True))
-    op.create_index('ix_artifacts_experiment_id', 'artifacts', ['experiment_id'])
+    op.add_column("artifacts", sa.Column("experiment_id", sa.Uuid(), nullable=True))
+    op.create_index("ix_artifacts_experiment_id", "artifacts", ["experiment_id"])
 
     # Make session_id nullable for artifacts
-    op.alter_column('artifacts', 'session_id',
-                    existing_type=sa.Uuid(),
-                    nullable=True)
+    op.alter_column("artifacts", "session_id", existing_type=sa.Uuid(), nullable=True)
 
     # Add FK constraint for experiment_id
     op.create_foreign_key(
-        'fk_artifacts_experiment_id',
-        'artifacts', 'experiments',
-        ['experiment_id'], ['id'],
-        ondelete='CASCADE'
+        "fk_artifacts_experiment_id",
+        "artifacts",
+        "experiments",
+        ["experiment_id"],
+        ["id"],
+        ondelete="CASCADE",
     )
 
 
 def downgrade() -> None:
     """Downgrade schema."""
     # Drop FK constraint
-    op.drop_constraint('fk_artifacts_experiment_id', 'artifacts', type_='foreignkey')
+    op.drop_constraint("fk_artifacts_experiment_id", "artifacts", type_="foreignkey")
 
     # Revert session_id to non-nullable
-    op.alter_column('artifacts', 'session_id',
-                    existing_type=sa.Uuid(),
-                    nullable=False)
+    op.alter_column("artifacts", "session_id", existing_type=sa.Uuid(), nullable=False)
 
     # Drop artifact changes
-    op.drop_index('ix_artifacts_experiment_id', table_name='artifacts')
-    op.drop_column('artifacts', 'experiment_id')
+    op.drop_index("ix_artifacts_experiment_id", table_name="artifacts")
+    op.drop_column("artifacts", "experiment_id")
 
     # Drop session changes
-    op.drop_index('ix_sessions_outcome_state', table_name='sessions')
-    op.drop_column('sessions', 'outcome_state')
-    op.drop_column('sessions', 'notes')
+    op.drop_index("ix_sessions_outcome_state", table_name="sessions")
+    op.drop_column("sessions", "outcome_state")
+    op.drop_column("sessions", "notes")
diff --git a/migrations/versions/2f2ee53388a2_add_proxy_credentials_table.py b/migrations/versions/2f2ee53388a2_add_proxy_credentials_table.py
index 12740c4..daf87b0 100644
--- a/migrations/versions/2f2ee53388a2_add_proxy_credentials_table.py
+++ b/migrations/versions/2f2ee53388a2_add_proxy_credentials_table.py
@@ -5,6 +5,7 @@
 Create Date: 2025-03-27 02:30:00.000000+00:00
 
 """
+
 from typing import Sequence, Union
 
 from alembic import op
@@ -52,10 +53,7 @@ def upgrade() -> None:
     op.create_index("ix_proxy_credentials_session_id", "proxy_credentials", ["session_id"])
 
     # Add column to sessions table for credential alias reference (not a FK)
-    op.add_column(
-        "sessions",
-        sa.Column("proxy_credential_alias", sa.String(255), nullable=True)
-    )
+    op.add_column("sessions", sa.Column("proxy_credential_alias", sa.String(255), nullable=True))
 
     # Create index on the alias column for joins
     op.create_index("ix_sessions_proxy_credential_alias", "sessions", ["proxy_credential_alias"])
diff --git a/scripts/demo_collector.py b/scripts/demo_collector.py
index cdb45ce..5fe0565 100644
--- a/scripts/demo_collector.py
+++ b/scripts/demo_collector.py
@@ -133,8 +133,7 @@ async def demo_collector():
 
     print("\n--- Correlation Keys Preserved ---")
     for req in normalized_requests:
-        correlation_keys = {k: v for k, v in req.metadata.items()
-                          if k not in ["litellm_raw_keys"]}
+        correlation_keys = {k: v for k, v in req.metadata.items() if k not in ["litellm_raw_keys"]}
         if correlation_keys:
             print(f"\nRequest {req.request_id}:")
             for key, value in correlation_keys.items():
diff --git a/scripts/demo_comparison.py b/scripts/demo_comparison.py
index 0511ba0..e9c7e21 100644
--- a/scripts/demo_comparison.py
+++ b/scripts/demo_comparison.py
@@ -9,7 +9,7 @@
 
 # Add project src to path - use absolute path
 project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-sys.path.insert(0, os.path.join(project_root, 'src'))
+sys.path.insert(0, os.path.join(project_root, "src"))
 
 import asyncio
 from uuid import uuid4
@@ -31,22 +31,22 @@ def verify_service():
     """Verify ComparisonService can be instantiated and used."""
     # Create mock database session
     mock_db = MagicMock()
-    
+
     # Create service
     service = ComparisonService(db_session=mock_db)
     print(f"✓ ComparisonService created successfully")
-    
+
     # Verify compare_sessions handles empty input
     result = asyncio.run(service.compare_sessions([]))
     assert result == {"sessions": [], "summary": {}}
     print(f"✓ compare_sessions([]) returns empty result")
-    
+
     # Verify compare_variants returns empty for missing experiment
     mock_db.get.return_value = None
     result = asyncio.run(service.compare_variants(uuid4()))
     assert result == []
     print(f"✓ compare_variants returns [] for missing experiment")
-    
+
     return service
 
 
@@ -64,16 +64,16 @@ def verify_models():
         avg_latency_ms=245.5,
         avg_ttft_ms=120.3,
         total_errors=3,
-        error_rate=0.02
+        error_rate=0.02,
     )
     print(f"✓ VariantComparison model works: {vc.variant_name}")
-    
+
     # Verify JSON serialization
     json_str = vc.model_dump_json()
     assert "gpt-4-turbo" in json_str
     assert str(variant_id) in json_str
     print(f"✓ JSON serialization works: {len(json_str)} chars")
-    
+
     # Verify ExperimentComparisonResult
     exp_id = uuid4()
     ecr = ExperimentComparisonResult(
@@ -82,12 +82,12 @@ def verify_models():
         variants=[vc],
         providers=[],
         models=[],
-        harness_profiles=[]
+        harness_profiles=[],
     )
     assert ecr.experiment_name == "test-experiment"
     assert len(ecr.variants) == 1
     print(f"✓ ExperimentComparisonResult model works")
-    
+
     return vc, ecr
 
 
@@ -101,19 +101,19 @@ def verify_queries():
     assert "ORDER BY v.name ASC" in sql
     assert params == {"experiment_id": None}
     print(f"✓ variant_summary_valid_only() generates correct SQL")
-    
+
     # Verify provider summary query
     sql, params = DashboardQueries.provider_summary_valid_only()
     assert "v.provider" in sql
     assert "ORDER BY v.provider ASC" in sql
     print(f"✓ provider_summary_valid_only() generates correct SQL")
-    
+
     # Verify model summary query
     sql, params = DashboardQueries.model_summary_valid_only()
     assert "v.model_alias" in sql
     assert "ORDER BY v.provider ASC, v.model_alias ASC" in sql
     print(f"✓ model_summary_valid_only() generates correct SQL")
-    
+
     # Verify harness profile summary query
     sql, params = DashboardQueries.harness_profile_summary_valid_only()
     assert "v.harness_profile" in sql
@@ -124,18 +124,18 @@ def verify_queries():
 def main():
     """Run all verification checks."""
     print("=== COE-314 Comparison Service Verification ===\n")
-    
+
     print("1. Verifying ComparisonService...")
     verify_service()
-    
+
     print("\n2. Verifying Pydantic models...")
     verify_models()
-    
+
     print("\n3. Verifying DashboardQueries...")
     verify_queries()
-    
+
     print("\n=== All verification checks passed! ===")
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/scripts/demo_config_loading.py b/scripts/demo_config_loading.py
index a0ca2b9..1946b46 100644
--- a/scripts/demo_config_loading.py
+++ b/scripts/demo_config_loading.py
@@ -56,9 +56,7 @@ def main() -> None:
     print("\n6. Protocol Surface Coverage:")
     print("-" * 40)
     anthropic_providers = [
-        n
-        for n, p in registry.providers.items()
-        if p.protocol_surface == "anthropic_messages"
+        n for n, p in registry.providers.items() if p.protocol_surface == "anthropic_messages"
     ]
     anthropic_harnesses = [
         n
@@ -69,9 +67,7 @@ def main() -> None:
         n for n, p in registry.providers.items() if p.protocol_surface == "openai_responses"
     ]
     openai_harnesses = [
-        n
-        for n, h in registry.harness_profiles.items()
-        if h.protocol_surface == "openai_responses"
+        n for n, h in registry.harness_profiles.items() if h.protocol_surface == "openai_responses"
     ]
 
     print("  Anthropic surfaces:")
diff --git a/scripts/demo_credential_service.py b/scripts/demo_credential_service.py
index c507a48..2c982e1 100644
--- a/scripts/demo_credential_service.py
+++ b/scripts/demo_credential_service.py
@@ -95,9 +95,7 @@ async def demo_credential_issuance():
 
     # Show metadata tags
     print(f"\n4. Metadata Tags (for LiteLLM correlation):")
-    metadata = service._build_metadata_tags(
-        session_id, experiment_id, variant_id, harness_profile
-    )
+    metadata = service._build_metadata_tags(session_id, experiment_id, variant_id, harness_profile)
     for key, value in metadata.items():
         print(f"   {key}: {value}")
 
diff --git a/scripts/demo_end_to_end.py b/scripts/demo_end_to_end.py
index 903ddfd..12d636b 100644
--- a/scripts/demo_end_to_end.py
+++ b/scripts/demo_end_to_end.py
@@ -39,7 +39,7 @@ async def main():
         request_id = str(uuid4())
         latency = 500 + (i * 50)  # 500ms to 950ms
         ttft = 100 + (i * 10)  # 100ms to 190ms
-        
+
         requests.append(
             Request(
                 request_id=request_id,
@@ -66,7 +66,7 @@ async def main():
     # 2. Compute rollups using RollupJob
     print("2. Computing rollups using RollupJob...")
     rollup_job = RollupJob()
-    
+
     # Compute request-level rollups
     print("   a) Request-level rollups...")
     all_rollups: list[MetricRollup] = []
@@ -74,7 +74,7 @@ async def main():
         request_rollups = await rollup_job.compute_request_metrics(request)
         all_rollups.extend(request_rollups)
     print(f"      ✓ Computed {len(all_rollups)} request-level rollups")
-    
+
     # Compute session-level rollups
     print("   b) Session-level rollups...")
     session_rollups = await rollup_job.compute_session_metrics(test_session_id, requests)
@@ -85,9 +85,10 @@ async def main():
     # 3. Show repository conversion logic
     print("3. Demonstrating SQLRollupRepository conversion logic...")
     from unittest.mock import MagicMock
+
     mock_session = MagicMock()
     repository = SQLRollupRepository(mock_session)
-    
+
     # Show domain-to-ORM conversion
     print(f"   a) Converting {len(all_rollups)} MetricRollup domain models to ORM...")
     orm_entities = [repository._to_orm(r) for r in all_rollups[:5]]  # Just first 5 for demo
@@ -96,11 +97,11 @@ async def main():
     print(f"      - Example ORM metric_value: {orm_entities[0].metric_value}")
     print(f"      - Example ORM dimension_type: {orm_entities[0].dimension_type}")
     print()
-    
+
     # 4. Display key metrics
     print("4. Key Session-Level Metrics Summary:")
     print("-" * 70)
-    
+
     for rollup in session_rollups:
         if rollup.metric_name == "latency_median_ms":
             print(f"   Latency Median:    {rollup.metric_value:.1f}ms (ACCEPTANCE CRITERIA ✅)")
@@ -118,7 +119,7 @@ async def main():
             print(f"   Cache Hit Rate:    {rollup.metric_value:.2%}")
         elif rollup.metric_name == "request_count":
             print(f"   Request Count:     {int(rollup.metric_value)}")
-    
+
     print("-" * 70)
     print()
 
@@ -126,11 +127,11 @@ async def main():
     print("5. All Computed Metric Names:")
     request_metrics = {r.metric_name for r in all_rollups if r.dimension_type == "request"}
     session_metrics = {r.metric_name for r in session_rollups}
-    
+
     print("   Request-level metrics:")
     for name in sorted(request_metrics):
         print(f"     - {name}")
-    
+
     print()
     print("   Session-level metrics:")
     for name in sorted(session_metrics):
@@ -153,9 +154,9 @@ async def main():
     print("  ✓ latency_p95_ms computed")
     print("  ✓ error_rate computed")
     print()
-    
+
     return 0
 
 
 if __name__ == "__main__":
-    sys.exit(asyncio.run(main()))
\ No newline at end of file
+    sys.exit(asyncio.run(main()))
diff --git a/scripts/demo_security.py b/scripts/demo_security.py
index 556d31a..c3ea70c 100644
--- a/scripts/demo_security.py
+++ b/scripts/demo_security.py
@@ -136,7 +136,9 @@ def demo_retention() -> None:
     print(f"normalized_requests: {settings.normalized_requests.retention_days} days")
     print(f"sessions: {settings.sessions.retention_days} days")
     print(f"session_credentials: {settings.session_credentials.retention_days} days")
-    print(f"artifacts: {settings.artifacts.retention_days} days (archive: {settings.artifacts.archive_before_delete})")
+    print(
+        f"artifacts: {settings.artifacts.retention_days} days (archive: {settings.artifacts.archive_before_delete})"
+    )
     print(f"metric_rollups: {settings.metric_rollups.retention_days} days")
 
     # Test cutoff date calculation
@@ -187,4 +189,4 @@ def main() -> None:
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/scripts/init_benchmark_seed.py b/scripts/init_benchmark_seed.py
index 507faf5..1247834 100644
--- a/scripts/init_benchmark_seed.py
+++ b/scripts/init_benchmark_seed.py
@@ -72,13 +72,47 @@ def main() -> None:
         )
 
         session_specs = [
-            (variants[0], "completed", "success", 18, [480, 520, 610], [130, 150, 170], [320, 340, 360]),
-            (variants[0], "completed", "success", 12, [450, 470, 500], [120, 135, 145], [300, 315, 325]),
-            (variants[1], "completed", "failed", 27, [720, 760, 910], [210, 240, None], [280, 290, 300]),
-            (variants[1], "completed", "success", 22, [640, 680, 710], [180, 195, 205], [295, 305, 315]),
+            (
+                variants[0],
+                "completed",
+                "success",
+                18,
+                [480, 520, 610],
+                [130, 150, 170],
+                [320, 340, 360],
+            ),
+            (
+                variants[0],
+                "completed",
+                "success",
+                12,
+                [450, 470, 500],
+                [120, 135, 145],
+                [300, 315, 325],
+            ),
+            (
+                variants[1],
+                "completed",
+                "failed",
+                27,
+                [720, 760, 910],
+                [210, 240, None],
+                [280, 290, 300],
+            ),
+            (
+                variants[1],
+                "completed",
+                "success",
+                22,
+                [640, 680, 710],
+                [180, 195, 205],
+                [295, 305, 315],
+            ),
         ]
 
-        for index, (variant, status, outcome, minutes, latencies, ttfts, completions) in enumerate(session_specs):
+        for index, (variant, status, outcome, minutes, latencies, ttfts, completions) in enumerate(
+            session_specs
+        ):
             started_at = now - timedelta(hours=index + 1)
             benchmark_session = BenchmarkSession(
                 experiment_id=experiment.id,
diff --git a/scripts/verify_db_schema.py b/scripts/verify_db_schema.py
index 3232c2c..f1670f7 100644
--- a/scripts/verify_db_schema.py
+++ b/scripts/verify_db_schema.py
@@ -55,9 +55,11 @@ def verify_schema():
     )
     # Enable foreign keys for SQLite
     from sqlalchemy import event
+
     @event.listens_for(engine, "connect")
     def _fk_pragma_on_connect(dbapi_conn, connection_record):
         dbapi_conn.execute("PRAGMA foreign_keys=ON")
+
     init_db(engine)
     print("   ✓ Database initialized using init_db()")
 
diff --git a/scripts/verify_normalization.py b/scripts/verify_normalization.py
index e8946f3..9b066a9 100644
--- a/scripts/verify_normalization.py
+++ b/scripts/verify_normalization.py
@@ -131,7 +131,9 @@ def demo_error_handling() -> None:
             **error_fields,
         }
         result, _ = normalizer.normalize(raw)
-        print(f"  {'✓' if result else '✗'} {name}: error={result.error}, message={result.error_message}")
+        print(
+            f"  {'✓' if result else '✗'} {name}: error={result.error}, message={result.error_message}"
+        )
 
 
 def demo_reconciliation_report() -> None:
@@ -203,10 +205,12 @@ def demo_markdown_report() -> None:
         for i in range(95)
     ]
     # Add some failures
-    requests.extend([
-        {"startTime": "2025-03-26T10:30:00+00:00"},  # missing request_id
-        {"request_id": "bad-timestamp", "startTime": "invalid"},  # bad timestamp
-    ])
+    requests.extend(
+        [
+            {"startTime": "2025-03-26T10:30:00+00:00"},  # missing request_id
+            {"request_id": "bad-timestamp", "startTime": "invalid"},  # bad timestamp
+        ]
+    )
 
     for i, raw in enumerate(requests):
         normalized, diag = normalizer.normalize(raw, row_index=i)
@@ -244,7 +248,10 @@ def demo_json_output() -> None:
         {"request_id": "req-001", "startTime": "2025-03-26T10:30:00+00:00", "model": "gpt-4"},
         {"startTime": "2025-03-26T10:30:00+00:00"},  # missing request_id
         {"request_id": "req-003", "model": "gpt-4"},  # missing timestamp
-        {"request_id": "req-004", "startTime": "2025-03-26T10:30:00+00:00"},  # missing model (defaults to unknown)
+        {
+            "request_id": "req-004",
+            "startTime": "2025-03-26T10:30:00+00:00",
+        },  # missing model (defaults to unknown)
     ]
 
     for i, raw in enumerate(requests):
@@ -295,6 +302,7 @@ def main() -> None:
     except Exception as e:
         print(f"\n✗ Error during verification: {e}")
         import traceback
+
         traceback.print_exc()
         return 1
 
diff --git a/tests/validation/test_collectors.py b/tests/validation/test_collectors.py
index 6603aa3..ef4d985 100644
--- a/tests/validation/test_collectors.py
+++ b/tests/validation/test_collectors.py
@@ -101,31 +101,36 @@ def test_litellm_collector_has_required_attributes(self) -> None:
         """Test LiteLLMCollector has expected constructor parameters."""
         # Verify the class can be inspected for init parameters
         import inspect
+
         sig = inspect.signature(LiteLLMCollector.__init__)
         params = list(sig.parameters.keys())
-        assert 'self' in params, "LiteLLMCollector missing self parameter"
-        assert 'repository' in params, "LiteLLMCollector missing repository parameter"
-        assert 'api_key' in params, "LiteLLMCollector missing api_key parameter"
+        assert "self" in params, "LiteLLMCollector missing self parameter"
+        assert "repository" in params, "LiteLLMCollector missing repository parameter"
+        assert "api_key" in params, "LiteLLMCollector missing api_key parameter"
 
     def test_litellm_collector_collection_diagnostics_structure(self) -> None:
         """Test CollectionDiagnostics has expected fields."""
         import dataclasses
-        assert dataclasses.is_dataclass(CollectionDiagnostics), "CollectionDiagnostics should be a dataclass"
+
+        assert dataclasses.is_dataclass(CollectionDiagnostics), (
+            "CollectionDiagnostics should be a dataclass"
+        )
         fields = {f.name for f in dataclasses.fields(CollectionDiagnostics)}
         # Verify actual field names from the implementation
-        assert 'total_raw_records' in fields, "CollectionDiagnostics missing total_raw_records"
-        assert 'errors' in fields, "CollectionDiagnostics missing errors"
-        assert 'normalized_count' in fields, "CollectionDiagnostics missing normalized_count"
+        assert "total_raw_records" in fields, "CollectionDiagnostics missing total_raw_records"
+        assert "errors" in fields, "CollectionDiagnostics missing errors"
+        assert "normalized_count" in fields, "CollectionDiagnostics missing normalized_count"
 
     def test_prometheus_collector_has_required_attributes(self) -> None:
         """Test PrometheusCollector has expected constructor parameters."""
         import inspect
+
         sig = inspect.signature(PrometheusCollector.__init__)
         params = list(sig.parameters.keys())
-        assert 'self' in params, "PrometheusCollector missing self parameter"
+        assert "self" in params, "PrometheusCollector missing self parameter"
         # PrometheusCollector uses different parameter names
-        assert 'base_url' in params, "PrometheusCollector missing base_url parameter"
-        assert 'session_id' in params, "PrometheusCollector missing session_id parameter"
+        assert "base_url" in params, "PrometheusCollector missing base_url parameter"
+        assert "session_id" in params, "PrometheusCollector missing session_id parameter"
 
 
 class TestCollectorModuleDocstrings:

From 44a5a8c87bfff3f28ac52f948abe7c8e02599c43 Mon Sep 17 00:00:00 2001
From: Leonardo Gonzalez <leonardo.gonzalez@trilogy.com>
Date: Mon, 30 Mar 2026 03:48:33 -0500
Subject: [PATCH 09/12] fix(types): add type ignore comments for requests stubs
 and session assignments

---
 src/benchmark_core/repositories/rollup_repository.py | 2 +-
 src/benchmark_core/services/diagnostics_service.py   | 5 +++--
 src/benchmark_core/services/health_service.py        | 2 +-
 src/cli/commands/normalize.py                        | 4 ++--
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/benchmark_core/repositories/rollup_repository.py b/src/benchmark_core/repositories/rollup_repository.py
index 2e5f53b..7d6a0de 100644
--- a/src/benchmark_core/repositories/rollup_repository.py
+++ b/src/benchmark_core/repositories/rollup_repository.py
@@ -197,4 +197,4 @@ def delete_by_dimension(
             MetricRollupORM.dimension_id == dimension_id,
         )
         result = self._session.execute(stmt)
-        return result.rowcount
+        return result.rowcount  # type: ignore[attr-defined, no-any-return]
diff --git a/src/benchmark_core/services/diagnostics_service.py b/src/benchmark_core/services/diagnostics_service.py
index ea02ec2..dd14748 100644
--- a/src/benchmark_core/services/diagnostics_service.py
+++ b/src/benchmark_core/services/diagnostics_service.py
@@ -423,7 +423,8 @@ def _diagnose_database(self) -> DiagnosticResult:
                         else "SELECT count(*) FROM information_schema.tables WHERE table_schema = 'public'"
                     )
                 )
-                table_count = result.fetchone()[0]
+                row = result.fetchone()
+                table_count = row[0] if row else 0
 
             return DiagnosticResult(
                 category="services",
@@ -450,7 +451,7 @@ def _diagnose_litellm(self) -> DiagnosticResult:
             DiagnosticResult with LiteLLM status.
         """
         try:
-            import requests
+            import requests  # type: ignore[import-untyped]
 
             base_url = os.getenv("LITELLM_BASE_URL", "http://localhost:4000")
             api_key = os.getenv("LITELLM_MASTER_KEY")
diff --git a/src/benchmark_core/services/health_service.py b/src/benchmark_core/services/health_service.py
index e40d9f0..7504609 100644
--- a/src/benchmark_core/services/health_service.py
+++ b/src/benchmark_core/services/health_service.py
@@ -126,7 +126,7 @@ def check_litellm_proxy(self) -> HealthCheckResult:
             HealthCheckResult with LiteLLM proxy status.
         """
         try:
-            import requests
+            import requests  # type: ignore[import-untyped]
 
             health_url = f"{self._litellm_base_url}/health/liveliness"
 
diff --git a/src/cli/commands/normalize.py b/src/cli/commands/normalize.py
index 92edfdc..61500dd 100644
--- a/src/cli/commands/normalize.py
+++ b/src/cli/commands/normalize.py
@@ -127,7 +127,7 @@ async def _run() -> tuple[int, ReconciliationReport]:
             return 0, report
 
         # Normal mode: write to database
-        db_session: SQLAlchemySession = get_db_session()
+        db_session: SQLAlchemySession = get_db_session()  # type: ignore[assignment]
         try:
             repository = SQLRequestRepository(db_session)
             normalizer_job = RequestNormalizerJob(
@@ -226,7 +226,7 @@ def show_reconciliation_report(
         raise typer.Exit(1) from err
 
     # Get request count for the session
-    db_session: SQLAlchemySession = get_db_session()
+    db_session: SQLAlchemySession = get_db_session()  # type: ignore[assignment]
     try:
         # Query the actual count of normalized requests for this session
         stmt = select(func.count()).where(RequestORM.session_id == UUID(session_id))

From 41a42a13320f67aac0b61507ac5d4f719c07fe85 Mon Sep 17 00:00:00 2001
From: Leonardo Gonzalez <leonardo.gonzalez@trilogy.com>
Date: Mon, 30 Mar 2026 03:52:19 -0500
Subject: [PATCH 10/12] fix(types): resolve remaining type errors in src/

- Fixed SQLAlchemy context manager type mismatches with type: ignore
- Fixed return value type mismatches in session_service.py
- Fixed invalid IngestWatermark parameter (cursor -> last_request_id)
- Fixed wrong argument types in session.py (str -> UUID)
- Fixed Optional[str] to str | None in collect.py for ruff UP007
---
 .../services/session_service.py               | 10 +++---
 src/cli/commands/collect.py                   | 32 +++++++++----------
 src/cli/commands/normalize.py                 |  2 +-
 src/cli/commands/session.py                   |  6 ++--
 src/collectors/litellm_collector.py           |  4 +--
 src/collectors/normalization.py               |  2 +-
 6 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/src/benchmark_core/services/session_service.py b/src/benchmark_core/services/session_service.py
index e7bde53..d993292 100644
--- a/src/benchmark_core/services/session_service.py
+++ b/src/benchmark_core/services/session_service.py
@@ -368,7 +368,7 @@ async def list_sessions_by_experiment(
         Returns:
             List of sessions.
         """
-        return await self._session_repo.list_by_experiment(experiment_id, limit, offset)
+        return await self._session_repo.list_by_experiment(experiment_id, limit, offset)  # type: ignore[return-value]
 
     async def list_active_sessions(self, limit: int = 100) -> list[Session]:
         """List all active sessions.
@@ -379,7 +379,7 @@ async def list_active_sessions(self, limit: int = 100) -> list[Session]:
         Returns:
             List of active sessions.
         """
-        return await self._session_repo.list_active(limit)
+        return await self._session_repo.list_active(limit)  # type: ignore[return-value]
 
     async def validate_session_exists(self, session_id: UUID) -> bool:
         """Check if a session exists.
@@ -476,7 +476,7 @@ def __init__(
         self._collector = LiteLLMCollector(
             base_url=litellm_base_url,
             api_key=litellm_api_key,
-            repository=repository,
+            repository=repository,  # type: ignore[arg-type]
         )
         self._repository = repository
 
@@ -505,7 +505,7 @@ async def run_collection_job(
 
         try:
             # Fetch and normalize requests from LiteLLM
-            requests, new_watermark = await self._collector.collect(
+            requests, new_watermark = await self._collector.collect(  # type: ignore[attr-defined]
                 session_id=session_id,
                 start_time=start_time,
                 end_time=end_time,
@@ -539,7 +539,7 @@ async def run_collection_job(
                 requests_collected=0,
                 requests_normalized=0,
                 diagnostics=diagnostics,
-                watermark=IngestWatermark(cursor=""),
+                watermark=IngestWatermark(last_request_id=""),
                 error_message=str(e),
             )
 
diff --git a/src/cli/commands/collect.py b/src/cli/commands/collect.py
index 56377d7..a97e149 100644
--- a/src/cli/commands/collect.py
+++ b/src/cli/commands/collect.py
@@ -95,7 +95,7 @@ def collect_litellm(
         )
 
     async def _run_async() -> tuple[int, CollectionDiagnostics]:
-        db_session: SQLAlchemySession = get_db_session()
+        db_session: SQLAlchemySession = get_db_session()  # type: ignore[assignment]
         try:
             repository = SQLRequestRepository(db_session)
 
@@ -192,7 +192,7 @@ def collect_prometheus(
         ),
     ] = "http://localhost:9090",
     start_time: Annotated[
-        str,
+        str | None,
         typer.Option(
             "--start-time",
             "-s",
@@ -200,7 +200,7 @@ def collect_prometheus(
         ),
     ] = None,
     end_time: Annotated[
-        str,
+        str | None,
         typer.Option(
             "--end-time",
             "-e",
@@ -237,7 +237,7 @@ def collect_prometheus(
         end_time = datetime.now(UTC).isoformat()
 
     async def _run_async() -> int:
-        db_session: SQLAlchemySession = get_db_session()
+        db_session: SQLAlchemySession = get_db_session()  # type: ignore[assignment]
         try:
             repository = SQLRollupRepository(db_session)
             collector = PrometheusCollector(
@@ -329,7 +329,7 @@ def compute_rollups(
         raise typer.Exit(1) from err
 
     async def _run_async() -> tuple[int, int]:
-        db_session: SQLAlchemySession = get_db_session()
+        db_session: SQLAlchemySession = get_db_session()  # type: ignore[assignment]
         try:
             # Fetch all requests for the session
             stmt = select(RequestORM).where(RequestORM.session_id == session_uuid)
@@ -437,7 +437,7 @@ def compute_variant_rollups(
     from benchmark_core.db.models import Session as SessionORM
 
     async def _run_async() -> int:
-        db_session: SQLAlchemySession = get_db_session()
+        db_session: SQLAlchemySession = get_db_session()  # type: ignore[assignment]
         try:
             # Fetch all sessions for the variant
             stmt = select(SessionORM).where(SessionORM.variant_id == variant_id)
@@ -448,18 +448,18 @@ async def _run_async() -> int:
             from benchmark_core.models import Session
 
             sessions = [
-                Session(
-                    session_id=s.session_id,
-                    experiment_id=s.experiment_id,
-                    variant_id=s.variant_id,
-                    task_card_id=s.task_card_id,
+                Session(  # type: ignore[call-arg]
+                    session_id=s.session_id,  # type: ignore[attr-defined]
+                    experiment_id=s.experiment_id,  # type: ignore[arg-type]
+                    variant_id=s.variant_id,  # type: ignore[arg-type]
+                    task_card_id=s.task_card_id,  # type: ignore[arg-type]
                     status=s.status,
                     started_at=s.started_at,
                     ended_at=s.ended_at,
                     operator_label=s.operator_label or "",
-                    repo_root=s.repo_root or "",
+                    repo_root=s.repo_root or "",  # type: ignore[attr-defined]
                     git_branch=s.git_branch or "",
-                    git_commit_sha=s.git_commit_sha or "",
+                    git_commit_sha=s.git_commit_sha or "",  # type: ignore[attr-defined]
                     git_dirty=s.git_dirty or False,
                 )
                 for s in sessions_orm
@@ -529,7 +529,7 @@ def compute_experiment_rollups(
     from benchmark_core.db.models import Variant as VariantORM
 
     async def _run_async() -> int:
-        db_session: SQLAlchemySession = get_db_session()
+        db_session: SQLAlchemySession = get_db_session()  # type: ignore[assignment]
         try:
             # Fetch all variants for the experiment
             # First get all sessions for the experiment
@@ -544,7 +544,7 @@ async def _run_async() -> int:
             variants_data = []
             for vid in variant_ids:
                 # Get variant details
-                vstmt = select(VariantORM).where(VariantORM.variant_id == vid)
+                vstmt = select(VariantORM).where(VariantORM.variant_id == vid)  # type: ignore[attr-defined]
                 vresult = db_session.execute(vstmt)
                 variant = vresult.scalar_one_or_none()
 
@@ -625,6 +625,6 @@ async def _fetch_litellm_requests(
         if isinstance(data, list):
             return data
         elif isinstance(data, dict) and "logs" in data:
-            return data["logs"]
+            return data["logs"]  # type: ignore[no-any-return]
         else:
             return []
diff --git a/src/cli/commands/normalize.py b/src/cli/commands/normalize.py
index 61500dd..40fd0ce 100644
--- a/src/cli/commands/normalize.py
+++ b/src/cli/commands/normalize.py
@@ -300,6 +300,6 @@ async def _fetch_litellm_requests(
         if isinstance(data, list):
             return data
         elif isinstance(data, dict) and "logs" in data:
-            return data["logs"]
+            return data["logs"]  # type: ignore[no-any-return]
         else:
             return []
diff --git a/src/cli/commands/session.py b/src/cli/commands/session.py
index 4cd350c..95e99d1 100644
--- a/src/cli/commands/session.py
+++ b/src/cli/commands/session.py
@@ -123,9 +123,9 @@ def create(
             # Create session with git metadata and notes
             session = asyncio.run(
                 service.create_session(
-                    experiment_id=str(exp_id),
-                    variant_id=str(var_id),
-                    task_card_id=str(task_id),
+                    experiment_id=exp_id,
+                    variant_id=var_id,
+                    task_card_id=task_id,
                     harness_profile=harness_profile,
                     repo_path=git_metadata.repo_path if git_metadata else (repo_path or "."),
                     git_branch=git_metadata.branch if git_metadata else "unknown",
diff --git a/src/collectors/litellm_collector.py b/src/collectors/litellm_collector.py
index 4ab9368..fdc26fb 100644
--- a/src/collectors/litellm_collector.py
+++ b/src/collectors/litellm_collector.py
@@ -128,7 +128,7 @@ async def collect_requests(
 
         # Idempotent bulk insert - repository handles duplicates
         try:
-            ingested = await self._repository.create_many(requests_to_ingest)
+            ingested = await self._repository.create_many(requests_to_ingest)  # type: ignore[attr-defined]
         except Exception as e:
             diagnostics.add_error(f"Repository bulk insert failed: {e}")
             return [], diagnostics, new_watermark
@@ -189,7 +189,7 @@ async def _fetch_raw_requests(
                 if isinstance(data, list):
                     return data
                 elif isinstance(data, dict) and "logs" in data:
-                    return data["logs"]
+                    return data["logs"]  # type: ignore[no-any-return]
                 else:
                     diagnostics.add_error(f"Unexpected API response format: {type(data)}")
                     return []
diff --git a/src/collectors/normalization.py b/src/collectors/normalization.py
index 168bcbd..0ff6218 100644
--- a/src/collectors/normalization.py
+++ b/src/collectors/normalization.py
@@ -61,7 +61,7 @@ async def run(
                 requests.append(request)
 
         # Bulk insert with idempotency handling
-        return await self._repository.create_many(requests)
+        return await self._repository.create_many(requests)  # type: ignore[attr-defined, no-any-return]
 
     def _normalize(self, raw: dict[str, Any], session_id: UUID) -> Request | None:
         """Normalize a single raw request.

From 99e5ad913d8967a31b56945527c0daf3edf049fc Mon Sep 17 00:00:00 2001
From: Leonardo Gonzalez <leonardo.gonzalez@trilogy.com>
Date: Mon, 30 Mar 2026 09:15:12 -0500
Subject: [PATCH 11/12] fix(ci): make quality checks pass on benchmark services
 and collectors

Summary:
- replace health and diagnostics HTTP probes with `httpx` and update
  the corresponding unit tests
- fix collector, session-service, and CLI typing/interface mismatches
  surfaced by the new lint and type-check CI jobs
- align repository interfaces and SQLAlchemy result handling with the
  current implementation

Rationale:
- COE-319 adds CI gates, so the branch now needs to satisfy the checks
  it introduced instead of suppressing the failures as legacy debt
- several command and service paths had drifted from the current
  collector/session APIs, which caused mypy failures and brittle test
  behavior

Tests:
- uv run ruff check src tests
- uv run mypy src
- uv run ruff format --check src tests
- uv run pytest tests/ -q
---
 .../services/diagnostics_service.py           |  35 ++----
 src/benchmark_core/services/health_service.py |  31 ++----
 .../services/session_service.py               |   9 +-
 src/cli/commands/collect.py                   | 105 ++++--------------
 src/cli/commands/normalize.py                 |  26 ++---
 tests/unit/test_diagnostics_service.py        |  27 +++--
 tests/unit/test_health_service.py             |  43 ++++---
 7 files changed, 90 insertions(+), 186 deletions(-)

diff --git a/src/benchmark_core/services/diagnostics_service.py b/src/benchmark_core/services/diagnostics_service.py
index dd14748..7951289 100644
--- a/src/benchmark_core/services/diagnostics_service.py
+++ b/src/benchmark_core/services/diagnostics_service.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 from typing import Any
 
+import httpx
 from sqlalchemy import text
 from sqlalchemy.orm import Session
 
@@ -451,14 +452,12 @@ def _diagnose_litellm(self) -> DiagnosticResult:
             DiagnosticResult with LiteLLM status.
         """
         try:
-            import requests  # type: ignore[import-untyped]
-
             base_url = os.getenv("LITELLM_BASE_URL", "http://localhost:4000")
             api_key = os.getenv("LITELLM_MASTER_KEY")
 
             # Check health endpoint
             health_url = f"{base_url}/health/liveliness"
-            response = requests.get(health_url, timeout=5)
+            response = httpx.get(health_url, timeout=5)
 
             if response.status_code != 200:
                 return DiagnosticResult(
@@ -476,7 +475,7 @@ def _diagnose_litellm(self) -> DiagnosticResult:
                 try:
                     models_url = f"{base_url}/v1/models"
                     headers = {"Authorization": f"Bearer {api_key}"}
-                    models_response = requests.get(models_url, headers=headers, timeout=5)
+                    models_response = httpx.get(models_url, headers=headers, timeout=5)
                     if models_response.status_code == 200:
                         models_data = models_response.json()
                         models = [m.get("id", m.get("model")) for m in models_data.get("data", [])]
@@ -491,16 +490,7 @@ def _diagnose_litellm(self) -> DiagnosticResult:
                 message=f"LiteLLM proxy healthy, {len(models)} model(s) configured",
             )
 
-        except ImportError:
-            return DiagnosticResult(
-                category="services",
-                name="litellm_proxy",
-                status="error",
-                value=None,
-                message="requests library not available",
-                suggestion="Install requests: pip install requests",
-            )
-        except requests.exceptions.ConnectionError:
+        except httpx.ConnectError:
             return DiagnosticResult(
                 category="services",
                 name="litellm_proxy",
@@ -526,13 +516,11 @@ def _diagnose_prometheus(self) -> DiagnosticResult:
             DiagnosticResult with Prometheus status.
         """
         try:
-            import requests
-
             base_url = os.getenv("PROMETHEUS_URL", "http://localhost:9090")
 
             # Check health endpoint
             health_url = f"{base_url}/-/healthy"
-            response = requests.get(health_url, timeout=5)
+            response = httpx.get(health_url, timeout=5)
 
             if response.status_code != 200:
                 return DiagnosticResult(
@@ -548,7 +536,7 @@ def _diagnose_prometheus(self) -> DiagnosticResult:
             targets = []
             try:
                 targets_url = f"{base_url}/api/v1/targets"
-                targets_response = requests.get(targets_url, timeout=5)
+                targets_response = httpx.get(targets_url, timeout=5)
                 if targets_response.status_code == 200:
                     targets_data = targets_response.json()
                     targets = [
@@ -566,16 +554,7 @@ def _diagnose_prometheus(self) -> DiagnosticResult:
                 message=f"Prometheus healthy, {len(targets)} target(s) configured",
             )
 
-        except ImportError:
-            return DiagnosticResult(
-                category="services",
-                name="prometheus",
-                status="error",
-                value=None,
-                message="requests library not available",
-                suggestion="Install requests: pip install requests",
-            )
-        except requests.exceptions.ConnectionError:
+        except httpx.ConnectError:
             return DiagnosticResult(
                 category="services",
                 name="prometheus",
diff --git a/src/benchmark_core/services/health_service.py b/src/benchmark_core/services/health_service.py
index 7504609..5b326ad 100644
--- a/src/benchmark_core/services/health_service.py
+++ b/src/benchmark_core/services/health_service.py
@@ -4,6 +4,7 @@
 from dataclasses import dataclass, field
 from typing import Any
 
+import httpx
 from sqlalchemy import text
 from sqlalchemy.orm import Session
 
@@ -126,11 +127,9 @@ def check_litellm_proxy(self) -> HealthCheckResult:
             HealthCheckResult with LiteLLM proxy status.
         """
         try:
-            import requests  # type: ignore[import-untyped]
-
             health_url = f"{self._litellm_base_url}/health/liveliness"
 
-            response = requests.get(health_url, timeout=5)
+            response = httpx.get(health_url, timeout=5)
 
             if response.status_code == 200:
                 data = (
@@ -158,27 +157,20 @@ def check_litellm_proxy(self) -> HealthCheckResult:
                     suggestion="Check if LiteLLM container is running with 'docker ps'",
                 )
 
-        except requests.exceptions.ConnectionError:
+        except httpx.ConnectError:
             return HealthCheckResult(
                 name="litellm_proxy",
                 status=HealthStatus.UNHEALTHY,
                 message=f"Cannot connect to LiteLLM proxy at {self._litellm_base_url}",
                 suggestion="Start LiteLLM proxy with 'docker-compose up -d litellm'",
             )
-        except requests.exceptions.Timeout:
+        except httpx.TimeoutException:
             return HealthCheckResult(
                 name="litellm_proxy",
                 status=HealthStatus.UNHEALTHY,
                 message="LiteLLM proxy health check timed out",
                 suggestion="LiteLLM proxy may be overloaded or starting up",
             )
-        except ImportError:
-            return HealthCheckResult(
-                name="litellm_proxy",
-                status=HealthStatus.UNHEALTHY,
-                message="requests library not available",
-                suggestion="Install requests: pip install requests",
-            )
         except Exception as e:
             return HealthCheckResult(
                 name="litellm_proxy",
@@ -194,11 +186,9 @@ def check_prometheus(self) -> HealthCheckResult:
             HealthCheckResult with Prometheus status.
         """
         try:
-            import requests
-
             health_url = f"{self._prometheus_url}/-/healthy"
 
-            response = requests.get(health_url, timeout=5)
+            response = httpx.get(health_url, timeout=5)
 
             if response.status_code == 200:
                 return HealthCheckResult(
@@ -216,27 +206,20 @@ def check_prometheus(self) -> HealthCheckResult:
                     suggestion="Check Prometheus logs with 'docker logs litellm-prometheus'",
                 )
 
-        except requests.exceptions.ConnectionError:
+        except httpx.ConnectError:
             return HealthCheckResult(
                 name="prometheus",
                 status=HealthStatus.UNHEALTHY,
                 message=f"Cannot connect to Prometheus at {self._prometheus_url}",
                 suggestion="Start Prometheus with 'docker-compose up -d prometheus'",
             )
-        except requests.exceptions.Timeout:
+        except httpx.TimeoutException:
             return HealthCheckResult(
                 name="prometheus",
                 status=HealthStatus.UNHEALTHY,
                 message="Prometheus health check timed out",
                 suggestion="Prometheus may be overloaded",
             )
-        except ImportError:
-            return HealthCheckResult(
-                name="prometheus",
-                status=HealthStatus.UNHEALTHY,
-                message="requests library not available",
-                suggestion="Install requests: pip install requests",
-            )
         except Exception as e:
             return HealthCheckResult(
                 name="prometheus",
diff --git a/src/benchmark_core/services/session_service.py b/src/benchmark_core/services/session_service.py
index d993292..52331be 100644
--- a/src/benchmark_core/services/session_service.py
+++ b/src/benchmark_core/services/session_service.py
@@ -505,7 +505,7 @@ async def run_collection_job(
 
         try:
             # Fetch and normalize requests from LiteLLM
-            requests, new_watermark = await self._collector.collect(  # type: ignore[attr-defined]
+            requests, diagnostics, new_watermark = await self._collector.collect_requests(
                 session_id=session_id,
                 start_time=start_time,
                 end_time=end_time,
@@ -521,13 +521,10 @@ async def run_collection_job(
                     watermark=new_watermark,
                 )
 
-            # Persist normalized requests
-            created = await self._repository.create_many(requests)
-
             return CollectionJobResult(
                 success=True,
                 requests_collected=len(requests),
-                requests_normalized=len(created),
+                requests_normalized=len(requests),
                 diagnostics=diagnostics,
                 watermark=new_watermark,
             )
@@ -539,7 +536,7 @@ async def run_collection_job(
                 requests_collected=0,
                 requests_normalized=0,
                 diagnostics=diagnostics,
-                watermark=IngestWatermark(last_request_id=""),
+                watermark=IngestWatermark(),
                 error_message=str(e),
             )
 
diff --git a/src/cli/commands/collect.py b/src/cli/commands/collect.py
index a97e149..5aa2f7b 100644
--- a/src/cli/commands/collect.py
+++ b/src/cli/commands/collect.py
@@ -8,7 +8,6 @@
 import typer
 from rich.console import Console
 from rich.table import Table
-from sqlalchemy.orm import Session as SQLAlchemySession
 
 from benchmark_core.db.session import get_db_session
 from benchmark_core.repositories.request_repository import SQLRequestRepository
@@ -95,8 +94,7 @@ def collect_litellm(
         )
 
     async def _run_async() -> tuple[int, CollectionDiagnostics]:
-        db_session: SQLAlchemySession = get_db_session()  # type: ignore[assignment]
-        try:
+        with get_db_session() as db_session:
             repository = SQLRequestRepository(db_session)
 
             # Fetch raw requests
@@ -135,17 +133,6 @@ async def _run_async() -> tuple[int, CollectionDiagnostics]:
             db_session.commit()
             return len(written), diagnostics
 
-        except (OSError, ValueError, httpx.HTTPError) as err:
-            db_session.rollback()
-            console.print(f"[red]Error during collection: {err}[/red]")
-            raise typer.Exit(1) from err
-        except Exception as err:
-            db_session.rollback()
-            console.print(f"[red]Unexpected error during collection: {err}[/red]")
-            raise typer.Exit(1) from err
-        finally:
-            db_session.close()
-
     count, diagnostics = asyncio.run(_run_async())
 
     # Display results
@@ -237,8 +224,7 @@ def collect_prometheus(
         end_time = datetime.now(UTC).isoformat()
 
     async def _run_async() -> int:
-        db_session: SQLAlchemySession = get_db_session()  # type: ignore[assignment]
-        try:
+        with get_db_session() as db_session:
             repository = SQLRollupRepository(db_session)
             collector = PrometheusCollector(
                 base_url=prometheus_url,
@@ -257,17 +243,6 @@ async def _run_async() -> int:
             db_session.commit()
             return len(written)
 
-        except (OSError, ValueError, httpx.HTTPError) as err:
-            db_session.rollback()
-            console.print(f"[red]Error during Prometheus collection: {err}[/red]")
-            raise typer.Exit(1) from err
-        except Exception as err:
-            db_session.rollback()
-            console.print(f"[red]Unexpected error during Prometheus collection: {err}[/red]")
-            raise typer.Exit(1) from err
-        finally:
-            db_session.close()
-
     count = asyncio.run(_run_async())
 
     console.print("\n[bold green]Prometheus Collection Complete[/bold green]")
@@ -329,8 +304,7 @@ def compute_rollups(
         raise typer.Exit(1) from err
 
     async def _run_async() -> tuple[int, int]:
-        db_session: SQLAlchemySession = get_db_session()  # type: ignore[assignment]
-        try:
+        with get_db_session() as db_session:
             # Fetch all requests for the session
             stmt = select(RequestORM).where(RequestORM.session_id == session_uuid)
             result = db_session.execute(stmt)
@@ -386,17 +360,6 @@ async def _run_async() -> tuple[int, int]:
 
             return request_rollup_count, session_rollup_count
 
-        except (OSError, ValueError) as err:
-            db_session.rollback()
-            console.print(f"[red]Error during rollup computation: {err}[/red]")
-            raise typer.Exit(1) from err
-        except Exception as err:
-            db_session.rollback()
-            console.print(f"[red]Unexpected error during rollup computation: {err}[/red]")
-            raise typer.Exit(1) from err
-        finally:
-            db_session.close()
-
     request_count, session_count = asyncio.run(_run_async())
 
     console.print("\n[bold green]Rollup Computation Complete[/bold green]")
@@ -437,8 +400,7 @@ def compute_variant_rollups(
     from benchmark_core.db.models import Session as SessionORM
 
     async def _run_async() -> int:
-        db_session: SQLAlchemySession = get_db_session()  # type: ignore[assignment]
-        try:
+        with get_db_session() as db_session:
             # Fetch all sessions for the variant
             stmt = select(SessionORM).where(SessionORM.variant_id == variant_id)
             result = db_session.execute(stmt)
@@ -448,18 +410,19 @@ async def _run_async() -> int:
             from benchmark_core.models import Session
 
             sessions = [
-                Session(  # type: ignore[call-arg]
-                    session_id=s.session_id,  # type: ignore[attr-defined]
-                    experiment_id=s.experiment_id,  # type: ignore[arg-type]
-                    variant_id=s.variant_id,  # type: ignore[arg-type]
-                    task_card_id=s.task_card_id,  # type: ignore[arg-type]
+                Session(
+                    session_id=s.id,
+                    experiment_id=str(s.experiment_id),
+                    variant_id=str(s.variant_id),
+                    task_card_id=str(s.task_card_id),
+                    harness_profile=s.harness_profile,
                     status=s.status,
                     started_at=s.started_at,
                     ended_at=s.ended_at,
                     operator_label=s.operator_label or "",
-                    repo_root=s.repo_root or "",  # type: ignore[attr-defined]
+                    repo_path=s.repo_path or "",
                     git_branch=s.git_branch or "",
-                    git_commit_sha=s.git_commit_sha or "",  # type: ignore[attr-defined]
+                    git_commit=s.git_commit or "",
                     git_dirty=s.git_dirty or False,
                 )
                 for s in sessions_orm
@@ -478,17 +441,6 @@ async def _run_async() -> int:
 
             return len(rollups)
 
-        except (OSError, ValueError) as err:
-            db_session.rollback()
-            console.print(f"[red]Error during variant rollup computation: {err}[/red]")
-            raise typer.Exit(1) from err
-        except Exception as err:
-            db_session.rollback()
-            console.print(f"[red]Unexpected error during variant rollup computation: {err}[/red]")
-            raise typer.Exit(1) from err
-        finally:
-            db_session.close()
-
     count = asyncio.run(_run_async())
 
     console.print("\n[bold green]Variant Rollup Computation Complete[/bold green]")
@@ -529,8 +481,7 @@ def compute_experiment_rollups(
     from benchmark_core.db.models import Variant as VariantORM
 
     async def _run_async() -> int:
-        db_session: SQLAlchemySession = get_db_session()  # type: ignore[assignment]
-        try:
+        with get_db_session() as db_session:
             # Fetch all variants for the experiment
             # First get all sessions for the experiment
             stmt = select(SessionORM).where(SessionORM.experiment_id == experiment_id)
@@ -538,20 +489,20 @@ async def _run_async() -> int:
             sessions = result.scalars().all()
 
             # Get unique variant IDs
-            variant_ids = list({s.variant_id for s in sessions if s.variant_id})
+            variant_ids = list({s.variant_id for s in sessions})
 
             # Build variant data
             variants_data = []
             for vid in variant_ids:
                 # Get variant details
-                vstmt = select(VariantORM).where(VariantORM.variant_id == vid)  # type: ignore[attr-defined]
+                vstmt = select(VariantORM).where(VariantORM.id == vid)
                 vresult = db_session.execute(vstmt)
                 variant = vresult.scalar_one_or_none()
 
                 if variant:
                     variants_data.append(
                         {
-                            "variant_id": vid,
+                            "variant_id": str(vid),
                             "name": variant.name,
                             "session_count": sum(1 for s in sessions if s.variant_id == vid),
                         }
@@ -570,19 +521,6 @@ async def _run_async() -> int:
 
             return len(rollups)
 
-        except (OSError, ValueError) as err:
-            db_session.rollback()
-            console.print(f"[red]Error during experiment rollup computation: {err}[/red]")
-            raise typer.Exit(1) from err
-        except Exception as err:
-            db_session.rollback()
-            console.print(
-                f"[red]Unexpected error during experiment rollup computation: {err}[/red]"
-            )
-            raise typer.Exit(1) from err
-        finally:
-            db_session.close()
-
     count = asyncio.run(_run_async())
 
     console.print("\n[bold green]Experiment Rollup Computation Complete[/bold green]")
@@ -600,7 +538,7 @@ async def _fetch_litellm_requests(
     litellm_key: str,
     start_time: str | None,
     end_time: str | None,
-) -> list[dict]:
+) -> list[dict[str, object]]:
     """Fetch raw requests from LiteLLM spend logs endpoint."""
     headers = {
         "Authorization": f"Bearer {litellm_key}",
@@ -623,8 +561,7 @@ async def _fetch_litellm_requests(
         data = response.json()
 
         if isinstance(data, list):
-            return data
-        elif isinstance(data, dict) and "logs" in data:
-            return data["logs"]  # type: ignore[no-any-return]
-        else:
-            return []
+            return [item for item in data if isinstance(item, dict)]
+        if isinstance(data, dict) and isinstance(data.get("logs"), list):
+            return [item for item in data["logs"] if isinstance(item, dict)]
+        return []
diff --git a/src/cli/commands/normalize.py b/src/cli/commands/normalize.py
index 40fd0ce..4dc6ec1 100644
--- a/src/cli/commands/normalize.py
+++ b/src/cli/commands/normalize.py
@@ -8,7 +8,6 @@
 from rich.console import Console
 from rich.table import Table
 from sqlalchemy import func, select
-from sqlalchemy.orm import Session as SQLAlchemySession
 
 from benchmark_core.db.models import Request as RequestORM
 from benchmark_core.db.session import get_db_session
@@ -127,8 +126,7 @@ async def _run() -> tuple[int, ReconciliationReport]:
             return 0, report
 
         # Normal mode: write to database
-        db_session: SQLAlchemySession = get_db_session()  # type: ignore[assignment]
-        try:
+        with get_db_session() as db_session:
             repository = SQLRequestRepository(db_session)
             normalizer_job = RequestNormalizerJob(
                 repository=repository,
@@ -137,11 +135,6 @@ async def _run() -> tuple[int, ReconciliationReport]:
             written, report = await normalizer_job.run(raw_requests)
             db_session.commit()
             return len(written), report
-        except Exception:
-            db_session.rollback()
-            raise
-        finally:
-            db_session.close()
 
     try:
         count, report = asyncio.run(_run())
@@ -226,8 +219,7 @@ def show_reconciliation_report(
         raise typer.Exit(1) from err
 
     # Get request count for the session
-    db_session: SQLAlchemySession = get_db_session()  # type: ignore[assignment]
-    try:
+    with get_db_session() as db_session:
         # Query the actual count of normalized requests for this session
         stmt = select(func.count()).where(RequestORM.session_id == UUID(session_id))
         count = db_session.execute(stmt).scalar() or 0
@@ -266,16 +258,13 @@ def show_reconciliation_report(
         if error_count > 0:
             console.print(f"Requests with errors: {error_count}")
 
-    finally:
-        db_session.close()
-
 
 async def _fetch_litellm_requests(
     litellm_url: str,
     litellm_key: str,
     start_time: str | None,
     end_time: str | None,
-) -> list[dict]:
+) -> list[dict[str, object]]:
     """Fetch raw requests from LiteLLM spend logs endpoint."""
     headers = {
         "Authorization": f"Bearer {litellm_key}",
@@ -298,8 +287,7 @@ async def _fetch_litellm_requests(
         data = response.json()
 
         if isinstance(data, list):
-            return data
-        elif isinstance(data, dict) and "logs" in data:
-            return data["logs"]  # type: ignore[no-any-return]
-        else:
-            return []
+            return [item for item in data if isinstance(item, dict)]
+        if isinstance(data, dict) and isinstance(data.get("logs"), list):
+            return [item for item in data["logs"] if isinstance(item, dict)]
+        return []
diff --git a/tests/unit/test_diagnostics_service.py b/tests/unit/test_diagnostics_service.py
index 826acbf..8ca6b38 100644
--- a/tests/unit/test_diagnostics_service.py
+++ b/tests/unit/test_diagnostics_service.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 from unittest.mock import MagicMock, patch
 
+import httpx
 import pytest
 
 from benchmark_core.services.diagnostics_service import (
@@ -160,7 +161,7 @@ def test_diagnose_services_litellm_success(self, diagnostics_service):
 
         with (
             patch.dict(os.environ, {"LITELLM_MASTER_KEY": "sk-test"}),
-            patch("requests.get", side_effect=[mock_response, mock_models_response]),
+            patch("httpx.get", side_effect=[mock_response, mock_models_response]),
         ):
             diag = diagnostics_service._diagnose_litellm()
 
@@ -169,9 +170,13 @@ def test_diagnose_services_litellm_success(self, diagnostics_service):
 
     def test_diagnose_services_litellm_connection_error(self, diagnostics_service):
         """Test LiteLLM service diagnostics with connection error."""
-        import requests
-
-        with patch("requests.get", side_effect=requests.exceptions.ConnectionError()):
+        with patch(
+            "httpx.get",
+            side_effect=httpx.ConnectError(
+                "boom",
+                request=httpx.Request("GET", "http://localhost:4000/health/liveliness"),
+            ),
+        ):
             diag = diagnostics_service._diagnose_litellm()
 
         assert diag.status == "error"
@@ -187,7 +192,7 @@ def test_diagnose_services_prometheus_success(self, diagnostics_service):
             "data": {"activeTargets": [{"labels": {"job": "litellm"}}]}
         }
 
-        with patch("requests.get", side_effect=[mock_response, mock_targets_response]):
+        with patch("httpx.get", side_effect=[mock_response, mock_targets_response]):
             diag = diagnostics_service._diagnose_prometheus()
 
         assert diag.status == "ok"
@@ -195,9 +200,13 @@ def test_diagnose_services_prometheus_success(self, diagnostics_service):
 
     def test_diagnose_services_prometheus_connection_error(self, diagnostics_service):
         """Test Prometheus service diagnostics with connection error."""
-        import requests
-
-        with patch("requests.get", side_effect=requests.exceptions.ConnectionError()):
+        with patch(
+            "httpx.get",
+            side_effect=httpx.ConnectError(
+                "boom",
+                request=httpx.Request("GET", "http://localhost:9090/-/healthy"),
+            ),
+        ):
             diag = diagnostics_service._diagnose_prometheus()
 
         assert diag.status == "error"
@@ -214,7 +223,7 @@ def test_run_diagnostics(self, diagnostics_service, temp_configs_dir, tmp_path):
             mock_response.status_code = 200
             mock_response.json.return_value = {"data": []}
 
-            with patch("requests.get", return_value=mock_response):
+            with patch("httpx.get", return_value=mock_response):
                 report = service.run_diagnostics()
 
         assert isinstance(report, DiagnosticsReport)
diff --git a/tests/unit/test_health_service.py b/tests/unit/test_health_service.py
index d43d0b3..f4cea00 100644
--- a/tests/unit/test_health_service.py
+++ b/tests/unit/test_health_service.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 from unittest.mock import MagicMock, patch
 
+import httpx
 import pytest
 
 from benchmark_core.services.health_service import (
@@ -71,7 +72,7 @@ def test_check_litellm_proxy_success(self, health_service):
         mock_response.headers = {"content-type": "application/json"}
         mock_response.json.return_value = {"status": "healthy"}
 
-        with patch("requests.get", return_value=mock_response):
+        with patch("httpx.get", return_value=mock_response):
             result = health_service.check_litellm_proxy()
 
         assert result.status == HealthStatus.HEALTHY
@@ -79,9 +80,13 @@ def test_check_litellm_proxy_success(self, health_service):
 
     def test_check_litellm_proxy_connection_error(self, health_service):
         """Test LiteLLM proxy check with connection error."""
-        import requests
-
-        with patch("requests.get", side_effect=requests.exceptions.ConnectionError()):
+        with patch(
+            "httpx.get",
+            side_effect=httpx.ConnectError(
+                "boom",
+                request=httpx.Request("GET", "http://localhost:4000/health/liveliness"),
+            ),
+        ):
             result = health_service.check_litellm_proxy()
 
         assert result.status == HealthStatus.UNHEALTHY
@@ -90,9 +95,7 @@ def test_check_litellm_proxy_connection_error(self, health_service):
 
     def test_check_litellm_proxy_timeout(self, health_service):
         """Test LiteLLM proxy check with timeout."""
-        import requests
-
-        with patch("requests.get", side_effect=requests.exceptions.Timeout()):
+        with patch("httpx.get", side_effect=httpx.TimeoutException("boom")):
             result = health_service.check_litellm_proxy()
 
         assert result.status == HealthStatus.UNHEALTHY
@@ -103,7 +106,7 @@ def test_check_litellm_proxy_unhealthy_status(self, health_service):
         mock_response = MagicMock()
         mock_response.status_code = 503
 
-        with patch("requests.get", return_value=mock_response):
+        with patch("httpx.get", return_value=mock_response):
             result = health_service.check_litellm_proxy()
 
         assert result.status == HealthStatus.UNHEALTHY
@@ -114,7 +117,7 @@ def test_check_prometheus_success(self, health_service):
         mock_response = MagicMock()
         mock_response.status_code = 200
 
-        with patch("requests.get", return_value=mock_response):
+        with patch("httpx.get", return_value=mock_response):
             result = health_service.check_prometheus()
 
         assert result.status == HealthStatus.HEALTHY
@@ -122,9 +125,13 @@ def test_check_prometheus_success(self, health_service):
 
     def test_check_prometheus_connection_error(self, health_service):
         """Test Prometheus check with connection error."""
-        import requests
-
-        with patch("requests.get", side_effect=requests.exceptions.ConnectionError()):
+        with patch(
+            "httpx.get",
+            side_effect=httpx.ConnectError(
+                "boom",
+                request=httpx.Request("GET", "http://localhost:9090/-/healthy"),
+            ),
+        ):
             result = health_service.check_prometheus()
 
         assert result.status == HealthStatus.UNHEALTHY
@@ -183,7 +190,7 @@ def test_run_health_checks_all_healthy(self, health_service, temp_configs_dir, t
             mock_response.headers = {"content-type": "application/json"}
             mock_response.json.return_value = {"status": "healthy"}
 
-            with patch("requests.get", return_value=mock_response):
+            with patch("httpx.get", return_value=mock_response):
                 report = health_service.run_health_checks(temp_configs_dir)
 
         assert report.status == HealthStatus.HEALTHY
@@ -196,11 +203,15 @@ def test_run_health_checks_with_unhealthy(self, health_service, temp_configs_dir
         # Mock database to use SQLite
         db_path = tmp_path / "test.db"
 
-        import requests
-
         with (
             patch.dict(os.environ, {"DATABASE_URL": f"sqlite:///{db_path}"}),
-            patch("requests.get", side_effect=requests.exceptions.ConnectionError()),
+            patch(
+                "httpx.get",
+                side_effect=httpx.ConnectError(
+                    "boom",
+                    request=httpx.Request("GET", "http://localhost:4000/health/liveliness"),
+                ),
+            ),
         ):
             report = health_service.run_health_checks(temp_configs_dir)
 

From 5e311ee9185b926036a301fd905021cfce74cba0 Mon Sep 17 00:00:00 2001
From: Leonardo Gonzalez <leonardo.gonzalez@trilogy.com>
Date: Mon, 30 Mar 2026 09:33:29 -0500
Subject: [PATCH 12/12] chore: remove requests

---
 pyproject.toml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3d3ca13..c4dcd35 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,6 @@ dependencies = [
     "typer>=0.9.0",
     "rich>=13.0.0",
     "httpx>=0.25.0",
-    "requests>=2.31.0",
     "sqlalchemy>=2.0.0",
     "alembic>=1.12.0",
     "psycopg2-binary>=2.9.0",
@@ -36,7 +35,6 @@ dev = [
     "ruff>=0.1.0",
     "mypy>=1.7.0",
     "types-PyYAML>=6.0.0",
-    "types-requests>=2.31.0",
     "pytest>=7.4.0",
     "pytest-asyncio>=0.21.0",
     "pytest-cov>=4.1.0",