25 changes: 25 additions & 0 deletions CLAUDE.md
@@ -0,0 +1,25 @@
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Commands
- Install dependencies: `uv sync`
- Run tests: `uv run pytest`
- Run specific test: `uv run pytest tests/path/to/test_file.py::test_name -v`
- Run all example tests: `uv run pytest --all`
- Type check: `uv run mypy src tests examples/team_recommender/src`
- Lint: `uv run ruff check src tests examples`
- Format: `uv run ruff format src tests examples`

## Code Style
- Python 3.13+ required
- Use type annotations for all functions and methods (checked by mypy)
- Max line length: 120 characters
- Use pytest fixtures in `conftest.py` for test setup
- Follow black formatting conventions
- Import order: stdlib, third-party, local
- Use proper error handling with try/except blocks
- Use snake_case for functions, variables, and modules
- Use PascalCase for class names
- Maintain test coverage for all new code
- Use `CAT_AI_SAMPLE_SIZE` environment variable for test iterations
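
The two conventions above most likely to trip up new tests are the shared fixtures in `tests/conftest.py` and the `CAT_AI_SAMPLE_SIZE` variable. A minimal sketch of a test that uses both, with an illustrative test name; it assumes the `tmp_reporter` fixture added in `tests/conftest.py` below and mirrors the pattern in `tests/test_runner.py`:

```python
# Illustrative sketch only, not part of the diff. It relies on Runner.run_multiple
# reading CAT_AI_SAMPLE_SIZE when no explicit sample_size is passed, as the
# tests in tests/test_runner.py do.
from cat_ai.runner import Runner


def test_example_respects_sample_size(monkeypatch, tmp_reporter) -> None:
    monkeypatch.setenv("CAT_AI_SAMPLE_SIZE", "4")
    runner = Runner(test_function=lambda _: True, reporter=tmp_reporter)
    results = runner.run_multiple()  # no argument, so the env variable applies
    assert len(results) == 4
```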
1 change: 1 addition & 0 deletions pyproject.toml
@@ -29,6 +29,7 @@ test = [
]
examples = ["openai>=1.63.2,<2", "python-dotenv>=1.0.1,<2"]
dev = [
"ipython>=9.0.0",
"sphinx>=8.1.3,<9",
"sphinx-rtd-theme>=3.0.2,<4",
"sphinx-markdown-builder>=0.6.8,<0.7",
90 changes: 90 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,90 @@
import csv
import io
import os
from typing import Callable, Generator, Optional

import matplotlib
import pytest

from cat_ai.helpers.helpers import root_dir
from cat_ai.reporter import Reporter
from cat_ai.statistical_analysis import StatisticalAnalysis, analyse_measure_from_test_sample


@pytest.fixture
def test_name(request: pytest.FixtureRequest) -> str:
return str(request.node.name)


@pytest.fixture
def reporter_factory(test_name: str) -> Callable:
"""Factory fixture for creating Reporter instances with default settings."""

def _create_reporter(
unique_id: Optional[str] = None,
) -> Reporter:
return Reporter(test_name=test_name, output_dir=root_dir(), unique_id=unique_id)

return _create_reporter


@pytest.fixture
def tmp_reporter() -> "Reporter":
"""Creates a reporter that writes to /tmp."""
return Reporter(test_name="test_fixture", output_dir="/tmp")


@pytest.fixture
def analyze_failure_rate() -> Callable:
"""Helper fixture to analyze failure rates."""

def _analyze(failure_count: int, sample_size: int) -> StatisticalAnalysis:
return analyse_measure_from_test_sample(failure_count, sample_size)

return _analyze


def export_results_to_csv(results: list[StatisticalAnalysis]) -> str:
output = io.StringIO(newline="\n")
writer = csv.writer(output, lineterminator="\n")

# Write header
writer.writerow(StatisticalAnalysis.get_csv_headers())

# Write rows
for result in results:
writer.writerow(result.as_csv_row())

return output.getvalue()


@pytest.fixture
def configure_matplotlib() -> Generator[None, None, None]:
"""Configure matplotlib for consistent snapshot testing."""
matplotlib.use("Agg") # Force CPU-based renderer

# Configure for deterministic rendering
matplotlib.rcParams.update(
{
"figure.max_open_warning": 0,
"svg.hashsalt": "matplotlib",
"figure.dpi": 100,
"savefig.dpi": 100,
"path.simplify": False,
"agg.path.chunksize": 0,
"pdf.fonttype": 42, # Ensures text is stored as text, not paths
"ps.fonttype": 42,
}
)

yield

# Clean up any open figures
import matplotlib.pyplot as plt

plt.close("all")


def running_in_ci() -> bool:
"""Check if tests are running in CI environment."""
return os.getenv("CI") is not None
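
`export_results_to_csv` above is a plain helper rather than a fixture, so tests have to call it directly. A rough usage sketch, under the assumption that the helper is importable where the test lives; the test name and sample numbers are illustrative:

```python
# Illustrative sketch only, not part of the diff; assumes export_results_to_csv
# is importable from the test module's conftest.
def test_csv_export_has_header_and_rows(analyze_failure_rate) -> None:
    analyses = [analyze_failure_rate(failures, 100) for failures in (3, 6, 9)]
    csv_text = export_results_to_csv(analyses)
    header, *rows = csv_text.splitlines()
    assert header  # column names come from StatisticalAnalysis.get_csv_headers()
    assert len(rows) == 3  # one CSV row per analysis
```

`configure_matplotlib` is consumed like any other fixture: a plotting test lists it as a parameter and gets the Agg backend plus the deterministic rcParams set above for the duration of the test.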
@@ -0,0 +1,11 @@
{
"test_name": "test_report_creates_correct_json",
"folder_path": "/tmp/test_runs/test_report_creates_correct_json-20231001_120000",
"output_file": "fail-0.json",
"metadata_path": "/tmp/test_runs/test_report_creates_correct_json-20231001_120000/metadata.json",
"validations": {
"can-talk": true,
"can-think": false
},
"response": "Alice is the oldest."
}
75 changes: 43 additions & 32 deletions tests/test_reporter.py
@@ -1,60 +1,71 @@
import json
import os
import time
from unittest.mock import MagicMock, mock_open, patch
from pathlib import Path
from typing import Any, Callable

from cat_ai.helpers.helpers import root_dir
from cat_ai.reporter import Reporter
from cat_ai.statistical_analysis import analyse_measure_from_test_sample


def test_reporter_creates_a_unique_folder_path() -> None:
test_name = "unique_folder_path"
reporter1 = Reporter(test_name=test_name, output_dir=root_dir())
expected_dir_path = f"{root_dir()}/test_runs/{test_name}"
def test_reporter_creates_a_unique_folder_path(reporter_factory: Callable) -> None:
reporter1 = reporter_factory()
expected_dir_path = f"{root_dir()}/test_runs/test_reporter_creates_a_unique_folder_path"
assert expected_dir_path in reporter1.folder_path

time.sleep(2)
reporter2 = Reporter(test_name=test_name, output_dir=root_dir())
reporter2 = reporter_factory()
assert str(reporter1.folder_path) != str(reporter2.folder_path)


def test_reporter_can_accept_unique_id_override() -> None:
test_name = "example_test"
def test_reporter_can_accept_unique_id_override(reporter_factory: Callable) -> None:
unique_id = "timestamp_or_any_unique_id"
reporter1 = Reporter(test_name=test_name, output_dir=root_dir(), unique_id=unique_id)
expected_dir_path = f"{root_dir()}/test_runs/{test_name}-{unique_id}"
assert str(expected_dir_path) == str(reporter1.folder_path)
reporter = reporter_factory(unique_id=unique_id)

expected_dir_path = (
f"{root_dir()}/test_runs/test_reporter_can_accept_unique_id_override-{unique_id}"
)
assert str(expected_dir_path) == str(reporter.folder_path)


@patch("os.makedirs")
@patch("builtins.open", new_callable=mock_open)
def test_report_creates_correct_json(mock_open: MagicMock, mock_makedirs: MagicMock) -> None:
test_name = "report_creates_correct_json"
def test_report_creates_correct_json(test_name: str, snapshot: Any) -> None:
temp_dir = "/tmp"
unique_id = "20231001_120000"
reporter = Reporter(test_name=test_name, output_dir=root_dir(), unique_id=unique_id)
metadata = {"ai-model": "champion-1"}
reporter = Reporter(
test_name=test_name,
output_dir=temp_dir,
unique_id=unique_id,
metadata=metadata,
)

response = "Sample response"
results = {"test1": True, "test2": False}
# Generate test data
response = "Alice is the oldest."
results = {"can-talk": True, "can-think": False}

# Call report method
final_result = reporter.report(response, results)

# Verify return value (should be False because not all results are True)
assert final_result is False
expected_metadata = {
"test_name": test_name,
"folder_path": f"{root_dir()}/test_runs/{test_name}-{unique_id}",
"output_file": "fail-0.json",
"metadata_path": f"{root_dir()}/test_runs/{test_name}-{unique_id}/metadata.json",
"validations": results,
"response": response,
}
expected_json_string = json.dumps(expected_metadata, indent=4)

mock_makedirs.assert_called_once_with(reporter.folder_path, exist_ok=True)
# Expected output paths
expected_dir_path = Path(temp_dir) / "test_runs" / (test_name + "-" + unique_id)
expected_metadata_path = expected_dir_path / "metadata.json"
with open(expected_metadata_path, "r") as file:
contents = json.load(file)
assert contents == metadata
expected_output_path = expected_dir_path / "fail-0.json"
assert os.path.isfile(expected_metadata_path)
assert os.path.isfile(expected_output_path)

mock_open().write.assert_called_with(expected_json_string)
with open(expected_output_path, "r") as file:
content = json.load(file)
snapshot.assert_match(json.dumps(content, indent=2), "expected_report.json")


def test_format_summary_with_failure_analysis():
failure_analysis = analyse_measure_from_test_sample(6, 100)
def test_format_summary_with_failure_analysis(analyze_failure_rate: Callable) -> None:
failure_analysis = analyze_failure_rate(6, 100)
assert Reporter.format_summary(failure_analysis) == (
"> [!NOTE]\n"
"> ## 6 ± 3 failures detected (100 samples)\n"
47 changes: 17 additions & 30 deletions tests/test_runner.py
@@ -1,12 +1,6 @@
from cat_ai.reporter import Reporter
from cat_ai.runner import Runner

import pytest

# Dummy test function that will be passed to Runner
def dummy_test_function(reporter: Reporter) -> bool:
# Imagine that this function does something meaningful
# Simply returning True instead of trying to log
return True
from cat_ai.runner import Runner


def test_runner_sample_size(monkeypatch):
@@ -19,46 +13,39 @@ def test_runner_sample_size(monkeypatch):
assert Runner.get_sample_size(default_size=3) == 3


def test_run_once():
# Create a Reporter with necessary arguments
reporter = Reporter(test_name="test_run_once", output_dir="/tmp")

@pytest.mark.parametrize("return_value", [True, False])
def test_run_once(tmp_reporter, return_value):
# Initialize Runner with dummy test function and Reporter
runner = Runner(test_function=dummy_test_function, reporter=reporter)
runner = Runner(test_function=lambda x: return_value, reporter=tmp_reporter)

# Test run_once
result = runner.run_once()
assert result is True
assert reporter.run_number == 0

assert result is return_value
assert tmp_reporter.run_number == 0

def test_run_multiple():
# Create a Reporter with necessary arguments
reporter = Reporter(test_name="test_run", output_dir="/tmp")

@pytest.mark.parametrize("return_value", [True, False])
def test_run_multiple(tmp_reporter, return_value):
# Initialize Runner with dummy test function and Reporter
runner = Runner(test_function=dummy_test_function, reporter=reporter)
runner = Runner(test_function=lambda _: return_value, reporter=tmp_reporter)

# Test with explicit sample size parameter
results = runner.run_multiple(sample_size=2)
assert len(results) == 2
assert all(results)
expected_results = [True, True]
expected_results = [return_value, return_value]
assert results == expected_results


def test_run_with_env_variable(monkeypatch):
@pytest.mark.parametrize("sample_size", [3, 5])
def test_run_with_env_variable(monkeypatch, tmp_reporter, sample_size):
# Set the environment variable for a controlled test
monkeypatch.setenv("CAT_AI_SAMPLE_SIZE", "3")

# Create a Reporter with necessary arguments
reporter = Reporter(test_name="test_run_with_env", output_dir="/tmp")
monkeypatch.setenv("CAT_AI_SAMPLE_SIZE", str(sample_size))

# Initialize Runner with dummy test function and Reporter
runner = Runner(test_function=dummy_test_function, reporter=reporter)
runner = Runner(test_function=lambda x: True, reporter=tmp_reporter)

# Test without explicit sample size (should use environment variable)
results = runner.run_multiple()
assert len(results) == 3
expected_results = [True, True, True]
assert len(results) == sample_size
expected_results = [True] * sample_size
assert results == expected_results
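
Taken together, these tests pin down how the sample size is resolved: an explicit `sample_size` argument is honoured, `CAT_AI_SAMPLE_SIZE` is used when none is passed, and a default applies otherwise. A rough orientation sketch of that resolution order only; the real logic lives in `cat_ai.runner` and may differ in detail (the tests do not establish precedence when both an explicit argument and the env variable are set):

```python
# Orientation sketch only, not the cat_ai implementation.
import os


def resolve_sample_size(explicit: int | None, default_size: int) -> int:
    if explicit is not None:  # run_multiple(sample_size=2)
        return explicit
    env_value = os.getenv("CAT_AI_SAMPLE_SIZE")
    if env_value:  # test_run_with_env_variable sets this via monkeypatch
        return int(env_value)
    return default_size  # Runner.get_sample_size(default_size=3) == 3
```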