25 changes: 25 additions & 0 deletions CLAUDE.md
@@ -0,0 +1,25 @@
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Commands
- Install dependencies: `uv sync`
- Run tests: `uv run pytest`
- Run specific test: `uv run pytest tests/path/to/test_file.py::test_name -v`
- Run all example tests: `uv run pytest --all`
- Type check: `uv run mypy src tests examples/team_recommender/src`
- Lint: `uv run ruff check src tests examples`
- Format: `uv run ruff format src tests examples`

## Code Style
- Python 3.13+ required
- Use type annotations for all functions and methods (checked by mypy)
- Max line length: 120 characters
- Use pytest fixtures in `conftest.py` for test setup
- Follow black formatting conventions
- Import order: stdlib, third-party, local
- Use proper error handling with try/except blocks
- Use snake_case for functions, variables, and modules
- Use PascalCase for class names
- Maintain test coverage for all new code
- Use `CAT_AI_SAMPLE_SIZE` environment variable for test iterations
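
The two conventions above most likely to trip up new tests are the shared fixtures in `tests/conftest.py` and the `CAT_AI_SAMPLE_SIZE` variable. A minimal sketch of a test that uses both, with an illustrative test name; it assumes the `tmp_reporter` fixture added in `tests/conftest.py` below and mirrors the pattern in `tests/test_runner.py`:

```python
# Illustrative sketch only, not part of the diff. It relies on Runner.run_multiple
# reading CAT_AI_SAMPLE_SIZE when no explicit sample_size is passed, as the
# tests in tests/test_runner.py do.
from cat_ai.runner import Runner


def test_example_respects_sample_size(monkeypatch, tmp_reporter) -> None:
    monkeypatch.setenv("CAT_AI_SAMPLE_SIZE", "4")
    runner = Runner(test_function=lambda _: True, reporter=tmp_reporter)
    results = runner.run_multiple()  # no argument, so the env variable applies
    assert len(results) == 4
```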
1 change: 1 addition & 0 deletions pyproject.toml
@@ -29,6 +29,7 @@ test = [
]
examples = ["openai>=1.63.2,<2", "python-dotenv>=1.0.1,<2"]
dev = [
"ipython>=9.0.0",
"sphinx>=8.1.3,<9",
"sphinx-rtd-theme>=3.0.2,<4",
"sphinx-markdown-builder>=0.6.8,<0.7",
90 changes: 90 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,90 @@
import csv
import io
import os
from typing import Callable, Generator, Optional

import matplotlib
import pytest

from cat_ai.helpers.helpers import root_dir
from cat_ai.reporter import Reporter
from cat_ai.statistical_analysis import StatisticalAnalysis, analyse_measure_from_test_sample


@pytest.fixture
def test_name(request: pytest.FixtureRequest) -> str:
return str(request.node.name)


@pytest.fixture
def reporter_factory(test_name: str) -> Callable:
"""Factory fixture for creating Reporter instances with default settings."""

def _create_reporter(
unique_id: Optional[str] = None,
) -> Reporter:
return Reporter(test_name=test_name, output_dir=root_dir(), unique_id=unique_id)

return _create_reporter


@pytest.fixture
def tmp_reporter() -> "Reporter":
"""Creates a reporter that writes to /tmp."""
return Reporter(test_name="test_fixture", output_dir="/tmp")


@pytest.fixture
def analyze_failure_rate() -> Callable:
"""Helper fixture to analyze failure rates."""

def _analyze(failure_count: int, sample_size: int) -> StatisticalAnalysis:
return analyse_measure_from_test_sample(failure_count, sample_size)

return _analyze


def export_results_to_csv(results: list[StatisticalAnalysis]) -> str:
output = io.StringIO(newline="\n")
writer = csv.writer(output, lineterminator="\n")

# Write header
writer.writerow(StatisticalAnalysis.get_csv_headers())

# Write rows
for result in results:
writer.writerow(result.as_csv_row())

return output.getvalue()


@pytest.fixture
def configure_matplotlib() -> Generator[None, None, None]:
"""Configure matplotlib for consistent snapshot testing."""
matplotlib.use("Agg") # Force CPU-based renderer

# Configure for deterministic rendering
matplotlib.rcParams.update(
{
"figure.max_open_warning": 0,
"svg.hashsalt": "matplotlib",
"figure.dpi": 100,
"savefig.dpi": 100,
"path.simplify": False,
"agg.path.chunksize": 0,
"pdf.fonttype": 42, # Ensures text is stored as text, not paths
"ps.fonttype": 42,
}
)

yield

# Clean up any open figures
import matplotlib.pyplot as plt

plt.close("all")


def running_in_ci() -> bool:
"""Check if tests are running in CI environment."""
return os.getenv("CI") is not None
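
`export_results_to_csv` above is a plain helper rather than a fixture, so tests have to call it directly. A rough usage sketch, under the assumption that the helper is importable where the test lives; the test name and sample numbers are illustrative:

```python
# Illustrative sketch only, not part of the diff; assumes export_results_to_csv
# is importable from the test module's conftest.
def test_csv_export_has_header_and_rows(analyze_failure_rate) -> None:
    analyses = [analyze_failure_rate(failures, 100) for failures in (3, 6, 9)]
    csv_text = export_results_to_csv(analyses)
    header, *rows = csv_text.splitlines()
    assert header  # column names come from StatisticalAnalysis.get_csv_headers()
    assert len(rows) == 3  # one CSV row per analysis
```

`configure_matplotlib` is consumed like any other fixture: a plotting test lists it as a parameter and gets the Agg backend plus the deterministic rcParams set above for the duration of the test.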
@@ -0,0 +1,11 @@
{
"test_name": "test_report_creates_correct_json",
"folder_path": "/tmp/test_runs/test_report_creates_correct_json-20231001_120000",
"output_file": "fail-0.json",
"metadata_path": "/tmp/test_runs/test_report_creates_correct_json-20231001_120000/metadata.json",
"validations": {
"can-talk": true,
"can-think": false
},
"response": "Alice is the oldest."
}
75 changes: 43 additions & 32 deletions tests/test_reporter.py
@@ -1,60 +1,71 @@
import json
import os
import time
from unittest.mock import MagicMock, mock_open, patch
from pathlib import Path
from typing import Any, Callable

from cat_ai.helpers.helpers import root_dir
from cat_ai.reporter import Reporter
from cat_ai.statistical_analysis import analyse_measure_from_test_sample


def test_reporter_creates_a_unique_folder_path() -> None:
test_name = "unique_folder_path"
reporter1 = Reporter(test_name=test_name, output_dir=root_dir())
expected_dir_path = f"{root_dir()}/test_runs/{test_name}"
def test_reporter_creates_a_unique_folder_path(reporter_factory: Callable) -> None:
reporter1 = reporter_factory()
expected_dir_path = f"{root_dir()}/test_runs/test_reporter_creates_a_unique_folder_path"
assert expected_dir_path in reporter1.folder_path

time.sleep(2)
reporter2 = Reporter(test_name=test_name, output_dir=root_dir())
reporter2 = reporter_factory()
assert str(reporter1.folder_path) != str(reporter2.folder_path)


def test_reporter_can_accept_unique_id_override() -> None:
test_name = "example_test"
def test_reporter_can_accept_unique_id_override(reporter_factory: Callable) -> None:
unique_id = "timestamp_or_any_unique_id"
reporter1 = Reporter(test_name=test_name, output_dir=root_dir(), unique_id=unique_id)
expected_dir_path = f"{root_dir()}/test_runs/{test_name}-{unique_id}"
assert str(expected_dir_path) == str(reporter1.folder_path)
reporter = reporter_factory(unique_id=unique_id)

expected_dir_path = (
f"{root_dir()}/test_runs/test_reporter_can_accept_unique_id_override-{unique_id}"
)
assert str(expected_dir_path) == str(reporter.folder_path)


@patch("os.makedirs")
@patch("builtins.open", new_callable=mock_open)
def test_report_creates_correct_json(mock_open: MagicMock, mock_makedirs: MagicMock) -> None:
test_name = "report_creates_correct_json"
def test_report_creates_correct_json(test_name: str, snapshot: Any) -> None:
temp_dir = "/tmp"
unique_id = "20231001_120000"
reporter = Reporter(test_name=test_name, output_dir=root_dir(), unique_id=unique_id)
metadata = {"ai-model": "champion-1"}
reporter = Reporter(
test_name=test_name,
output_dir=temp_dir,
unique_id=unique_id,
metadata=metadata,
)

response = "Sample response"
results = {"test1": True, "test2": False}
# Generate test data
response = "Alice is the oldest."
results = {"can-talk": True, "can-think": False}

# Call report method
final_result = reporter.report(response, results)

# Verify return value (should be False because not all results are True)
assert final_result is False
expected_metadata = {
"test_name": test_name,
"folder_path": f"{root_dir()}/test_runs/{test_name}-{unique_id}",
"output_file": "fail-0.json",
"metadata_path": f"{root_dir()}/test_runs/{test_name}-{unique_id}/metadata.json",
"validations": results,
"response": response,
}
expected_json_string = json.dumps(expected_metadata, indent=4)

mock_makedirs.assert_called_once_with(reporter.folder_path, exist_ok=True)
# Expected output paths
expected_dir_path = Path(temp_dir) / "test_runs" / (test_name + "-" + unique_id)
expected_metadata_path = expected_dir_path / "metadata.json"
with open(expected_metadata_path, "r") as file:
contents = json.load(file)
assert contents == metadata
expected_output_path = expected_dir_path / "fail-0.json"
assert os.path.isfile(expected_metadata_path)
assert os.path.isfile(expected_output_path)

mock_open().write.assert_called_with(expected_json_string)
with open(expected_output_path, "r") as file:
content = json.load(file)
snapshot.assert_match(json.dumps(content, indent=2), "expected_report.json")


def test_format_summary_with_failure_analysis():
failure_analysis = analyse_measure_from_test_sample(6, 100)
def test_format_summary_with_failure_analysis(analyze_failure_rate: Callable) -> None:
failure_analysis = analyze_failure_rate(6, 100)
assert Reporter.format_summary(failure_analysis) == (
"> [!NOTE]\n"
"> ## 6 ± 3 failures detected (100 samples)\n"
47 changes: 17 additions & 30 deletions tests/test_runner.py
@@ -1,12 +1,6 @@
from cat_ai.reporter import Reporter
from cat_ai.runner import Runner

import pytest

# Dummy test function that will be passed to Runner
def dummy_test_function(reporter: Reporter) -> bool:
# Imagine that this function does something meaningful
# Simply returning True instead of trying to log
return True
from cat_ai.runner import Runner


def test_runner_sample_size(monkeypatch):
@@ -19,46 +13,39 @@ def test_runner_sample_size(monkeypatch):
assert Runner.get_sample_size(default_size=3) == 3


def test_run_once():
# Create a Reporter with necessary arguments
reporter = Reporter(test_name="test_run_once", output_dir="/tmp")

@pytest.mark.parametrize("return_value", [True, False])
def test_run_once(tmp_reporter, return_value):
# Initialize Runner with dummy test function and Reporter
runner = Runner(test_function=dummy_test_function, reporter=reporter)
runner = Runner(test_function=lambda x: return_value, reporter=tmp_reporter)

# Test run_once
result = runner.run_once()
assert result is True
assert reporter.run_number == 0

assert result is return_value
assert tmp_reporter.run_number == 0

def test_run_multiple():
# Create a Reporter with necessary arguments
reporter = Reporter(test_name="test_run", output_dir="/tmp")

@pytest.mark.parametrize("return_value", [True, False])
def test_run_multiple(tmp_reporter, return_value):
# Initialize Runner with dummy test function and Reporter
runner = Runner(test_function=dummy_test_function, reporter=reporter)
runner = Runner(test_function=lambda _: return_value, reporter=tmp_reporter)

# Test with explicit sample size parameter
results = runner.run_multiple(sample_size=2)
assert len(results) == 2
assert all(results)
expected_results = [True, True]
expected_results = [return_value, return_value]
assert results == expected_results


def test_run_with_env_variable(monkeypatch):
@pytest.mark.parametrize("sample_size", [3, 5])
def test_run_with_env_variable(monkeypatch, tmp_reporter, sample_size):
# Set the environment variable for a controlled test
monkeypatch.setenv("CAT_AI_SAMPLE_SIZE", "3")

# Create a Reporter with necessary arguments
reporter = Reporter(test_name="test_run_with_env", output_dir="/tmp")
monkeypatch.setenv("CAT_AI_SAMPLE_SIZE", str(sample_size))

# Initialize Runner with dummy test function and Reporter
runner = Runner(test_function=dummy_test_function, reporter=reporter)
runner = Runner(test_function=lambda x: True, reporter=tmp_reporter)

# Test without explicit sample size (should use environment variable)
results = runner.run_multiple()
assert len(results) == 3
expected_results = [True, True, True]
assert len(results) == sample_size
expected_results = [True] * sample_size
assert results == expected_results
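
Taken together, these tests pin down how the sample size is resolved: an explicit `sample_size` argument is honoured, `CAT_AI_SAMPLE_SIZE` is used when none is passed, and a default applies otherwise. A rough orientation sketch of that resolution order only; the real logic lives in `cat_ai.runner` and may differ in detail (the tests do not establish precedence when both an explicit argument and the env variable are set):

```python
# Orientation sketch only, not the cat_ai implementation.
import os


def resolve_sample_size(explicit: int | None, default_size: int) -> int:
    if explicit is not None:  # run_multiple(sample_size=2)
        return explicit
    env_value = os.getenv("CAT_AI_SAMPLE_SIZE")
    if env_value:  # test_run_with_env_variable sets this via monkeypatch
        return int(env_value)
    return default_size  # Runner.get_sample_size(default_size=3) == 3
```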