Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ test = [
"pytest-cov>=7.0.0,<8.0.0",
"pytest-asyncio>=1.0.0,<1.4.0",
"pytest-xdist>=3.0.0,<4.0.0",
"hypothesis>=6.0.0,<7.0.0",
]

dev = [
Expand All @@ -60,6 +61,7 @@ dependencies = [
"pytest-asyncio>=1.0.0,<1.4.0",
"pytest-xdist>=3.0.0,<4.0.0",
"moto>=5.1.0,<6.0.0",
"hypothesis>=6.0.0,<7.0.0",
]

[tool.hatch.envs.default.scripts]
Expand Down Expand Up @@ -152,6 +154,7 @@ dependencies = [
"pytest-asyncio>=1.0.0,<1.4.0", # This fixed the async support
"pytest-xdist>=3.0.0,<4.0.0",
"moto>=5.1.0,<6.0.0",
"hypothesis>=6.0.0,<7.0.0",
]
extra-dependencies = [
"hatch>=1.0.0,<2.0.0",
Expand Down
33 changes: 30 additions & 3 deletions src/strands_evals/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import logging
import os
import uuid
from collections.abc import Callable
from pathlib import Path
from typing import cast
Expand Down Expand Up @@ -99,13 +100,23 @@ def __init__(
self,
cases: list[Case[InputT, OutputT]] | None = None,
evaluators: list[Evaluator[InputT, OutputT]] | None = None,
name: str = "unnamed_experiment",
):
self._name = name
self._cases = cases or []
self._evaluators = evaluators or [Evaluator()]
self._tracer = get_tracer()
# self._logger = get_logger(__name__)

self._config_id = os.environ.get("EVALUATION_RESULTS_LOG_GROUP", "default-strands-evals")
@property
def name(self) -> str:
    """Name assigned to this experiment at construction time.

    Returns:
        The experiment's name string.
    """
    return self._name

@property
def cases(self) -> list[Case[InputT, OutputT]]:
Expand Down Expand Up @@ -260,14 +271,15 @@ async def _run_task_async(

return evaluation_context

async def _worker(self, queue: asyncio.Queue, task: Callable, results: list):
async def _worker(self, queue: asyncio.Queue, task: Callable, results: list, run_id: str):
"""
Worker that processes cases from the queue. Run evaluation on the task.

Args:
queue: Queue containing cases to process
task: Task function to run on each case
results: List to store results
run_id: Unique identifier for this evaluation run
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this the same as session.id?

"""
while True:
try:
Expand Down Expand Up @@ -305,6 +317,10 @@ async def _run_task_with_retry(task=task, case=case):
"gen_ai.evaluation.data.has_interactions": (
evaluation_context.actual_interactions is not None
),
"test.suite.name": self._name,
"test.suite.run.id": run_id,
"test.case.name": case_name,
"test.case.id": case.session_id,
}
)
trace_id = format_trace_id(case_span.get_span_context().trace_id)
Expand Down Expand Up @@ -356,6 +372,7 @@ async def _evaluate_with_retry(evaluator=evaluator, evaluation_context=evaluatio
"gen_ai.evaluation.score.value": str(aggregate_score),
"gen_ai.evaluation.test_pass": aggregate_pass,
"gen_ai.evaluation.explanation": aggregate_reason or "",
"test.case.result.status": "pass" if aggregate_pass else "fail",
}
)

Expand Down Expand Up @@ -483,6 +500,8 @@ def run_evaluations(
A list of EvaluationReport objects, one for each evaluator, containing the overall score,
individual case results, and basic feedback for each test case.
"""
run_id = str(uuid.uuid4())

evaluator_data: dict[str, dict[str, list]] = {
evaluator.get_type_name(): {
"scores": [],
Expand All @@ -502,6 +521,10 @@ def run_evaluations(
attributes={
"gen_ai.evaluation.case.name": case_name,
"gen_ai.evaluation.case.input": serialize(case.input),
"test.suite.name": self._name,
"test.suite.run.id": run_id,
"test.case.name": case_name,
"test.case.id": case.session_id,
},
) as case_span:
# Task execution with retry logic
Expand Down Expand Up @@ -605,6 +628,7 @@ def _evaluate_with_retry(evaluator=evaluator, evaluation_context=evaluation_cont
"gen_ai.evaluation.score.value": aggregate_score,
"gen_ai.evaluation.test_pass": aggregate_pass,
"gen_ai.evaluation.explanation": aggregate_reason or "",
"test.case.result.status": "pass" if aggregate_pass else "fail",
}
)

Expand Down Expand Up @@ -679,13 +703,14 @@ async def run_evaluations_async(self, task: Callable, max_workers: int = 10) ->
"""
queue: asyncio.Queue[Case[InputT, OutputT]] = asyncio.Queue()
results: list[Any] = []
run_id = str(uuid.uuid4())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here


for case in self._cases:
queue.put_nowait(case)

num_workers = min(max_workers, len(self._cases))

workers = [asyncio.create_task(self._worker(queue, task, results)) for _ in range(num_workers)]
workers = [asyncio.create_task(self._worker(queue, task, results, run_id)) for _ in range(num_workers)]

await queue.join()
for worker in workers:
Expand Down Expand Up @@ -739,6 +764,7 @@ def to_dict(self) -> dict:
A dictionary representation of the experiment.
"""
return {
"name": self._name,
"cases": [case.model_dump() for case in self._cases],
"evaluators": [evaluator.to_dict() for evaluator in self._evaluators],
}
Expand Down Expand Up @@ -787,6 +813,7 @@ def from_dict(cls, data: dict, custom_evaluators: list[type[Evaluator]] | None =
Return:
An Experiment object.
"""
name = data.get("name", "unnamed_experiment")
custom_evaluators = custom_evaluators or []
cases: list[Case] = [Case.model_validate(case_data) for case_data in data["cases"]]
default_evaluators: dict[str, type[Evaluator]] = {
Expand Down Expand Up @@ -817,7 +844,7 @@ def from_dict(cls, data: dict, custom_evaluators: list[type[Evaluator]] | None =
f"all relevant custom evaluators are passed in."
)

return cls(cases=cases, evaluators=evaluators)
return cls(cases=cases, evaluators=evaluators, name=name)

@classmethod
def from_file(cls, path: str, custom_evaluators: list[type[Evaluator]] | None = None):
Expand Down
11 changes: 10 additions & 1 deletion tests/strands_evals/test_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ def counting_task(c):
def test_experiment_to_dict_empty(mock_evaluator):
"""Test converting empty experiment to dictionary"""
experiment = Experiment(cases=[], evaluators=[mock_evaluator])
assert experiment.to_dict() == {"cases": [], "evaluators": [{"evaluator_type": "MockEvaluator"}]}
assert experiment.to_dict() == {"name": "unnamed_experiment", "cases": [], "evaluators": [{"evaluator_type": "MockEvaluator"}]}


def test_experiment_to_dict_non_empty(mock_evaluator):
Expand All @@ -350,6 +350,7 @@ def test_experiment_to_dict_non_empty(mock_evaluator):
experiment = Experiment(cases=cases, evaluators=[mock_evaluator])
session_id = experiment.cases[0].session_id
assert experiment.to_dict() == {
"name": "unnamed_experiment",
"cases": [
{
"name": "test",
Expand Down Expand Up @@ -380,6 +381,7 @@ def test_experiment_to_dict_OutputEvaluator_full():
experiment = Experiment(cases=cases, evaluators=[evaluator])
session_id = experiment.cases[0].session_id
assert experiment.to_dict() == {
"name": "unnamed_experiment",
"cases": [
{
"name": "test",
Expand Down Expand Up @@ -412,6 +414,7 @@ def test_experiment_to_dict_OutputEvaluator_default():
session_id = experiment.cases[0].session_id
result = experiment.to_dict()
assert result == {
"name": "unnamed_experiment",
"cases": [
{
"name": "test",
Expand All @@ -435,6 +438,7 @@ def test_experiment_to_dict_TrajectoryEvaluator_default():
experiment = Experiment(cases=cases, evaluators=[evaluator])
session_id = experiment.cases[0].session_id
assert experiment.to_dict() == {
"name": "unnamed_experiment",
"cases": [
{
"name": "test",
Expand Down Expand Up @@ -463,6 +467,7 @@ def test_experiment_to_dict_TrajectoryEvaluator_full():
experiment = Experiment(cases=cases, evaluators=[evaluator])
session_id = experiment.cases[0].session_id
assert experiment.to_dict() == {
"name": "unnamed_experiment",
"cases": [
{
"name": "test",
Expand Down Expand Up @@ -494,6 +499,7 @@ def test_experiment_to_dict_InteractionsEvaluator_default():
experiment = Experiment(cases=cases, evaluators=[evaluator])
session_id = experiment.cases[0].session_id
assert experiment.to_dict() == {
"name": "unnamed_experiment",
"cases": [
{
"name": "test",
Expand Down Expand Up @@ -525,6 +531,7 @@ def test_experiment_to_dict_InteractionsEvaluator_full():
experiment = Experiment(cases=cases, evaluators=[evaluator])
session_id = experiment.cases[0].session_id
assert experiment.to_dict() == {
"name": "unnamed_experiment",
"cases": [
{
"name": "test",
Expand Down Expand Up @@ -555,6 +562,7 @@ def test_experiment_to_dict_case_dict():
experiment = Experiment(cases=[case], evaluators=[evaluator])
session_id = experiment.cases[0].session_id
assert experiment.to_dict() == {
"name": "unnamed_experiment",
"cases": [
{
"name": "test",
Expand All @@ -581,6 +589,7 @@ def simple_echo(query):
experiment = Experiment(cases=[case], evaluators=[evaluator])
session_id = experiment.cases[0].session_id
assert experiment.to_dict() == {
"name": "unnamed_experiment",
"cases": [
{
"name": "test",
Expand Down
Loading
Loading