From 99089be6a886869b373fcd7e1dbb8ba0b3010d8f Mon Sep 17 00:00:00 2001 From: Darren Wang Date: Tue, 9 Jun 2026 16:16:15 +0000 Subject: [PATCH 1/6] docs: add chaos testing and resilience evaluator example scripts --- .../chaos_failure_communication_evaluator.py | 120 ++++++++++++ .../chaos_partial_completion_evaluator.py | 137 ++++++++++++++ .../chaos_recovery_strategy_evaluator.py | 133 ++++++++++++++ site/docs/examples/evals-sdk/chaos_testing.py | 171 ++++++++++++++++++ 4 files changed, 561 insertions(+) create mode 100644 site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py create mode 100644 site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py create mode 100644 site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py create mode 100644 site/docs/examples/evals-sdk/chaos_testing.py diff --git a/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py b/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py new file mode 100644 index 0000000000..ea48b1504f --- /dev/null +++ b/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py @@ -0,0 +1,120 @@ +import logging +from typing import Any + +from pydantic import BaseModel, Field + +from strands import Agent +from strands_evals import StrandsEvalsTelemetry +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout +from strands_evals.chaos.effects import NetworkError +from strands_evals.evaluators.chaos import FailureCommunicationEvaluator +from strands_evals.mappers import StrandsInMemorySessionMapper +from strands_evals.simulation import ToolSimulator + +logging.basicConfig(level=logging.INFO, format="%(message)s") +logger = logging.getLogger(__name__) + +telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() +memory_exporter = telemetry.in_memory_exporter + +tool_simulator = ToolSimulator() + + +class FlightSearchResponse(BaseModel): + flights: list[dict[str, Any]] = Field(default_factory=list) + 
total_results: int = Field(default=0) + status: str = Field(default="success") + + +class BookFlightResponse(BaseModel): + booking_id: str = Field(default="") + flight_id: str = Field(default="") + status: str = Field(default="success") + message: str = Field(default="") + + +@tool_simulator.tool(output_schema=FlightSearchResponse) +def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]: + """Search for available flights between two cities on a given date.""" + pass + + +@tool_simulator.tool(output_schema=BookFlightResponse) +def book_flight(flight_id: str) -> dict[str, Any]: + """Book a specific flight by its flight ID.""" + pass + + +chaos_plugin = ChaosPlugin() + +# Two cases that test communication quality: +# 1. Search times out — agent must inform user about the failure +# 2. Both tools fail — agent must communicate multiple failures clearly +chaos_cases = [ + ChaosCase( + name="search_timeout", + input="Find me a flight from SFO to JFK on May 20 and book the cheapest one.", + effects={"tool_effects": {"search_flights": [Timeout(error_message="Tool call timed out after 30s")]}}, + ), + ChaosCase( + name="all_tools_down", + input="Search for flights from Seattle to Tokyo next Tuesday and book one.", + effects={ + "tool_effects": { + "search_flights": [NetworkError(error_message="DNS resolution failed")], + "book_flight": [NetworkError(error_message="Connection refused")], + }, + }, + ), +] + +_search_tool = tool_simulator.get_tool("search_flights") +_book_tool = tool_simulator.get_tool("book_flight") + + +def travel_agent_task(case: ChaosCase) -> dict: + """Run the travel agent under chaos and return output + trajectory.""" + logger.info(f"\n{'─'*60}") + logger.info(f" Case: {case.name}") + logger.info(f" User: {case.input}") + + agent = Agent( + system_prompt=( + "You are a travel booking assistant. Use the available tools to complete " + "the user's request. 
Today's date is May 18, 2025.\n\n" + "If a tool fails or returns an error:\n" + "- Acknowledge the failure honestly to the user\n" + "- Explain what went wrong in plain language\n" + "- Suggest next steps (retry later, try alternative)\n" + "- Do NOT hallucinate successful results" + ), + tools=[_search_tool, _book_tool], + plugins=[chaos_plugin], + callback_handler=None, + trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id}, + ) + + memory_exporter.clear() + try: + result = agent(case.input) + output = str(result) + except Exception as e: + output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}" + + logger.info(f" Agent: {output[:300]}{'...' if len(output) > 300 else ''}") + logger.info(f"{'─'*60}") + + finished_spans = memory_exporter.get_finished_spans() + mapper = StrandsInMemorySessionMapper() + session = mapper.map_to_session(finished_spans, session_id=case.session_id) + + return {"output": output, "trajectory": session} + + +experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=[FailureCommunicationEvaluator()], +) + +report = experiment.run_evaluations(task=travel_agent_task) +report.run_display() diff --git a/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py b/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py new file mode 100644 index 0000000000..cebcc1d671 --- /dev/null +++ b/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py @@ -0,0 +1,137 @@ +import logging +from typing import Any + +from pydantic import BaseModel, Field + +from strands import Agent +from strands_evals import StrandsEvalsTelemetry +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, TruncateFields +from strands_evals.chaos.effects import NetworkError +from strands_evals.evaluators.chaos import PartialCompletionEvaluator +from strands_evals.mappers import StrandsInMemorySessionMapper +from strands_evals.simulation import ToolSimulator + 
+logging.basicConfig(level=logging.INFO, format="%(message)s") +logger = logging.getLogger(__name__) + +telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() +memory_exporter = telemetry.in_memory_exporter + +tool_simulator = ToolSimulator() + + +class FlightSearchResponse(BaseModel): + flights: list[dict[str, Any]] = Field(default_factory=list) + total_results: int = Field(default=0) + status: str = Field(default="success") + + +class BookFlightResponse(BaseModel): + booking_id: str = Field(default="") + flight_id: str = Field(default="") + status: str = Field(default="success") + message: str = Field(default="") + + +class BookingConfirmationResponse(BaseModel): + confirmation_sent: bool = Field(default=False) + method: str = Field(default="email") + message: str = Field(default="") + + +@tool_simulator.tool(output_schema=FlightSearchResponse) +def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]: + """Search for available flights between two cities on a given date.""" + pass + + +@tool_simulator.tool(output_schema=BookFlightResponse) +def book_flight(flight_id: str) -> dict[str, Any]: + """Book a specific flight by its flight ID.""" + pass + + +@tool_simulator.tool(output_schema=BookingConfirmationResponse) +def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method: str = "email") -> dict[str, Any]: + """Send booking confirmation to the user via email or SMS.""" + pass + + +chaos_plugin = ChaosPlugin() + +# Two cases that test partial completion: +# 1. Search works (truncated) but booking fails — user gets degraded flight info but no reservation +# 2. 
Search and booking work but confirmation fails — user gets most of what they asked for +chaos_cases = [ + ChaosCase( + name="search_degraded_booking_fails", + input="Find me a flight from SFO to JFK on May 20, book the cheapest one, and send me a confirmation.", + effects={ + "tool_effects": { + "search_flights": [TruncateFields(max_length=5)], + "book_flight": [NetworkError(error_message="Connection reset by peer")], + }, + }, + ), + ChaosCase( + name="confirmation_fails", + input="Search for flights from Seattle to Tokyo next Tuesday, book one, and email me the confirmation.", + effects={ + "tool_effects": { + "send_booking_confirmation": [NetworkError(error_message="SMTP server unreachable")], + }, + }, + ), +] + +_search_tool = tool_simulator.get_tool("search_flights") +_book_tool = tool_simulator.get_tool("book_flight") +_confirm_tool = tool_simulator.get_tool("send_booking_confirmation") + + +def travel_agent_task(case: ChaosCase) -> dict: + """Run the travel agent under chaos and return output + trajectory.""" + logger.info(f"\n{'─'*60}") + logger.info(f" Case: {case.name}") + logger.info(f" User: {case.input}") + + agent = Agent( + system_prompt=( + "You are a travel booking assistant. Use the available tools to complete " + "the user's request. Today's date is May 18, 2025.\n\n" + "If a tool fails or returns an error:\n" + "- Acknowledge the failure honestly\n" + "- Complete as much of the request as possible\n" + "- Do NOT hallucinate successful results\n" + "- Do NOT retry more than once" + ), + tools=[_search_tool, _book_tool, _confirm_tool], + plugins=[chaos_plugin], + callback_handler=None, + trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id}, + ) + + memory_exporter.clear() + try: + result = agent(case.input) + output = str(result) + except Exception as e: + output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}" + + logger.info(f" Agent: {output[:300]}{'...' 
if len(output) > 300 else ''}") + logger.info(f"{'─'*60}") + + finished_spans = memory_exporter.get_finished_spans() + mapper = StrandsInMemorySessionMapper() + session = mapper.map_to_session(finished_spans, session_id=case.session_id) + + return {"output": output, "trajectory": session} + + +experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=[PartialCompletionEvaluator()], +) + +report = experiment.run_evaluations(task=travel_agent_task) +report.run_display() diff --git a/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py b/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py new file mode 100644 index 0000000000..aa2102db17 --- /dev/null +++ b/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py @@ -0,0 +1,133 @@ +import logging +from typing import Any + +from pydantic import BaseModel, Field + +from strands import Agent +from strands_evals import StrandsEvalsTelemetry +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout +from strands_evals.chaos.effects import ExecutionError +from strands_evals.evaluators.chaos import RecoveryStrategyEvaluator +from strands_evals.mappers import StrandsInMemorySessionMapper +from strands_evals.simulation import ToolSimulator + +logging.basicConfig(level=logging.INFO, format="%(message)s") +logger = logging.getLogger(__name__) + +telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() +memory_exporter = telemetry.in_memory_exporter + +tool_simulator = ToolSimulator() + + +class FlightSearchResponse(BaseModel): + flights: list[dict[str, Any]] = Field(default_factory=list) + total_results: int = Field(default=0) + status: str = Field(default="success") + + +class HotelSearchResponse(BaseModel): + hotels: list[dict[str, Any]] = Field(default_factory=list) + total_results: int = Field(default=0) + status: str = Field(default="success") + + +class BookFlightResponse(BaseModel): + booking_id: str = Field(default="") + flight_id: str = 
Field(default="") + status: str = Field(default="success") + message: str = Field(default="") + + +@tool_simulator.tool(output_schema=FlightSearchResponse) +def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]: + """Search for available flights between two cities on a given date.""" + pass + + +@tool_simulator.tool(output_schema=HotelSearchResponse) +def search_hotels(city: str, check_in: str, check_out: str) -> dict[str, Any]: + """Search for available hotels in a city for given dates.""" + pass + + +@tool_simulator.tool(output_schema=BookFlightResponse) +def book_flight(flight_id: str) -> dict[str, Any]: + """Book a specific flight by its flight ID.""" + pass + + +chaos_plugin = ChaosPlugin() + +# Two cases that test recovery strategy: +# 1. Flight search times out but hotel search works — agent should pivot to hotel search +# 2. Flight search and booking both fail — agent should retry at most once, then report the failure +chaos_cases = [ + ChaosCase( + name="flight_timeout_hotel_available", + input="Plan my trip to Tokyo: find flights from SFO and hotels for May 20-23.", + effects={"tool_effects": {"search_flights": [Timeout()]}}, + ), + ChaosCase( + name="flight_and_booking_fail", + input="Find a flight from NYC to London on June 1 and book the cheapest option.", + effects={ + "tool_effects": { + "search_flights": [ExecutionError(error_message="Internal server error")], + "book_flight": [ExecutionError(error_message="Service unavailable")], + }, + }, + ), +] + +_search_flights_tool = tool_simulator.get_tool("search_flights") +_search_hotels_tool = tool_simulator.get_tool("search_hotels") +_book_tool = tool_simulator.get_tool("book_flight") + + +def travel_agent_task(case: ChaosCase) -> dict: + """Run the travel agent under chaos and return output + trajectory.""" + logger.info(f"\n{'─'*60}") + logger.info(f" Case: {case.name}") + logger.info(f" User: {case.input}") + + agent = Agent( + system_prompt=( + "You are a travel planning assistant. 
Use the available tools to complete " + "the user's request. Today's date is May 18, 2025.\n\n" + "If a tool fails:\n" + "- Try alternative tools that can partially fulfill the request\n" + "- Do NOT retry the same failed tool more than once\n" + "- Do NOT hallucinate results\n" + "- Complete as much of the request as possible with working tools" + ), + tools=[_search_flights_tool, _search_hotels_tool, _book_tool], + plugins=[chaos_plugin], + callback_handler=None, + trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id}, + ) + + memory_exporter.clear() + try: + result = agent(case.input) + output = str(result) + except Exception as e: + output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}" + + logger.info(f" Agent: {output[:300]}{'...' if len(output) > 300 else ''}") + logger.info(f"{'─'*60}") + + finished_spans = memory_exporter.get_finished_spans() + mapper = StrandsInMemorySessionMapper() + session = mapper.map_to_session(finished_spans, session_id=case.session_id) + + return {"output": output, "trajectory": session} + + +experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=[RecoveryStrategyEvaluator()], +) + +report = experiment.run_evaluations(task=travel_agent_task) +report.run_display() diff --git a/site/docs/examples/evals-sdk/chaos_testing.py b/site/docs/examples/evals-sdk/chaos_testing.py new file mode 100644 index 0000000000..8a662cdb89 --- /dev/null +++ b/site/docs/examples/evals-sdk/chaos_testing.py @@ -0,0 +1,171 @@ +import logging +from typing import Any + +from pydantic import BaseModel, Field + +from strands import Agent +from strands_evals import Case, StrandsEvalsTelemetry +from strands_evals.chaos import ( + ChaosCase, + ChaosExperiment, + ChaosPlugin, + CorruptValues, + NetworkError, + RemoveFields, + Timeout, + TruncateFields, +) +from strands_evals.chaos.effects import ExecutionError +from strands_evals.evaluators import GoalSuccessRateEvaluator +from strands_evals.mappers 
import StrandsInMemorySessionMapper +from strands_evals.simulation import ToolSimulator + +logging.basicConfig(level=logging.INFO, format="%(message)s") +logger = logging.getLogger(__name__) + +# Setup telemetry +telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() +memory_exporter = telemetry.in_memory_exporter + +# 1. Set up ToolSimulator and register tools +tool_simulator = ToolSimulator() + +class FlightSearchResponse(BaseModel): + """Response from the flight search tool.""" + + flights: list[dict[str, Any]] = Field(default_factory=list, description="List of available flights") + total_results: int = Field(default=0, description="Total number of results found") + status: str = Field(default="success", description="Operation status") + +class BookFlightResponse(BaseModel): + """Response from the flight booking tool.""" + + booking_id: str = Field(default="", description="Booking confirmation ID") + flight_id: str = Field(default="", description="The booked flight ID") + status: str = Field(default="success", description="Booking status") + message: str = Field(default="", description="Status message") + +class BookingConfirmationResponse(BaseModel): + """Response from the booking confirmation tool.""" + + confirmation_sent: bool = Field(default=False, description="Whether confirmation was sent") + method: str = Field(default="email", description="Delivery method") + message: str = Field(default="", description="Confirmation details") + +@tool_simulator.tool(output_schema=FlightSearchResponse) +def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]: + """Search for available flights between two cities on a given date.""" + pass + +@tool_simulator.tool(output_schema=BookFlightResponse) +def book_flight(flight_id: str) -> dict[str, Any]: + """Book a specific flight by its flight ID. 
Returns booking confirmation.""" + pass + +@tool_simulator.tool(output_schema=BookingConfirmationResponse) +def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method: str = "email") -> dict[str, Any]: + """Send booking confirmation or fallback link to the user via email or SMS.""" + pass + +# 2. Create the ChaosPlugin +chaos_plugin = ChaosPlugin() + +# 3. Define named effect maps +effect_maps = { + # Single-tool, pre-hook: tool call is cancelled before execution + "search_timeout": { + "tool_effects": {"search_flights": [Timeout()]}, + }, + # Two-tool, post-hook: tools execute but responses are silently corrupted + "book_corrupt_and_confirm_truncated": { + "tool_effects": { + "book_flight": [CorruptValues(corrupt_ratio=0.8)], + "send_booking_confirmation": [TruncateFields(max_length=5)], + }, + }, + # All-tool, mixed pre+post: combines hard failures with silent corruption + "total_chaos": { + "tool_effects": { + "search_flights": [NetworkError()], + "book_flight": [ExecutionError()], + "send_booking_confirmation": [RemoveFields(remove_ratio=0.7)], + }, + }, +} + +# 4. Define the task function +# Pre-create tool instances once (avoids registry issues across runs) +_search_tool = tool_simulator.get_tool("search_flights") +_book_tool = tool_simulator.get_tool("book_flight") +_confirm_tool = tool_simulator.get_tool("send_booking_confirmation") + +def travel_agent_task(case: ChaosCase) -> dict: + """Run the travel agent with a single user query.""" + logger.info(f"\n{'─'*60}") + logger.info(f" Case: {case.name}") + logger.info(f" User: {case.input}") + + agent = Agent( + system_prompt=( + "You are a travel booking assistant. You help users search for flights, " + "book them, and send confirmations. Use the available tools to complete " + "the user's request. 
Today's date is May 18, 2025.\n\n" + "Always use the tools directly — do not ask the user for clarification " + "if you can infer reasonable values from context.\n\n" + "If a tool fails or returns an error:\n" + "- Acknowledge the failure honestly to the user\n" + "- Try an alternative approach if possible\n" + "- Do NOT hallucinate successful results\n" + "- Do NOT retry more than once\n\n" + "If tool results look suspicious (e.g., $0 fares, past dates):\n" + "- Inform the user that results seem unreliable\n" + "- Suggest alternatives" + ), + tools=[_search_tool, _book_tool, _confirm_tool], + plugins=[chaos_plugin], + callback_handler=None, + trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id}, + ) + + memory_exporter.clear() + try: + result = agent(case.input) + output = str(result) + except Exception as e: + output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}" + + logger.info(f" Agent: {output[:300]}{'...' if len(output) > 300 else ''}") + logger.info(f"{'─'*60}") + + finished_spans = memory_exporter.get_finished_spans() + mapper = StrandsInMemorySessionMapper() + session = mapper.map_to_session(finished_spans, session_id=case.session_id) + + return {"output": output, "trajectory": session} + +# 5. Define test cases and expand with effect maps +test_cases = [ + Case( + name="book_a_flight", + input="Find me a flight from SFO to JFK on May 20, book the cheapest one, and send me a confirmation.", + ), + Case( + name="search_and_confirm", + input="Search for flights from Seattle to Tokyo next Tuesday, book one, and email me the confirmation.", + ), +] + +# Expand: 2 cases × (3 effect maps + 1 baseline) = 8 ChaosCase objects +chaos_cases = ChaosCase.expand(test_cases, effect_maps, include_no_effect_baseline=True) + +# 6. 
Create and run the ChaosExperiment +evaluators = [GoalSuccessRateEvaluator()] + +experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=evaluators, +) + +# Run: 8 chaos cases = 8 agent invocations +report = experiment.run_evaluations(task=travel_agent_task) +report.run_display() From 82a8d3681abe509d1556927dddf871733fef4796 Mon Sep 17 00:00:00 2001 From: Darren Wang Date: Wed, 10 Jun 2026 16:18:41 +0000 Subject: [PATCH 2/6] use run_evaluations_async as example --- .../chaos_failure_communication_evaluator.py | 15 ++++++--------- .../chaos_partial_completion_evaluator.py | 17 ++++++----------- .../chaos_recovery_strategy_evaluator.py | 17 ++++++----------- site/docs/examples/evals-sdk/chaos_testing.py | 8 ++++++-- 4 files changed, 24 insertions(+), 33 deletions(-) diff --git a/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py b/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py index ea48b1504f..7a4d8acb39 100644 --- a/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py +++ b/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py @@ -1,3 +1,4 @@ +import asyncio import logging from typing import Any @@ -19,32 +20,27 @@ tool_simulator = ToolSimulator() - class FlightSearchResponse(BaseModel): flights: list[dict[str, Any]] = Field(default_factory=list) total_results: int = Field(default=0) status: str = Field(default="success") - class BookFlightResponse(BaseModel): booking_id: str = Field(default="") flight_id: str = Field(default="") status: str = Field(default="success") message: str = Field(default="") - @tool_simulator.tool(output_schema=FlightSearchResponse) def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]: """Search for available flights between two cities on a given date.""" pass - @tool_simulator.tool(output_schema=BookFlightResponse) def book_flight(flight_id: str) -> dict[str, Any]: """Book a specific flight by its flight ID.""" pass - chaos_plugin 
= ChaosPlugin() # Two cases that test communication quality: @@ -71,7 +67,6 @@ def book_flight(flight_id: str) -> dict[str, Any]: _search_tool = tool_simulator.get_tool("search_flights") _book_tool = tool_simulator.get_tool("book_flight") - def travel_agent_task(case: ChaosCase) -> dict: """Run the travel agent under chaos and return output + trajectory.""" logger.info(f"\n{'─'*60}") @@ -110,11 +105,13 @@ def travel_agent_task(case: ChaosCase) -> dict: return {"output": output, "trajectory": session} - experiment = ChaosExperiment( cases=chaos_cases, evaluators=[FailureCommunicationEvaluator()], ) -report = experiment.run_evaluations(task=travel_agent_task) -report.run_display() +async def main(): + report = await experiment.run_evaluations_async(task=travel_agent_task, max_workers=10) + report.run_display() + +asyncio.run(main()) diff --git a/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py b/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py index cebcc1d671..6eccdc2cba 100644 --- a/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py +++ b/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py @@ -1,3 +1,4 @@ +import asyncio import logging from typing import Any @@ -19,44 +20,37 @@ tool_simulator = ToolSimulator() - class FlightSearchResponse(BaseModel): flights: list[dict[str, Any]] = Field(default_factory=list) total_results: int = Field(default=0) status: str = Field(default="success") - class BookFlightResponse(BaseModel): booking_id: str = Field(default="") flight_id: str = Field(default="") status: str = Field(default="success") message: str = Field(default="") - class BookingConfirmationResponse(BaseModel): confirmation_sent: bool = Field(default=False) method: str = Field(default="email") message: str = Field(default="") - @tool_simulator.tool(output_schema=FlightSearchResponse) def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]: """Search for available flights 
between two cities on a given date.""" pass - @tool_simulator.tool(output_schema=BookFlightResponse) def book_flight(flight_id: str) -> dict[str, Any]: """Book a specific flight by its flight ID.""" pass - @tool_simulator.tool(output_schema=BookingConfirmationResponse) def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method: str = "email") -> dict[str, Any]: """Send booking confirmation to the user via email or SMS.""" pass - chaos_plugin = ChaosPlugin() # Two cases that test partial completion: @@ -88,7 +82,6 @@ def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method: _book_tool = tool_simulator.get_tool("book_flight") _confirm_tool = tool_simulator.get_tool("send_booking_confirmation") - def travel_agent_task(case: ChaosCase) -> dict: """Run the travel agent under chaos and return output + trajectory.""" logger.info(f"\n{'─'*60}") @@ -127,11 +120,13 @@ def travel_agent_task(case: ChaosCase) -> dict: return {"output": output, "trajectory": session} - experiment = ChaosExperiment( cases=chaos_cases, evaluators=[PartialCompletionEvaluator()], ) -report = experiment.run_evaluations(task=travel_agent_task) -report.run_display() +async def main(): + report = await experiment.run_evaluations_async(task=travel_agent_task, max_workers=10) + report.run_display() + +asyncio.run(main()) diff --git a/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py b/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py index aa2102db17..7338a53768 100644 --- a/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py +++ b/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py @@ -1,3 +1,4 @@ +import asyncio import logging from typing import Any @@ -19,44 +20,37 @@ tool_simulator = ToolSimulator() - class FlightSearchResponse(BaseModel): flights: list[dict[str, Any]] = Field(default_factory=list) total_results: int = Field(default=0) status: str = Field(default="success") - class 
HotelSearchResponse(BaseModel): hotels: list[dict[str, Any]] = Field(default_factory=list) total_results: int = Field(default=0) status: str = Field(default="success") - class BookFlightResponse(BaseModel): booking_id: str = Field(default="") flight_id: str = Field(default="") status: str = Field(default="success") message: str = Field(default="") - @tool_simulator.tool(output_schema=FlightSearchResponse) def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]: """Search for available flights between two cities on a given date.""" pass - @tool_simulator.tool(output_schema=HotelSearchResponse) def search_hotels(city: str, check_in: str, check_out: str) -> dict[str, Any]: """Search for available hotels in a city for given dates.""" pass - @tool_simulator.tool(output_schema=BookFlightResponse) def book_flight(flight_id: str) -> dict[str, Any]: """Book a specific flight by its flight ID.""" pass - chaos_plugin = ChaosPlugin() # Two cases that test recovery strategy: @@ -84,7 +78,6 @@ def book_flight(flight_id: str) -> dict[str, Any]: _search_hotels_tool = tool_simulator.get_tool("search_hotels") _book_tool = tool_simulator.get_tool("book_flight") - def travel_agent_task(case: ChaosCase) -> dict: """Run the travel agent under chaos and return output + trajectory.""" logger.info(f"\n{'─'*60}") @@ -123,11 +116,13 @@ def travel_agent_task(case: ChaosCase) -> dict: return {"output": output, "trajectory": session} - experiment = ChaosExperiment( cases=chaos_cases, evaluators=[RecoveryStrategyEvaluator()], ) -report = experiment.run_evaluations(task=travel_agent_task) -report.run_display() +async def main(): + report = await experiment.run_evaluations_async(task=travel_agent_task, max_workers=10) + report.run_display() + +asyncio.run(main()) diff --git a/site/docs/examples/evals-sdk/chaos_testing.py b/site/docs/examples/evals-sdk/chaos_testing.py index 8a662cdb89..7e2600e12d 100644 --- a/site/docs/examples/evals-sdk/chaos_testing.py +++ 
b/site/docs/examples/evals-sdk/chaos_testing.py @@ -1,3 +1,4 @@ +import asyncio import logging from typing import Any @@ -167,5 +168,8 @@ def travel_agent_task(case: ChaosCase) -> dict: ) # Run: 8 chaos cases = 8 agent invocations -report = experiment.run_evaluations(task=travel_agent_task) -report.run_display() +async def main(): + report = await experiment.run_evaluations_async(task=travel_agent_task, max_workers=10) + report.run_display() + +asyncio.run(main()) From edcb08c85b1dc8bfee50482dda0514fda21853ef Mon Sep 17 00:00:00 2001 From: Darren Wang Date: Wed, 10 Jun 2026 19:32:12 +0000 Subject: [PATCH 3/6] docs: pages for chaos testing and resilience evaluators --- site/=4.8.2 | 0 site/src/config/navigation.yml | 4 + .../user-guide/evals-sdk/chaos_testing.mdx | 489 ++++++++++++++++++ .../failure_communication_evaluator.mdx | 243 +++++++++ .../user-guide/evals-sdk/evaluators/index.mdx | 20 + .../partial_completion_evaluator.mdx | 261 ++++++++++ .../recovery_strategy_evaluator.mdx | 265 ++++++++++ 7 files changed, 1282 insertions(+) create mode 100644 site/=4.8.2 create mode 100644 site/src/content/docs/user-guide/evals-sdk/chaos_testing.mdx create mode 100644 site/src/content/docs/user-guide/evals-sdk/evaluators/failure_communication_evaluator.mdx create mode 100644 site/src/content/docs/user-guide/evals-sdk/evaluators/partial_completion_evaluator.mdx create mode 100644 site/src/content/docs/user-guide/evals-sdk/evaluators/recovery_strategy_evaluator.mdx diff --git a/site/=4.8.2 b/site/=4.8.2 new file mode 100644 index 0000000000..e69de29bb2 diff --git a/site/src/config/navigation.yml b/site/src/config/navigation.yml index faffe7b129..363f6f0e4d 100644 --- a/site/src/config/navigation.yml +++ b/site/src/config/navigation.yml @@ -197,6 +197,9 @@ sidebar: - docs/user-guide/evals-sdk/evaluators/multimodal_faithfulness_evaluator - docs/user-guide/evals-sdk/evaluators/multimodal_instruction_following_evaluator - 
docs/user-guide/evals-sdk/evaluators/goal_success_rate_evaluator + - docs/user-guide/evals-sdk/evaluators/failure_communication_evaluator + - docs/user-guide/evals-sdk/evaluators/partial_completion_evaluator + - docs/user-guide/evals-sdk/evaluators/recovery_strategy_evaluator - docs/user-guide/evals-sdk/evaluators/tool_selection_evaluator - docs/user-guide/evals-sdk/evaluators/tool_parameter_evaluator - docs/user-guide/evals-sdk/evaluators/deterministic_evaluators @@ -213,6 +216,7 @@ sidebar: - docs/user-guide/evals-sdk/simulators - docs/user-guide/evals-sdk/simulators/user_simulation - docs/user-guide/evals-sdk/simulators/tool_simulation + - docs/user-guide/evals-sdk/chaos_testing - label: Remote Trace Providers items: - docs/user-guide/evals-sdk/how-to/trace_providers diff --git a/site/src/content/docs/user-guide/evals-sdk/chaos_testing.mdx b/site/src/content/docs/user-guide/evals-sdk/chaos_testing.mdx new file mode 100644 index 0000000000..fa7c903bcb --- /dev/null +++ b/site/src/content/docs/user-guide/evals-sdk/chaos_testing.mdx @@ -0,0 +1,489 @@ +--- +title: Chaos Testing +tags: [error-handling, simulation] +sidebar: + label: "Chaos Testing" +--- + +## Overview + +Chaos testing systematically evaluates agent resilience by injecting controlled failures into tool execution. Using `ChaosPlugin`, `ChaosCase`, and `ChaosExperiment`, you can test how agents handle tool timeouts, network errors, and corrupted responses without modifying agent code. + +This enables you to answer questions like: +- Does the agent gracefully communicate failures to users? +- Can the agent achieve partial goals when some tools fail? +- Does the agent employ effective recovery strategies? + +## Why Chaos Testing? + +Traditional evaluation tests agents under ideal conditions. 
In production, tools fail unpredictably: + +**Standard Evaluation:** +- Tools always return correct responses +- No network failures or timeouts +- Cannot reveal fragile error handling +- Misses degraded-mode behavior + +**Chaos Testing:** +- Injects realistic tool failures (timeouts, network errors, validation errors) +- Corrupts tool responses (truncated fields, removed data, corrupted values) +- Tests agent resilience without live infrastructure failures +- Measures graceful degradation and recovery behavior +- Quantifies partial goal completion under failure +- Reveals which tools are single points of failure and which the agent can route around + +## When to Use Chaos Testing + +Use chaos testing when you need to: +- **Evaluate Resilience**: Test how agents handle tool failures gracefully +- **Assess Recovery**: Verify agents try alternative approaches when tools fail +- **Measure Degradation**: Quantify how much of a goal agents achieve despite failures +- **Test Communication**: Ensure agents inform users clearly about failures +- **Validate Robustness**: Confirm agents don't crash or loop on corrupted data + +## How It Works + +Chaos testing integrates with Strands' plugin system via `BeforeToolCallEvent` and `AfterToolCallEvent` hooks: + +1. **ChaosCase**: Extends `Case` with an `effects` field mapping tool names to failure effects +2. **ChaosPlugin**: A Strands plugin that intercepts tool calls and applies effects transparently +3. **ChaosExperiment**: Composes the base `Experiment` to manage chaos context per case +4. **ChaosEffect**: A hierarchy of pre-hook effects (cancel tool calls) and post-hook effects (corrupt responses) + +The workflow: +1. You define `ChaosCase` objects with effects specifying which tools should fail and how +2. `ChaosExperiment` sets a `ContextVar` with the active case before each task execution (thread/async safe) +3. `ChaosPlugin` reads the active case from the `ContextVar` and applies effects at the appropriate hook point +4. 
Your task function needs no chaos-specific logic: just add `ChaosPlugin()` to the agent's plugins list + +## Basic Usage + +### Define chaos test cases with effects + +Define your tools as usual with `@tool`, then create `ChaosCase` objects specifying which tools should fail. The effect map keys must match the tool function names exactly: + +```python +from strands import tool +from strands_evals.chaos import ChaosCase, Timeout +from strands_evals.chaos.effects import NetworkError + +@tool +def get_weather(city: str) -> str: + """Get current weather for a city.""" + return '{"temperature": 72, "condition": "sunny"}' + +chaos_cases = [ + ChaosCase( + name="search_timeout", + input="What's the weather in Seattle?", + effects={"tool_effects": {"get_weather": [Timeout()]}}, + ), + ChaosCase( + name="network_failure", + input="What's the weather in Seattle?", + effects={"tool_effects": {"get_weather": [NetworkError()]}}, + ), +] +``` + +### Add chaos plugin to your agent + +Add `ChaosPlugin()` to the agent's plugins list. 
No other chaos-specific code changes are needed; register your tools as usual: + +```python +from strands import Agent +from strands_evals.chaos import ChaosPlugin + +chaos_plugin = ChaosPlugin() + +def task_function(case: ChaosCase) -> dict: + agent = Agent( + system_prompt="You are a helpful weather assistant.", + tools=[get_weather], plugins=[chaos_plugin], + callback_handler=None + ) + response = agent(case.input) + return {"output": str(response)} +``` + +### Run chaos experiment + +```python +from strands_evals.chaos import ChaosExperiment +from strands_evals.evaluators import GoalSuccessRateEvaluator + +experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=[GoalSuccessRateEvaluator()] +) +report = experiment.run_evaluations(task=task_function) +report.run_display() +``` + +## Effect Types + +### Pre-hook Effects (Tool Call Failures) + +These effects cancel the tool call entirely and return an error: + +| Effect | Description | +| :------- | :------------ | +| `Timeout` | Simulates a tool execution timeout | +| `NetworkError` | Simulates a network connectivity failure | +| `ExecutionError` | Simulates a runtime error during tool execution | +| `ValidationError` | Simulates invalid input/output validation failure | + +```python +from strands_evals.chaos import Timeout +from strands_evals.chaos.effects import NetworkError, ExecutionError, ValidationError + +effect_maps = { + "timeout": {"tool_effects": {"my_tool": [Timeout()]}}, + "network": {"tool_effects": {"my_tool": [NetworkError()]}}, + "execution": {"tool_effects": {"my_tool": [ExecutionError()]}}, + "validation": {"tool_effects": {"my_tool": [ValidationError()]}}, +} +``` + +### Post-hook Effects (Response Corruption) + +These effects let the tool execute but corrupt the response: + +| Effect | Description | Parameters | +| :------- | :------------ | :----------- | +| `TruncateFields` | Truncates string fields in the response | `max_length` | +| `RemoveFields` | Randomly removes fields from the response | `remove_ratio` | +| `CorruptValues` | Corrupts field 
values with garbage data | `corrupt_ratio` | + +```python +from strands_evals.chaos import TruncateFields, RemoveFields, CorruptValues + +effect_maps = { + "truncated": {"tool_effects": {"my_tool": [TruncateFields(max_length=10)]}}, + "missing_fields": {"tool_effects": {"my_tool": [RemoveFields(remove_ratio=0.5)]}}, + "corrupted": {"tool_effects": {"my_tool": [CorruptValues(corrupt_ratio=0.3)]}}, +} +``` + +### Compound Effects (Multiple Tools) + +Target multiple tools in a single case to simulate cascading failures: + +```python +chaos_case = ChaosCase( + name="total_chaos", + input="Book me a flight to Paris", + effects={ + "tool_effects": { + "search_flights": [Timeout()], + "book_flight": [NetworkError()], + "send_confirmation": [CorruptValues(corrupt_ratio=0.5)], + } + }, +) +``` + +## Expanding Cases Across Multiple Effects + +When you have multiple base cases and want to test across several failure scenarios, use `ChaosCase.expand()` to generate the Cartesian product: + +```python +from strands_evals import Case +from strands_evals.chaos import ChaosCase, Timeout +from strands_evals.chaos.effects import NetworkError + +# Define base test cases +base_cases = [ + Case(name="weather-seattle", input="What's the weather in Seattle?"), + Case(name="weather-tokyo", input="What's the weather in Tokyo?"), +] + +# Define named effect maps +effect_maps = { + "search_timeout": { + "tool_effects": {"get_weather": [Timeout()]}, + }, + "network_failure": { + "tool_effects": {"get_weather": [NetworkError()]}, + }, +} + +# Expand: 2 cases x (2 effect maps + 1 baseline) = 6 ChaosCase objects +chaos_cases = ChaosCase.expand(base_cases, effect_maps, include_no_effect_baseline=True) +``` + +Setting `include_no_effect_baseline=True` adds an extra variant of each base case with no effects applied. 
This gives you a clean comparison point: you can see how the agent scores under normal conditions versus under each failure scenario, making it easy to measure the delta that chaos introduces. + +## Integration with ToolSimulator + +Chaos testing works naturally with `ToolSimulator` for fully controlled evaluation. Simulated tools provide reproducible responses, and chaos effects inject failures on top: + +```python +from strands import Agent +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout, CorruptValues +from strands_evals.evaluators import GoalSuccessRateEvaluator +from strands_evals.simulation import ToolSimulator +from pydantic import BaseModel, Field + +tool_simulator = ToolSimulator() + +class SearchResult(BaseModel): + title: str = Field(..., description="Result title") + snippet: str = Field(..., description="Result snippet") + +@tool_simulator.tool(output_schema=SearchResult) +def web_search(query: str) -> dict: + """Search the web for information.""" + pass + +chaos_cases = [ + ChaosCase( + name="search_timeout", + input="Find recent news about AI agents", + effects={"tool_effects": {"web_search": [Timeout()]}}, + ), + ChaosCase( + name="corrupted_results", + input="Find recent news about AI agents", + effects={"tool_effects": {"web_search": [CorruptValues(corrupt_ratio=0.5)]}}, + ), +] + +chaos_plugin = ChaosPlugin() +_search_tool = tool_simulator.get_tool("web_search") + +def task_function(case: ChaosCase) -> dict: + agent = Agent( + tools=[_search_tool], + plugins=[chaos_plugin], + callback_handler=None + ) + response = agent(case.input) + return {"output": str(response)} + +experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=[GoalSuccessRateEvaluator()] +) +report = experiment.run_evaluations(task=task_function) +report.run_display() +``` + +### Chaos Testing vs Simulators + +Understanding when to use each: + +| Aspect | Simulators | Chaos Testing | +| :------- | :---------- | :-------------- | +| **Role** 
| Replace tool execution entirely | Inject failures into tool execution | +| **Scope** | All tool calls are simulated | Only targeted tools are affected | +| **Use Case** | Test without infrastructure | Test resilience under failure | +| **Combination** | Can be used together | Chaos effects apply on top of simulated tools | + +## Resilience Evaluators + +Chaos testing ships with three specialized evaluators designed to assess agent behavior under failure: + +| Evaluator | What It Measures | Scoring | Baseline | +| :---------- | :---------------- | :-------- | :-------- | +| [FailureCommunicationEvaluator](/docs/user-guide/evals-sdk/evaluators/failure_communication_evaluator/) | Clarity, actionability, transparency, and tone of failure messages | Five-level (0.0, 0.25, 0.5, 0.75, 1.0) | 0.5 when no failures occur | +| [PartialCompletionEvaluator](/docs/user-guide/evals-sdk/evaluators/partial_completion_evaluator/) | Fraction of user goal achieved despite failures | Continuous (0.0 to 1.0) | ~1.0 when task completes fully | +| [RecoveryStrategyEvaluator](/docs/user-guide/evals-sdk/evaluators/recovery_strategy_evaluator/) | Quality of recovery actions: exploration breadth, retry discipline, approach variation | Five-level (0.0, 0.25, 0.5, 0.75, 1.0) | 0.5 when no failures occur | + +```python +from strands_evals.evaluators.chaos import ( + FailureCommunicationEvaluator, + PartialCompletionEvaluator, + RecoveryStrategyEvaluator, +) + +evaluators = [ + PartialCompletionEvaluator(), # How much was completed? + FailureCommunicationEvaluator(), # Did the agent tell the user? + RecoveryStrategyEvaluator(), # Did the agent try alternatives? 
+] + +experiment = ChaosExperiment(cases=chaos_cases, evaluators=evaluators) +report = experiment.run_evaluations(task=task_function) +report.run_display() +``` + +### Interpreting Results + +When reviewing evaluation outputs, look at evaluator scores together to identify patterns in your agent's failure-handling behavior: + +- **High FailureCommunication + low PartialCompletion**: Agent explains failures well but cannot work around them. Add fallback tools or alternative approaches. +- **High RecoveryStrategy + low PartialCompletion**: Agent tries hard (retries, alternatives) but every option it tries fails as well. The failure is too severe for the available tools, or the agent's fallback tools are also broken. +- **Low FailureCommunication + high PartialCompletion**: Agent completes the task despite failures but doesn't inform the user about degraded results. Add failure-awareness instructions to the system prompt. +- **Low RecoveryStrategy + low PartialCompletion**: Agent gives up immediately without attempting alternatives. Add retry logic, fallback tools, or system prompt guidance about recovery behavior. + +Check the `reason` field in each evaluation output for specific details about what the judge observed in the trace. 
+ +## Advanced Chaos Testing Patterns + +### Pattern 1: Comparing Agent Configurations Under Chaos + +Compare how different system prompts affect resilience: + +```python +from strands_evals.evaluators.chaos import PartialCompletionEvaluator + +def compare_agents_under_chaos(chaos_cases, configs): + """Compare how different agent configs handle the same failures.""" + results = {} + + for config_name, system_prompt in configs.items(): + def make_task(prompt): + def task_function(case: ChaosCase) -> dict: + agent = Agent( + system_prompt=prompt, + plugins=[ChaosPlugin()], + callback_handler=None, + ) + response = agent(case.input) + return {"output": str(response)} + return task_function + + experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=[PartialCompletionEvaluator()] + ) + report = experiment.run_evaluations(task=make_task(system_prompt)) + results[config_name] = report + + return results +``` + +### Pattern 2: Degradation Sweep + +Map the resilience curve of your agent by sweeping corruption intensity from 0% to 100%. This reveals the critical threshold where your agent breaks, and whether degradation is gradual or cliff-edge: + +```python +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, CorruptValues +from strands_evals.evaluators import GoalSuccessRateEvaluator +from strands_evals.evaluators.chaos import PartialCompletionEvaluator + +# Sweep corrupt_ratio from mild to total corruption +sweep_cases = [ + ChaosCase( + name=f"corrupt_{int(ratio*100)}pct", + input="Find the cheapest flight to Paris next Tuesday", + effects={"tool_effects": {"search_flights": [CorruptValues(corrupt_ratio=ratio)]}}, + ) + for ratio in [0.0, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9, 1.0] +] + +experiment = ChaosExperiment( + cases=sweep_cases, + evaluators=[GoalSuccessRateEvaluator(), PartialCompletionEvaluator()] +) +report = experiment.run_evaluations(task=task_function) +report.run_display() + +# Analyze: at what ratio does goal success drop below 0.5? 
+# Gradual degradation = resilient agent; cliff-edge = fragile agent +``` + +### Pattern 3: Multi-turn Chaos Testing with User Simulator + +Combine chaos testing with user simulation for multi-turn resilience evaluation: + +```python +from strands_evals import ActorSimulator + +def task_function(case: ChaosCase) -> dict: + user_sim = ActorSimulator.from_case_for_user_simulator( + case=case, max_turns=8 + ) + + agent = Agent( + system_prompt="You are a helpful assistant.", + plugins=[ChaosPlugin()], + callback_handler=None, + ) + + user_message = case.input + while user_sim.has_next(): + agent_response = agent(user_message) + user_result = user_sim.act(str(agent_response)) + user_message = str(user_result.structured_output.message) + + return {"output": str(agent_response)} +``` + +## Best Practices + +### 1. Start with Baseline Comparisons + +Always include a no-effect baseline to compare agent performance with and without failures. When using `ChaosCase.expand()`: + +```python +chaos_cases = ChaosCase.expand(cases, effect_maps, include_no_effect_baseline=True) +``` + +### 2. Test One Failure at a Time First + +Start with your most critical tool against all effect types before expanding to multi-tool scenarios. Single-tool tests reveal individual tool robustness; compound tests reveal pipeline fragility: + +```python +# Single failure first +single_case = ChaosCase( + name="search_fails", + input="Find flights to Paris", + effects={"tool_effects": {"search": [Timeout()]}}, +) + +# Compound (test after single failures are understood) +compound_case = ChaosCase( + name="total_chaos", + input="Find flights to Paris", + effects={ + "tool_effects": { + "search": [Timeout()], + "database": [NetworkError()], + } + }, +) +``` + +### 3. Use Resilience Evaluators Together + +Combine all three resilience evaluators for a complete picture: + +```python +evaluators = [ + FailureCommunicationEvaluator(), # Did the agent tell the user? 
+ PartialCompletionEvaluator(), # How much was achieved? + RecoveryStrategyEvaluator(), # Did it try alternatives? +] +``` + +### 4. Match Error Types to Tool Semantics + +Choose failure types that reflect realistic production failures: +- `NetworkError` for external API tools +- `Timeout` for slow or overloaded services +- `ExecutionError` for local computation tools +- `ValidationError` for tools with strict input schemas + +### 5. Read the Reasoning, Not Just Pass/Fail + +Evaluator scores alone don't tell the full story. Check the `reason` field in evaluation outputs to understand *why* the agent scored the way it did. A score of 0.5 may mean "barely passes" or "no failures occurred to evaluate against," and the reasoning explains which. + +### 6. Iterate: Diagnose, Fix, Validate + +Treat chaos testing as an iterative improvement loop: +1. Run the experiment and identify which tool-failure combinations produce low scores +2. Fix the agent (add retry logic, fallback tools, or better system prompt guidance) +3. Re-run the same experiment and verify that previously failing cases now pass + +### 7. Monitor Token Usage Under Chaos + +Agents under failure often burn tokens on retry storms (repeated failed tool calls). Compare token consumption between baseline and chaos cases to detect runaway costs. A sharp increase signals excessive retries; a sharp decrease signals the agent is giving up too early. 
+ +## Related Documentation + +- [Tool Simulation](/docs/user-guide/evals-sdk/simulators/tool_simulation/): Simulate tool behavior for reproducible tests +- [Goal Success Rate Evaluator](/docs/user-guide/evals-sdk/evaluators/goal_success_rate_evaluator/): Assess goal completion +- [Simulators Overview](/docs/user-guide/evals-sdk/simulators/): Simulator framework +- [Evaluators](/docs/user-guide/evals-sdk/evaluators/): All available evaluators diff --git a/site/src/content/docs/user-guide/evals-sdk/evaluators/failure_communication_evaluator.mdx b/site/src/content/docs/user-guide/evals-sdk/evaluators/failure_communication_evaluator.mdx new file mode 100644 index 0000000000..0c46fb2ff6 --- /dev/null +++ b/site/src/content/docs/user-guide/evals-sdk/evaluators/failure_communication_evaluator.mdx @@ -0,0 +1,243 @@ +--- +title: Failure Communication Evaluator +tags: [error-handling, tool-evaluation, simulation] +sidebar: + label: "Failure Communication" +--- + +## Overview + +The `FailureCommunicationEvaluator` assesses how well an agent communicates failures to the user when tools or services fail. It uses an LLM-as-judge approach with a five-level scoring rubric to evaluate clarity, actionability, transparency, and tone of failure messages. A complete example can be found [here](https://github.com/strands-agents/docs/blob/main/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py). 
+ +## Key Features + +- **Session-Level Evaluation**: Evaluates the full conversation trace including tool call results and agent responses +- **Five-Level Scoring**: Granular scale from "No Communication" to "Excellent Communication" +- **Multi-Dimensional Assessment**: Evaluates clarity, actionability, transparency, and tone +- **Structured Reasoning**: Provides step-by-step reasoning for each evaluation +- **Async Support**: Supports both synchronous and asynchronous evaluation + +## When to Use + +Use the `FailureCommunicationEvaluator` when you need to: + +- Assess whether agents inform users about tool failures +- Evaluate the quality and helpfulness of error messages +- Test agent transparency under degraded conditions +- Measure user trust maintenance during failures +- Compare failure communication across agent configurations + +## Evaluation Level + +This evaluator operates at the **SESSION_LEVEL**, evaluating the full conversation trace including tool call results and agent responses. + +## Parameters + +### `model` (optional) +- **Type**: `Union[Model, str, None]` +- **Default**: `None` (uses default Bedrock model) +- **Description**: The model to use as the judge. + +## Scoring System + +| Rating | Score | Description | +|--------|-------|-------------| +| No Communication | 0.0 | Agent silently ignores failures, fabricates data, or crashes | +| Poor Communication | 0.25 | Agent vaguely acknowledges an issue without useful information | +| Neutral | 0.5 | Mixed communication, or no failures occurred to communicate | +| Good Communication | 0.75 | Agent clearly explains the failure and suggests next steps | +| Excellent Communication | 1.0 | Agent transparently explains what failed, why, and provides actionable alternatives | + +A response passes the evaluation if the score is >= 0.5. + +When no tool failures occur during the session, the evaluator produces a neutral score of 0.5, since there are no failures to assess communication quality against. 
+ +## Basic Usage + +```python +import asyncio +from typing import Any + +from pydantic import BaseModel, Field + +from strands import Agent +from strands_evals import StrandsEvalsTelemetry +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout +from strands_evals.chaos.effects import NetworkError +from strands_evals.evaluators.chaos import FailureCommunicationEvaluator +from strands_evals.mappers import StrandsInMemorySessionMapper +from strands_evals.simulation import ToolSimulator + +telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() +memory_exporter = telemetry.in_memory_exporter + +tool_simulator = ToolSimulator() + +class FlightSearchResponse(BaseModel): + flights: list[dict[str, Any]] = Field(default_factory=list) + status: str = Field(default="success") + +@tool_simulator.tool(output_schema=FlightSearchResponse) +def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]: + """Search for available flights between two cities on a given date.""" + pass + +chaos_plugin = ChaosPlugin() + +chaos_cases = [ + ChaosCase( + name="search_timeout", + input="Find me a flight from SFO to JFK on May 20.", + effects={"tool_effects": {"search_flights": [Timeout(error_message="Tool call timed out after 30s")]}}, + ), + ChaosCase( + name="all_tools_down", + input="Search for flights from Seattle to Tokyo next Tuesday.", + effects={"tool_effects": {"search_flights": [NetworkError(error_message="DNS resolution failed")]}}, + ), +] + +_search_tool = tool_simulator.get_tool("search_flights") + +def task_function(case: ChaosCase) -> dict: + agent = Agent( + system_prompt="You are a travel booking assistant.", + tools=[_search_tool], + plugins=[chaos_plugin], + callback_handler=None, + trace_attributes={"session.id": case.session_id}, + ) + + memory_exporter.clear() + response = agent(case.input) + + finished_spans = memory_exporter.get_finished_spans() + mapper = StrandsInMemorySessionMapper() + session = 
mapper.map_to_session(finished_spans, session_id=case.session_id) + + return {"output": str(response), "trajectory": session} + +experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=[FailureCommunicationEvaluator()], +) + +async def main(): + report = await experiment.run_evaluations_async(task=task_function, max_workers=10) + report.run_display() + +asyncio.run(main()) +``` + +## Evaluation Output + +The `FailureCommunicationEvaluator` returns `EvaluationOutput` objects with: + +- **score**: Float (0.0, 0.25, 0.5, 0.75, or 1.0) +- **test_pass**: `True` if score >= 0.5, `False` otherwise +- **reason**: Step-by-step reasoning explaining the evaluation +- **label**: One of the categorical labels (e.g., "Good Communication", "Excellent Communication") + +## What Gets Evaluated + +The evaluator examines: + +1. **Tool Call Results**: Whether tools returned errors or failures +2. **Agent Response**: How the agent communicated those failures to the user +3. **Communication Quality**: + - Does the agent acknowledge the failure clearly? + - Does it suggest actionable next steps? + - Is it transparent about what went wrong (without exposing internals)? + - Is the tone appropriate (not dismissive, not alarming)? + +## Best Practices + +1. **Include Tool Failures in Test Cases**: The evaluator needs tool failures in the trace to assess communication quality +2. **Capture Complete Sessions**: Include all conversation turns and tool call results in the trajectory +3. **Test Various Failure Types**: Include timeouts, network errors, and validation errors +4. **Combine with Other Evaluators**: Use alongside `RecoveryStrategyEvaluator` and `PartialCompletionEvaluator` +5. **Provide System Prompts with Failure Guidance**: Agents with explicit failure-handling instructions tend to score higher + +## Common Patterns + +### Pattern 1: Single Tool Failure +Evaluate how the agent communicates a single tool timeout or error. 
+ +### Pattern 2: Multiple Tool Failures +Assess communication quality when several tools fail simultaneously. + +### Pattern 3: Graceful Degradation +Measure how the agent explains partial results when some tools succeed and others fail. + +## Example Scenarios + +### Scenario 1: Excellent Communication +``` +Tool: search_flights -> Timeout +Agent: "I'm sorry, but the flight search service is currently experiencing delays +and couldn't complete your request. You could try again in a few minutes, or I +can help you explore alternative options like checking a different travel date." +Evaluation: Excellent Communication (1.0) - Clear, transparent, actionable +``` + +### Scenario 2: Good Communication +``` +Tool: search_flights -> NetworkError +Agent: "I wasn't able to search for flights right now due to a connection issue. +Please try again shortly." +Evaluation: Good Communication (0.75) - Acknowledges failure, suggests retry +``` + +### Scenario 3: No Communication +``` +Tool: search_flights -> Timeout +Agent: "There are no flights available for that route." +Evaluation: No Communication (0.0) - Fabricates results instead of reporting failure +``` + +## Common Issues and Solutions + +### Issue 1: Score is Always 0.5 +**Problem**: The evaluator always returns a neutral score. +**Solution**: Ensure tool failures are actually present in the trace. If no tools fail, the evaluator returns 0.5 by design. + +### Issue 2: Agent Not Reporting Failures +**Problem**: Agent doesn't mention failures in its response. +**Solution**: Add failure-handling instructions to the system prompt (e.g., "If a tool fails, acknowledge the failure honestly"). + +### Issue 3: No Trajectory Data +**Problem**: Evaluator returns empty results. +**Solution**: Ensure telemetry captures the full session, including tool call spans, and that the task function returns the mapped session under the `trajectory` key. + +## Differences from Other Evaluators + +- **vs. 
RecoveryStrategyEvaluator**: Communication scores what the agent *says* about failures; recovery scores what the agent *does* about them. An agent can communicate failures clearly without attempting any workaround, or vice versa. +- **vs. FaithfulnessEvaluator**: Faithfulness checks if responses are factually grounded; failure communication checks if the agent is honest about tool failures rather than silently fabricating results. +- **vs. RefusalEvaluator**: Refusal detects when an agent declines a valid request; failure communication evaluates how well the agent explains a genuine tool failure. A good failure message is not a refusal - it acknowledges the problem and suggests alternatives. +- **vs. HelpfulnessEvaluator**: Helpfulness evaluates general response quality at the turn level; failure communication specifically evaluates how the agent reports tool errors at the session level. + +## Use Cases + +### Use Case 1: Customer-Facing Agents +Ensure agents inform users clearly when backend services are down. + +### Use Case 2: Chaos Testing +Evaluate agent transparency under deliberately injected tool failures. + +### Use Case 3: Trust Assessment +Measure whether agents maintain user trust during degraded conditions. + +### Use Case 4: Error Message Quality +Compare failure communication across different system prompt configurations. 
+ +## Related Evaluators + +- [**RecoveryStrategyEvaluator**](recovery_strategy_evaluator): Evaluates quality of recovery actions +- [**PartialCompletionEvaluator**](partial_completion_evaluator): Measures what fraction of goals were achieved despite failures +- [**FaithfulnessEvaluator**](faithfulness_evaluator): Evaluates if responses are factually grounded +- [**RefusalEvaluator**](refusal_evaluator): Detects when agents inappropriately refuse valid requests +- [**GoalSuccessRateEvaluator**](goal_success_rate_evaluator): Binary goal achievement assessment + +## Related Documentation + +- [Chaos Testing](/docs/user-guide/evals-sdk/chaos_testing/): Chaos testing overview and guide diff --git a/site/src/content/docs/user-guide/evals-sdk/evaluators/index.mdx b/site/src/content/docs/user-guide/evals-sdk/evaluators/index.mdx index 98dfcfb500..3aab1b2162 100644 --- a/site/src/content/docs/user-guide/evals-sdk/evaluators/index.mdx +++ b/site/src/content/docs/user-guide/evals-sdk/evaluators/index.mdx @@ -186,6 +186,26 @@ Evaluators operate at different levels of granularity: - **Purpose**: Determine if user goals were successfully achieved - **Use Case**: Measure end-to-end task completion success +### Resilience Evaluators + +**[FailureCommunicationEvaluator](failure_communication_evaluator.md)** + +- **Level**: SESSION_LEVEL +- **Purpose**: Assess how well agents communicate failures to users +- **Use Case**: Evaluate transparency, clarity, and actionability of error messages under chaos testing + +**[PartialCompletionEvaluator](partial_completion_evaluator.md)** + +- **Level**: SESSION_LEVEL +- **Purpose**: Measure what fraction of a goal was achieved despite failures +- **Use Case**: Quantify graceful degradation and partial progress under tool failures + +**[RecoveryStrategyEvaluator](recovery_strategy_evaluator.md)** + +- **Level**: SESSION_LEVEL +- **Purpose**: Evaluate the quality of agent recovery actions when tools fail +- **Use Case**: Assess retry discipline, 
exploration breadth, and approach variation under chaos + ### Deterministic Evaluators **[Deterministic Evaluators](deterministic_evaluators.md)** diff --git a/site/src/content/docs/user-guide/evals-sdk/evaluators/partial_completion_evaluator.mdx b/site/src/content/docs/user-guide/evals-sdk/evaluators/partial_completion_evaluator.mdx new file mode 100644 index 0000000000..058c792d6d --- /dev/null +++ b/site/src/content/docs/user-guide/evals-sdk/evaluators/partial_completion_evaluator.mdx @@ -0,0 +1,261 @@ +--- +title: Partial Completion Evaluator +tags: [error-handling, tool-evaluation, simulation] +sidebar: + label: "Partial Completion" +--- + +## Overview + +The `PartialCompletionEvaluator` scores what fraction of the user's goal was achieved, returning a continuous 0.0 to 1.0 score. Unlike the binary `GoalSuccessRateEvaluator`, this evaluator captures partial progress when an agent completes some sub-steps of a multi-step task but cannot finish the rest. A complete example can be found [here](https://github.com/strands-agents/docs/blob/main/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py). 
+ +## Key Features + +- **Session-Level Evaluation**: Evaluates the full conversation trace to assess progress across all task sub-steps +- **Continuous Scoring**: Fine-grained 0.0 to 1.0 scale captures partial progress +- **Sub-Goal Decomposition**: Evaluates completion of individual task steps +- **Structured Reasoning**: Provides step-by-step reasoning for each evaluation +- **Async Support**: Supports both synchronous and asynchronous evaluation + +## When to Use + +Use the `PartialCompletionEvaluator` when you need to: + +- Measure how much of a multi-step task was completed +- Distinguish between "got nothing done" and "completed most steps" +- Quantify graceful degradation under increasing failure severity +- Identify which failure types cause the most progress loss +- Compare agent resilience across different configurations + +## Evaluation Level + +This evaluator operates at the **SESSION_LEVEL**, evaluating the full conversation trace to assess progress across all task sub-steps. + +## Parameters + +### `model` (optional) +- **Type**: `Union[Model, str, None]` +- **Default**: `None` (uses default Bedrock model) +- **Description**: The model to use as the judge. + +## Scoring System + +| Score | Interpretation | +|-------|---------------| +| 1.0 | Full goal achieved, all sub-steps completed | +| 0.7-0.9 | Most sub-goals completed, one or two blocked | +| 0.4-0.6 | Partial progress, some steps completed, key steps blocked | +| 0.1-0.3 | Minimal progress, early steps completed but majority blocked | +| 0.0 | No progress: agent gave up entirely, crashed, or completed nothing | + +A response passes the evaluation if the score is >= 0.5. + +The evaluator decomposes the task into logical sub-steps based on the conversation context and assesses which were completed based on the tool call history and agent responses. 
+ +## Basic Usage + +```python +import asyncio +from typing import Any + +from pydantic import BaseModel, Field + +from strands import Agent +from strands_evals import StrandsEvalsTelemetry +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, TruncateFields +from strands_evals.chaos.effects import NetworkError +from strands_evals.evaluators.chaos import PartialCompletionEvaluator +from strands_evals.mappers import StrandsInMemorySessionMapper +from strands_evals.simulation import ToolSimulator + +telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() +memory_exporter = telemetry.in_memory_exporter + +tool_simulator = ToolSimulator() + +class FlightSearchResponse(BaseModel): + flights: list[dict[str, Any]] = Field(default_factory=list) + status: str = Field(default="success") + +class BookFlightResponse(BaseModel): + booking_id: str = Field(default="") + status: str = Field(default="success") + +@tool_simulator.tool(output_schema=FlightSearchResponse) +def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]: + """Search for available flights between two cities on a given date.""" + pass + +@tool_simulator.tool(output_schema=BookFlightResponse) +def book_flight(flight_id: str) -> dict[str, Any]: + """Book a specific flight by its flight ID.""" + pass + +chaos_plugin = ChaosPlugin() + +# Search works (degraded) but booking fails: partial completion expected +chaos_cases = [ + ChaosCase( + name="search_degraded_booking_fails", + input="Find me a flight from SFO to JFK on May 20 and book the cheapest one.", + effects={ + "tool_effects": { + "search_flights": [TruncateFields(max_length=5)], + "book_flight": [NetworkError(error_message="Connection reset by peer")], + }, + }, + ), +] + +_search_tool = tool_simulator.get_tool("search_flights") +_book_tool = tool_simulator.get_tool("book_flight") + +def task_function(case: ChaosCase) -> dict: + agent = Agent( + system_prompt="You are a travel booking assistant.", + 
tools=[_search_tool, _book_tool], + plugins=[chaos_plugin], + callback_handler=None, + trace_attributes={"session.id": case.session_id}, + ) + + memory_exporter.clear() + response = agent(case.input) + + finished_spans = memory_exporter.get_finished_spans() + mapper = StrandsInMemorySessionMapper() + session = mapper.map_to_session(finished_spans, session_id=case.session_id) + + return {"output": str(response), "trajectory": session} + +experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=[PartialCompletionEvaluator()], +) + +async def main(): + report = await experiment.run_evaluations_async(task=task_function, max_workers=10) + report.run_display() + +asyncio.run(main()) +``` + +## Evaluation Output + +The `PartialCompletionEvaluator` returns `EvaluationOutput` objects with: + +- **score**: Float between 0.0 and 1.0 +- **test_pass**: `True` if score >= 0.5, `False` otherwise +- **reason**: Step-by-step reasoning explaining which sub-steps were completed and which were not +- **label**: Score as string + +## What Gets Evaluated + +The evaluator examines: + +1. **User Request**: The original task and its implicit sub-goals +2. **Tool Call History**: Which tools were called and their results +3. **Agent Response**: What the agent ultimately communicated to the user +4. **Sub-Goal Progress**: + - How many logical sub-steps of the task were completed? + - Which steps succeeded and which failed? + - Did the agent deliver partial value to the user? + +## Best Practices + +1. **Use Multi-Step Tasks**: The evaluator is most valuable for tasks with multiple distinct sub-goals +2. **Capture Complete Sessions**: Include all tool calls and their results in the trajectory +3. **Combine with GoalSuccessRateEvaluator**: Use both to distinguish total failure from partial progress +4. **Test Graduated Failures**: Inject failures at different points in the task to measure degradation curves +5. 
**Provide Clear Task Descriptions**: Multi-step tasks with distinct phases produce the most informative scores + +## Common Patterns + +### Pattern 1: Multi-Step Task Assessment +Evaluate how much of a search-book-confirm workflow was completed. + +### Pattern 2: Degradation Curve +Sweep failure intensity to map when partial completion drops off. + +### Pattern 3: Comparison with Binary Evaluation +Use alongside `GoalSuccessRateEvaluator` to see how much value was still delivered when the binary evaluator scores 0. + +## Example Scenarios + +### Scenario 1: Full Completion +``` +User: "Find a flight to Paris, book it, and send me a confirmation." +Agent: [searches flights, books cheapest, sends confirmation email] +Evaluation: 1.0 - All three sub-goals completed +``` + +### Scenario 2: Partial Completion (Booking Fails) +``` +User: "Find a flight to Paris, book it, and send me a confirmation." +Agent: [searches flights successfully, booking fails with network error] +Final: "I found several flights to Paris but wasn't able to complete the booking." +Evaluation: 0.4 - Search completed, booking and confirmation blocked +``` + +### Scenario 3: No Completion +``` +User: "Find a flight to Paris, book it, and send me a confirmation." +Agent: [search times out immediately] +Final: "I'm unable to search for flights right now." +Evaluation: 0.0 - No sub-goals completed +``` + +### Scenario 4: Most Steps Completed +``` +User: "Find a flight to Paris, book it, and send me a confirmation." +Agent: [searches flights, books successfully, confirmation email fails] +Final: "Your flight is booked! I couldn't send the confirmation email, but your booking ID is ABC123." +Evaluation: 0.8 - Search and booking completed, only confirmation failed +``` + +## Common Issues and Solutions + +### Issue 1: Score is Always 1.0 or 0.0 +**Problem**: Evaluator doesn't produce intermediate scores. +**Solution**: Ensure test cases involve multi-step tasks. 
Single-step tasks will produce binary results. + +### Issue 2: No Trajectory Data +**Problem**: Evaluator returns empty results. +**Solution**: Ensure telemetry captures full session including tool call spans and results. + +### Issue 3: Sub-Goal Decomposition Seems Wrong +**Problem**: Evaluator decomposes the task differently than expected. +**Solution**: Use clearer, more explicit task descriptions in the case input. + +## Differences from Other Evaluators + +- **vs. GoalSuccessRateEvaluator**: Goal success is binary (1.0 or 0.0); partial completion is continuous, giving credit for steps completed even when the full goal fails. Use both to separate "total failure" from "almost made it." +- **vs. RecoveryStrategyEvaluator**: Partial completion scores the *outcome* (how much got done); recovery scores the *process* (how the agent handled failures). High partial completion with low recovery means the remaining tools worked without the agent needing to adapt. +- **vs. HelpfulnessEvaluator**: Helpfulness evaluates turn-level response quality; partial completion measures session-level task progress as a fraction of sub-goals completed. +- **vs. TrajectoryEvaluator**: Trajectory evaluates the overall action sequence for workflow quality; partial completion quantifies fractional task progress as a continuous 0.0 to 1.0 score. + +## Use Cases + +### Use Case 1: Chaos Testing +Measure how much of a task completes when tools are deliberately failed. + +### Use Case 2: Service Degradation +Quantify user impact during partial service outages. + +### Use Case 3: Agent Comparison +Compare how much value different agent configurations deliver under the same failure conditions. + +### Use Case 4: Regression Testing +Detect regressions where agents complete fewer sub-steps than before. 
+ +## Related Evaluators + +- [**GoalSuccessRateEvaluator**](goal_success_rate_evaluator): Binary goal achievement assessment +- [**RecoveryStrategyEvaluator**](recovery_strategy_evaluator): Evaluates quality of recovery actions +- [**FailureCommunicationEvaluator**](failure_communication_evaluator): Evaluates how well agents communicate failures +- [**HelpfulnessEvaluator**](helpfulness_evaluator): Evaluates response helpfulness from user perspective +- [**TrajectoryEvaluator**](trajectory_evaluator): Evaluates the sequence of actions taken + +## Related Documentation + +- [Chaos Testing](/docs/user-guide/evals-sdk/chaos_testing/): Chaos testing overview and guide diff --git a/site/src/content/docs/user-guide/evals-sdk/evaluators/recovery_strategy_evaluator.mdx b/site/src/content/docs/user-guide/evals-sdk/evaluators/recovery_strategy_evaluator.mdx new file mode 100644 index 0000000000..f3f79f3f84 --- /dev/null +++ b/site/src/content/docs/user-guide/evals-sdk/evaluators/recovery_strategy_evaluator.mdx @@ -0,0 +1,265 @@ +--- +title: Recovery Strategy Evaluator +tags: [error-handling, tool-evaluation, simulation] +sidebar: + label: "Recovery Strategy" +--- + +## Overview + +The `RecoveryStrategyEvaluator` scores the quality of an agent's recovery actions when tools fail. It evaluates whether the agent attempts alternative approaches, retries appropriately, and varies its strategies rather than repeating the same failed action. A complete example can be found [here](https://github.com/strands-agents/docs/blob/main/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py). 
+ +## Key Features + +- **Session-Level Evaluation**: Evaluates the full conversation trace including tool call patterns, retries, and alternative approaches +- **Five-Level Scoring**: Granular scale from "No Recovery" to "Excellent Recovery" +- **Multi-Dimensional Assessment**: Evaluates exploration breadth, retry discipline, and approach variation +- **Structured Reasoning**: Provides step-by-step reasoning for each evaluation +- **Async Support**: Supports both synchronous and asynchronous evaluation + +## When to Use + +Use the `RecoveryStrategyEvaluator` when you need to: + +- Assess whether agents attempt alternative approaches when tools fail +- Evaluate retry behavior (appropriate retries vs. infinite loops) +- Detect agents that give up immediately on first failure +- Measure quality and variety of recovery strategies +- Compare recovery sophistication across agent configurations + +## Evaluation Level + +This evaluator operates at the **SESSION_LEVEL**, evaluating the full conversation trace including tool call patterns, retries, and alternative approaches. + +## Parameters + +### `model` (optional) +- **Type**: `Union[Model, str, None]` +- **Default**: `None` (uses default Bedrock model) +- **Description**: The model to use as the judge. + +## Scoring System + +| Rating | Score | Description | +|--------|-------|-------------| +| No Recovery | 0.0 | Agent gives up immediately or crashes on first failure | +| Poor Recovery | 0.25 | Agent retries the same failed action with no variation | +| Neutral | 0.5 | Minimal recovery, or no failures occurred to recover from | +| Good Recovery | 0.75 | Agent retries with variation or tries alternative tools | +| Excellent Recovery | 1.0 | Agent demonstrates sophisticated recovery: retries, fallbacks, escalation, and adaptation | + +A response passes the evaluation if the score is >= 0.5. 
+ +When no tool failures occur during the session, the evaluator produces a neutral score of 0.5, since there are no failures to assess recovery behavior against. + +## Basic Usage + +```python +import asyncio +from typing import Any + +from pydantic import BaseModel, Field + +from strands import Agent +from strands_evals import StrandsEvalsTelemetry +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout +from strands_evals.chaos.effects import ExecutionError +from strands_evals.evaluators.chaos import RecoveryStrategyEvaluator +from strands_evals.mappers import StrandsInMemorySessionMapper +from strands_evals.simulation import ToolSimulator + +telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() +memory_exporter = telemetry.in_memory_exporter + +tool_simulator = ToolSimulator() + +class FlightSearchResponse(BaseModel): + flights: list[dict[str, Any]] = Field(default_factory=list) + status: str = Field(default="success") + +class HotelSearchResponse(BaseModel): + hotels: list[dict[str, Any]] = Field(default_factory=list) + status: str = Field(default="success") + +@tool_simulator.tool(output_schema=FlightSearchResponse) +def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]: + """Search for available flights between two cities on a given date.""" + pass + +@tool_simulator.tool(output_schema=HotelSearchResponse) +def search_hotels(city: str, check_in: str, check_out: str) -> dict[str, Any]: + """Search for available hotels in a city for given dates.""" + pass + +chaos_plugin = ChaosPlugin() + +# Flight search times out but hotel search works: agent should pivot +chaos_cases = [ + ChaosCase( + name="flight_timeout_hotel_available", + input="Plan my trip to Tokyo: find flights from SFO and hotels for May 20-23.", + effects={"tool_effects": {"search_flights": [Timeout()]}}, + ), + ChaosCase( + name="flight_and_booking_fail", + input="Find a flight from NYC to London on June 1.", + effects={"tool_effects": 
{"search_flights": [ExecutionError(error_message="Internal server error")]}}, + ), +] + +_flights_tool = tool_simulator.get_tool("search_flights") +_hotels_tool = tool_simulator.get_tool("search_hotels") + +def task_function(case: ChaosCase) -> dict: + agent = Agent( + system_prompt=( + "You are a travel planning assistant. If a tool fails, " + "try alternative tools that can partially fulfill the request. " + "Do NOT retry the same failed tool more than once." + ), + tools=[_flights_tool, _hotels_tool], + plugins=[chaos_plugin], + callback_handler=None, + trace_attributes={"session.id": case.session_id}, + ) + + memory_exporter.clear() + response = agent(case.input) + + finished_spans = memory_exporter.get_finished_spans() + mapper = StrandsInMemorySessionMapper() + session = mapper.map_to_session(finished_spans, session_id=case.session_id) + + return {"output": str(response), "trajectory": session} + +experiment = ChaosExperiment( + cases=chaos_cases, + evaluators=[RecoveryStrategyEvaluator()], +) + +async def main(): + report = await experiment.run_evaluations_async(task=task_function, max_workers=10) + report.run_display() + +asyncio.run(main()) +``` + +## Evaluation Output + +The `RecoveryStrategyEvaluator` returns `EvaluationOutput` objects with: + +- **score**: Float (0.0, 0.25, 0.5, 0.75, or 1.0) +- **test_pass**: `True` if score >= 0.5, `False` otherwise +- **reason**: Step-by-step reasoning explaining the evaluation +- **label**: One of the categorical labels (e.g., "Good Recovery", "Excellent Recovery") + +## What Gets Evaluated + +The evaluator examines: + +1. **Tool Call Patterns**: Sequence of tool calls and their results +2. **Retry Behavior**: Whether the agent retried failed tools and how many times +3. **Recovery Quality**: + - **Exploration breadth**: Did the agent try alternative tools or approaches? + - **Retry discipline**: Did it retry appropriately (not excessively)? 
+ - **Approach variation**: Did retries use different strategies (different parameters, different tools)? + +## Best Practices + +1. **Provide Alternative Tools**: Give agents access to multiple tools that can partially fulfill the same goal +2. **Add Recovery Instructions**: System prompts with explicit recovery guidance help agents score higher +3. **Capture Complete Sessions**: Include all tool call attempts and retries in the trajectory +4. **Combine with Other Evaluators**: Use alongside `FailureCommunicationEvaluator` and `PartialCompletionEvaluator` +5. **Test Various Failure Severities**: Include single-tool failures and multi-tool failures + +## Common Patterns + +### Pattern 1: Fallback to Alternative Tools +Evaluate if the agent pivots to a different tool when the primary one fails. + +### Pattern 2: Retry with Variation +Assess if the agent retries with different parameters instead of repeating the same call. + +### Pattern 3: Graceful Escalation +Measure if the agent escalates to the user when all automated recovery options are exhausted. + +## Example Scenarios + +### Scenario 1: Excellent Recovery +``` +Tool: search_flights -> Timeout +Agent: [retries search_flights with broader date range -> still fails] +Agent: [calls search_hotels for the destination instead] +Final: "I couldn't find flight info, but I found hotels in Tokyo for your dates." +Evaluation: Excellent Recovery (1.0) - Tried variation, then pivoted to alternative +``` + +### Scenario 2: Good Recovery +``` +Tool: search_flights -> NetworkError +Agent: [retries search_flights once -> still fails] +Final: "Flight search is unavailable. Please try again later." +Evaluation: Good Recovery (0.75) - Retried once, then communicated clearly +``` + +### Scenario 3: Poor Recovery +``` +Tool: search_flights -> Timeout +Agent: [retries search_flights 5 times with identical parameters] +Final: "I'm having trouble finding flights." 
+Evaluation: Poor Recovery (0.25) - Excessive retries with no variation +``` + +### Scenario 4: No Recovery +``` +Tool: search_flights -> ExecutionError +Agent: "I can't help with that." +Evaluation: No Recovery (0.0) - Gave up immediately without any attempt +``` + +## Common Issues and Solutions + +### Issue 1: Score is Always 0.5 +**Problem**: Evaluator always returns neutral score. +**Solution**: Ensure tool failures are present in the trace. If no tools fail, the evaluator returns 0.5 by design. + +### Issue 2: Agent Retries Excessively +**Problem**: Agent retries the same tool many times, getting a low recovery score. +**Solution**: Add retry limits to the system prompt (e.g., "Do NOT retry more than once"). + +### Issue 3: No Trajectory Data +**Problem**: Evaluator returns empty results. +**Solution**: Ensure telemetry captures full session including all tool call spans. + +## Differences from Other Evaluators + +- **vs. FailureCommunicationEvaluator**: Recovery scores the agent's *actions* (retries, fallbacks, tool switching); communication scores the agent's *words* (how it explains failures). Both can be high, both can be low, or one without the other. +- **vs. PartialCompletionEvaluator**: Recovery scores the quality of recovery *attempts* regardless of outcome; partial completion scores the *result* regardless of how the agent got there. Excellent recovery may still yield low completion if all alternatives also fail. +- **vs. TrajectoryEvaluator**: Trajectory evaluates the full action sequence holistically for workflow adherence; recovery specifically targets the quality of failure-response actions within that sequence. +- **vs. ToolSelectionEvaluator**: Tool selection checks if correct tools were chosen under normal conditions; recovery evaluates whether the agent adapted its tool choices appropriately when failures occurred. + +## Use Cases + +### Use Case 1: Chaos Testing +Evaluate agent recovery strategies under deliberately injected tool failures. 
+ +### Use Case 2: Agent Configuration Comparison +Compare how different system prompts affect recovery behavior. + +### Use Case 3: Retry Policy Validation +Verify agents follow expected retry policies (retry once, then fallback). + +### Use Case 4: Multi-Tool Resilience +Test whether agents leverage alternative tools when primary ones fail. + +## Related Evaluators + +- [**FailureCommunicationEvaluator**](failure_communication_evaluator): Evaluates how well agents communicate failures +- [**PartialCompletionEvaluator**](partial_completion_evaluator): Measures what fraction of goals were achieved +- [**TrajectoryEvaluator**](trajectory_evaluator): Evaluates the sequence of actions taken +- [**ToolSelectionEvaluator**](tool_selection_evaluator): Evaluates whether correct tools were selected +- [**GoalSuccessRateEvaluator**](goal_success_rate_evaluator): Binary goal achievement assessment + +## Related Documentation + +- [Chaos Testing](/docs/user-guide/evals-sdk/chaos_testing/): Chaos testing overview and guide From eba2989de468b10de0c13d5869b41e2b310e681a Mon Sep 17 00:00:00 2001 From: Darren Wang Date: Thu, 11 Jun 2026 16:42:08 +0000 Subject: [PATCH 4/6] fix import pattern and link display --- .../evals-sdk/chaos_failure_communication_evaluator.py | 3 +-- .../evals-sdk/chaos_partial_completion_evaluator.py | 3 +-- .../evals-sdk/chaos_recovery_strategy_evaluator.py | 3 +-- site/docs/examples/evals-sdk/chaos_testing.py | 2 +- .../content/docs/user-guide/evals-sdk/chaos_testing.mdx | 8 ++++---- .../evaluators/failure_communication_evaluator.mdx | 4 ++-- .../evals-sdk/evaluators/partial_completion_evaluator.mdx | 4 ++-- .../evals-sdk/evaluators/recovery_strategy_evaluator.mdx | 4 ++-- 8 files changed, 14 insertions(+), 17 deletions(-) diff --git a/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py b/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py index 7a4d8acb39..8f384ace8a 100644 --- 
a/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py +++ b/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py @@ -6,8 +6,7 @@ from strands import Agent from strands_evals import StrandsEvalsTelemetry -from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout -from strands_evals.chaos.effects import NetworkError +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout, NetworkError from strands_evals.evaluators.chaos import FailureCommunicationEvaluator from strands_evals.mappers import StrandsInMemorySessionMapper from strands_evals.simulation import ToolSimulator diff --git a/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py b/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py index 6eccdc2cba..cc0ef08c25 100644 --- a/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py +++ b/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py @@ -6,8 +6,7 @@ from strands import Agent from strands_evals import StrandsEvalsTelemetry -from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, TruncateFields -from strands_evals.chaos.effects import NetworkError +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, NetworkError, TruncateFields from strands_evals.evaluators.chaos import PartialCompletionEvaluator from strands_evals.mappers import StrandsInMemorySessionMapper from strands_evals.simulation import ToolSimulator diff --git a/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py b/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py index 7338a53768..9789155ff7 100644 --- a/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py +++ b/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py @@ -6,8 +6,7 @@ from strands import Agent from strands_evals import StrandsEvalsTelemetry -from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, 
Timeout -from strands_evals.chaos.effects import ExecutionError +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, ExecutionError, Timeout from strands_evals.evaluators.chaos import RecoveryStrategyEvaluator from strands_evals.mappers import StrandsInMemorySessionMapper from strands_evals.simulation import ToolSimulator diff --git a/site/docs/examples/evals-sdk/chaos_testing.py b/site/docs/examples/evals-sdk/chaos_testing.py index 7e2600e12d..fd558ab9b6 100644 --- a/site/docs/examples/evals-sdk/chaos_testing.py +++ b/site/docs/examples/evals-sdk/chaos_testing.py @@ -11,12 +11,12 @@ ChaosExperiment, ChaosPlugin, CorruptValues, + ExecutionError, NetworkError, RemoveFields, Timeout, TruncateFields, ) -from strands_evals.chaos.effects import ExecutionError from strands_evals.evaluators import GoalSuccessRateEvaluator from strands_evals.mappers import StrandsInMemorySessionMapper from strands_evals.simulation import ToolSimulator diff --git a/site/src/content/docs/user-guide/evals-sdk/chaos_testing.mdx b/site/src/content/docs/user-guide/evals-sdk/chaos_testing.mdx index fa7c903bcb..62c770ef9e 100644 --- a/site/src/content/docs/user-guide/evals-sdk/chaos_testing.mdx +++ b/site/src/content/docs/user-guide/evals-sdk/chaos_testing.mdx @@ -7,7 +7,7 @@ sidebar: ## Overview -Chaos testing systematically evaluates agent resilience by injecting controlled failures into tool execution. Using `ChaosPlugin`, `ChaosCase`, and `ChaosExperiment`, you can test how agents handle tool timeouts, network errors, and corrupted responses without modifying agent code. +Chaos testing systematically evaluates agent resilience by injecting controlled failures into tool execution. Using `ChaosPlugin`, `ChaosCase`, and `ChaosExperiment`, you can test how agents handle tool timeouts, network errors, and corrupted responses without modifying agent code. 
A complete example can be found [here](https://github.com/strands-agents/harness-sdk/blob/main/site/docs/examples/evals-sdk/chaos_testing.py). This enables you to answer questions like: - Does the agent gracefully communicate failures to users? @@ -65,7 +65,7 @@ Define your tools as usual with `@tool`, then create `ChaosCase` objects specify ```python from strands import tool from strands_evals.chaos import ChaosCase, Timeout -from strands_evals.chaos.effects import NetworkError +from strands_evals.chaos import NetworkError @tool def get_weather(city: str) -> str: @@ -135,7 +135,7 @@ These effects cancel the tool call entirely and return an error: ```python from strands_evals.chaos import Timeout -from strands_evals.chaos.effects import NetworkError, ExecutionError, ValidationError +from strands_evals.chaos import NetworkError, ExecutionError, ValidationError effect_maps = { "timeout": {"tool_effects": {"my_tool": [Timeout()]}}, @@ -190,7 +190,7 @@ When you have multiple base cases and want to test across several failure scenar ```python from strands_evals import Case from strands_evals.chaos import ChaosCase, Timeout -from strands_evals.chaos.effects import NetworkError +from strands_evals.chaos import NetworkError # Define base test cases base_cases = [ diff --git a/site/src/content/docs/user-guide/evals-sdk/evaluators/failure_communication_evaluator.mdx b/site/src/content/docs/user-guide/evals-sdk/evaluators/failure_communication_evaluator.mdx index 0c46fb2ff6..537c62c9c8 100644 --- a/site/src/content/docs/user-guide/evals-sdk/evaluators/failure_communication_evaluator.mdx +++ b/site/src/content/docs/user-guide/evals-sdk/evaluators/failure_communication_evaluator.mdx @@ -7,7 +7,7 @@ sidebar: ## Overview -The `FailureCommunicationEvaluator` assesses how well an agent communicates failures to the user when tools or services fail. 
It uses an LLM-as-judge approach with a five-level scoring rubric to evaluate clarity, actionability, transparency, and tone of failure messages. A complete example can be found [here](https://github.com/strands-agents/docs/blob/main/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py). +The `FailureCommunicationEvaluator` assesses how well an agent communicates failures to the user when tools or services fail. It uses an LLM-as-judge approach with a five-level scoring rubric to evaluate clarity, actionability, transparency, and tone of failure messages. A complete example can be found [here](https://github.com/strands-agents/harness-sdk/blob/main/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py). ## Key Features @@ -63,7 +63,7 @@ from pydantic import BaseModel, Field from strands import Agent from strands_evals import StrandsEvalsTelemetry from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout -from strands_evals.chaos.effects import NetworkError +from strands_evals.chaos import NetworkError from strands_evals.evaluators.chaos import FailureCommunicationEvaluator from strands_evals.mappers import StrandsInMemorySessionMapper from strands_evals.simulation import ToolSimulator diff --git a/site/src/content/docs/user-guide/evals-sdk/evaluators/partial_completion_evaluator.mdx b/site/src/content/docs/user-guide/evals-sdk/evaluators/partial_completion_evaluator.mdx index 058c792d6d..aaaa61dbea 100644 --- a/site/src/content/docs/user-guide/evals-sdk/evaluators/partial_completion_evaluator.mdx +++ b/site/src/content/docs/user-guide/evals-sdk/evaluators/partial_completion_evaluator.mdx @@ -7,7 +7,7 @@ sidebar: ## Overview -The `PartialCompletionEvaluator` scores what fraction of the user's goal was achieved, returning a continuous 0.0 to 1.0 score. 
Unlike the binary `GoalSuccessRateEvaluator`, this evaluator captures partial progress when an agent completes some sub-steps of a multi-step task but cannot finish the rest. A complete example can be found [here](https://github.com/strands-agents/docs/blob/main/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py). +The `PartialCompletionEvaluator` scores what fraction of the user's goal was achieved, returning a continuous 0.0 to 1.0 score. Unlike the binary `GoalSuccessRateEvaluator`, this evaluator captures partial progress when an agent completes some sub-steps of a multi-step task but cannot finish the rest. A complete example can be found [here](https://github.com/strands-agents/harness-sdk/blob/main/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py). ## Key Features @@ -63,7 +63,7 @@ from pydantic import BaseModel, Field from strands import Agent from strands_evals import StrandsEvalsTelemetry from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, TruncateFields -from strands_evals.chaos.effects import NetworkError +from strands_evals.chaos import NetworkError from strands_evals.evaluators.chaos import PartialCompletionEvaluator from strands_evals.mappers import StrandsInMemorySessionMapper from strands_evals.simulation import ToolSimulator diff --git a/site/src/content/docs/user-guide/evals-sdk/evaluators/recovery_strategy_evaluator.mdx b/site/src/content/docs/user-guide/evals-sdk/evaluators/recovery_strategy_evaluator.mdx index f3f79f3f84..23e6869305 100644 --- a/site/src/content/docs/user-guide/evals-sdk/evaluators/recovery_strategy_evaluator.mdx +++ b/site/src/content/docs/user-guide/evals-sdk/evaluators/recovery_strategy_evaluator.mdx @@ -7,7 +7,7 @@ sidebar: ## Overview -The `RecoveryStrategyEvaluator` scores the quality of an agent's recovery actions when tools fail. 
It evaluates whether the agent attempts alternative approaches, retries appropriately, and varies its strategies rather than repeating the same failed action. A complete example can be found [here](https://github.com/strands-agents/docs/blob/main/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py). +The `RecoveryStrategyEvaluator` scores the quality of an agent's recovery actions when tools fail. It evaluates whether the agent attempts alternative approaches, retries appropriately, and varies its strategies rather than repeating the same failed action. A complete example can be found [here](https://github.com/strands-agents/harness-sdk/blob/main/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py). ## Key Features @@ -63,7 +63,7 @@ from pydantic import BaseModel, Field from strands import Agent from strands_evals import StrandsEvalsTelemetry from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout -from strands_evals.chaos.effects import ExecutionError +from strands_evals.chaos import ExecutionError from strands_evals.evaluators.chaos import RecoveryStrategyEvaluator from strands_evals.mappers import StrandsInMemorySessionMapper from strands_evals.simulation import ToolSimulator From c835eb37bd2a299d76422030429894e46dfa775d Mon Sep 17 00:00:00 2001 From: Darren Wang Date: Fri, 12 Jun 2026 17:42:34 +0000 Subject: [PATCH 5/6] simplify test codes with @eval_task --- site/=4.8.2 | 0 .../chaos_failure_communication_evaluator.py | 28 ++++------------- .../chaos_partial_completion_evaluator.py | 28 ++++------------- .../chaos_recovery_strategy_evaluator.py | 28 ++++------------- site/docs/examples/evals-sdk/chaos_testing.py | 30 ++++--------------- .../user-guide/evals-sdk/chaos_testing.mdx | 22 +++++++------- .../failure_communication_evaluator.mdx | 28 +++++------------ .../partial_completion_evaluator.mdx | 30 +++++-------------- .../recovery_strategy_evaluator.mdx | 30 +++++-------------- 9 files changed, 56 
insertions(+), 168 deletions(-) delete mode 100644 site/=4.8.2 diff --git a/site/=4.8.2 b/site/=4.8.2 deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py b/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py index 8f384ace8a..32ec76d1d3 100644 --- a/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py +++ b/site/docs/examples/evals-sdk/chaos_failure_communication_evaluator.py @@ -5,18 +5,14 @@ from pydantic import BaseModel, Field from strands import Agent -from strands_evals import StrandsEvalsTelemetry from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout, NetworkError +from strands_evals.eval_task_handler import TracedHandler, eval_task from strands_evals.evaluators.chaos import FailureCommunicationEvaluator -from strands_evals.mappers import StrandsInMemorySessionMapper from strands_evals.simulation import ToolSimulator logging.basicConfig(level=logging.INFO, format="%(message)s") logger = logging.getLogger(__name__) -telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() -memory_exporter = telemetry.in_memory_exporter - tool_simulator = ToolSimulator() class FlightSearchResponse(BaseModel): @@ -66,13 +62,15 @@ def book_flight(flight_id: str) -> dict[str, Any]: _search_tool = tool_simulator.get_tool("search_flights") _book_tool = tool_simulator.get_tool("book_flight") -def travel_agent_task(case: ChaosCase) -> dict: +@eval_task(TracedHandler()) +def travel_agent_task(case: ChaosCase): """Run the travel agent under chaos and return output + trajectory.""" logger.info(f"\n{'─'*60}") logger.info(f" Case: {case.name}") logger.info(f" User: {case.input}") + logger.info(f"{'─'*60}") - agent = Agent( + return Agent( system_prompt=( "You are a travel booking assistant. Use the available tools to complete " "the user's request. 
Today's date is May 18, 2025.\n\n" @@ -88,22 +86,6 @@ def travel_agent_task(case: ChaosCase) -> dict: trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id}, ) - memory_exporter.clear() - try: - result = agent(case.input) - output = str(result) - except Exception as e: - output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}" - - logger.info(f" Agent: {output[:300]}{'...' if len(output) > 300 else ''}") - logger.info(f"{'─'*60}") - - finished_spans = memory_exporter.get_finished_spans() - mapper = StrandsInMemorySessionMapper() - session = mapper.map_to_session(finished_spans, session_id=case.session_id) - - return {"output": output, "trajectory": session} - experiment = ChaosExperiment( cases=chaos_cases, evaluators=[FailureCommunicationEvaluator()], diff --git a/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py b/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py index cc0ef08c25..724e26790f 100644 --- a/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py +++ b/site/docs/examples/evals-sdk/chaos_partial_completion_evaluator.py @@ -5,18 +5,14 @@ from pydantic import BaseModel, Field from strands import Agent -from strands_evals import StrandsEvalsTelemetry from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, NetworkError, TruncateFields +from strands_evals.eval_task_handler import TracedHandler, eval_task from strands_evals.evaluators.chaos import PartialCompletionEvaluator -from strands_evals.mappers import StrandsInMemorySessionMapper from strands_evals.simulation import ToolSimulator logging.basicConfig(level=logging.INFO, format="%(message)s") logger = logging.getLogger(__name__) -telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() -memory_exporter = telemetry.in_memory_exporter - tool_simulator = ToolSimulator() class FlightSearchResponse(BaseModel): @@ -81,13 +77,15 @@ def send_booking_confirmation(booking_id: str = "", 
flight_id: str = "", method: _book_tool = tool_simulator.get_tool("book_flight") _confirm_tool = tool_simulator.get_tool("send_booking_confirmation") -def travel_agent_task(case: ChaosCase) -> dict: +@eval_task(TracedHandler()) +def travel_agent_task(case: ChaosCase): """Run the travel agent under chaos and return output + trajectory.""" logger.info(f"\n{'─'*60}") logger.info(f" Case: {case.name}") logger.info(f" User: {case.input}") + logger.info(f"{'─'*60}") - agent = Agent( + return Agent( system_prompt=( "You are a travel booking assistant. Use the available tools to complete " "the user's request. Today's date is May 18, 2025.\n\n" @@ -103,22 +101,6 @@ def travel_agent_task(case: ChaosCase) -> dict: trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id}, ) - memory_exporter.clear() - try: - result = agent(case.input) - output = str(result) - except Exception as e: - output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}" - - logger.info(f" Agent: {output[:300]}{'...' 
if len(output) > 300 else ''}") - logger.info(f"{'─'*60}") - - finished_spans = memory_exporter.get_finished_spans() - mapper = StrandsInMemorySessionMapper() - session = mapper.map_to_session(finished_spans, session_id=case.session_id) - - return {"output": output, "trajectory": session} - experiment = ChaosExperiment( cases=chaos_cases, evaluators=[PartialCompletionEvaluator()], diff --git a/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py b/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py index 9789155ff7..072a5bee6e 100644 --- a/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py +++ b/site/docs/examples/evals-sdk/chaos_recovery_strategy_evaluator.py @@ -5,18 +5,14 @@ from pydantic import BaseModel, Field from strands import Agent -from strands_evals import StrandsEvalsTelemetry from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, ExecutionError, Timeout +from strands_evals.eval_task_handler import TracedHandler, eval_task from strands_evals.evaluators.chaos import RecoveryStrategyEvaluator -from strands_evals.mappers import StrandsInMemorySessionMapper from strands_evals.simulation import ToolSimulator logging.basicConfig(level=logging.INFO, format="%(message)s") logger = logging.getLogger(__name__) -telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() -memory_exporter = telemetry.in_memory_exporter - tool_simulator = ToolSimulator() class FlightSearchResponse(BaseModel): @@ -77,13 +73,15 @@ def book_flight(flight_id: str) -> dict[str, Any]: _search_hotels_tool = tool_simulator.get_tool("search_hotels") _book_tool = tool_simulator.get_tool("book_flight") -def travel_agent_task(case: ChaosCase) -> dict: +@eval_task(TracedHandler()) +def travel_agent_task(case: ChaosCase): """Run the travel agent under chaos and return output + trajectory.""" logger.info(f"\n{'─'*60}") logger.info(f" Case: {case.name}") logger.info(f" User: {case.input}") + logger.info(f"{'─'*60}") - agent = Agent( + 
return Agent( system_prompt=( "You are a travel planning assistant. Use the available tools to complete " "the user's request. Today's date is May 18, 2025.\n\n" @@ -99,22 +97,6 @@ def travel_agent_task(case: ChaosCase) -> dict: trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id}, ) - memory_exporter.clear() - try: - result = agent(case.input) - output = str(result) - except Exception as e: - output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}" - - logger.info(f" Agent: {output[:300]}{'...' if len(output) > 300 else ''}") - logger.info(f"{'─'*60}") - - finished_spans = memory_exporter.get_finished_spans() - mapper = StrandsInMemorySessionMapper() - session = mapper.map_to_session(finished_spans, session_id=case.session_id) - - return {"output": output, "trajectory": session} - experiment = ChaosExperiment( cases=chaos_cases, evaluators=[RecoveryStrategyEvaluator()], diff --git a/site/docs/examples/evals-sdk/chaos_testing.py b/site/docs/examples/evals-sdk/chaos_testing.py index fd558ab9b6..6675622190 100644 --- a/site/docs/examples/evals-sdk/chaos_testing.py +++ b/site/docs/examples/evals-sdk/chaos_testing.py @@ -5,7 +5,7 @@ from pydantic import BaseModel, Field from strands import Agent -from strands_evals import Case, StrandsEvalsTelemetry +from strands_evals import Case from strands_evals.chaos import ( ChaosCase, ChaosExperiment, @@ -17,17 +17,13 @@ Timeout, TruncateFields, ) +from strands_evals.eval_task_handler import TracedHandler, eval_task from strands_evals.evaluators import GoalSuccessRateEvaluator -from strands_evals.mappers import StrandsInMemorySessionMapper from strands_evals.simulation import ToolSimulator logging.basicConfig(level=logging.INFO, format="%(message)s") logger = logging.getLogger(__name__) -# Setup telemetry -telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() -memory_exporter = telemetry.in_memory_exporter - # 1. 
Set up ToolSimulator and register tools tool_simulator = ToolSimulator() @@ -100,13 +96,15 @@ def send_booking_confirmation(booking_id: str = "", flight_id: str = "", method: _book_tool = tool_simulator.get_tool("book_flight") _confirm_tool = tool_simulator.get_tool("send_booking_confirmation") -def travel_agent_task(case: ChaosCase) -> dict: +@eval_task(TracedHandler()) +def travel_agent_task(case: ChaosCase): """Run the travel agent with a single user query.""" logger.info(f"\n{'─'*60}") logger.info(f" Case: {case.name}") logger.info(f" User: {case.input}") + logger.info(f"{'─'*60}") - agent = Agent( + return Agent( system_prompt=( "You are a travel booking assistant. You help users search for flights, " "book them, and send confirmations. Use the available tools to complete " @@ -128,22 +126,6 @@ def travel_agent_task(case: ChaosCase) -> dict: trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id}, ) - memory_exporter.clear() - try: - result = agent(case.input) - output = str(result) - except Exception as e: - output = f"Agent failed with error: {type(e).__name__}: {str(e)[:200]}" - - logger.info(f" Agent: {output[:300]}{'...' if len(output) > 300 else ''}") - logger.info(f"{'─'*60}") - - finished_spans = memory_exporter.get_finished_spans() - mapper = StrandsInMemorySessionMapper() - session = mapper.map_to_session(finished_spans, session_id=case.session_id) - - return {"output": output, "trajectory": session} - # 5. Define test cases and expand with effect maps test_cases = [ Case( diff --git a/site/src/content/docs/user-guide/evals-sdk/chaos_testing.mdx b/site/src/content/docs/user-guide/evals-sdk/chaos_testing.mdx index 62c770ef9e..6d8b09bb13 100644 --- a/site/src/content/docs/user-guide/evals-sdk/chaos_testing.mdx +++ b/site/src/content/docs/user-guide/evals-sdk/chaos_testing.mdx @@ -93,17 +93,18 @@ Add `ChaosPlugin()` to the agent's plugins list. 
No other code changes are neede ```python from strands import Agent from strands_evals.chaos import ChaosPlugin +from strands_evals.eval_task_handler import TracedHandler, eval_task chaos_plugin = ChaosPlugin() -def task_function(case: ChaosCase) -> dict: - agent = Agent( +@eval_task(TracedHandler()) +def task_function(case: ChaosCase): + return Agent( system_prompt="You are a helpful weather assistant.", plugins=[chaos_plugin], - callback_handler=None + callback_handler=None, + trace_attributes={"session.id": case.session_id}, ) - response = agent(case.input) - return {"output": str(response)} ``` ### Run chaos experiment @@ -221,6 +222,7 @@ Chaos testing works naturally with `ToolSimulator` for fully controlled evaluati ```python from strands import Agent from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout, CorruptValues +from strands_evals.eval_task_handler import TracedHandler, eval_task from strands_evals.evaluators import GoalSuccessRateEvaluator from strands_evals.simulation import ToolSimulator from pydantic import BaseModel, Field @@ -252,14 +254,14 @@ chaos_cases = [ chaos_plugin = ChaosPlugin() _search_tool = tool_simulator.get_tool("web_search") -def task_function(case: ChaosCase) -> dict: - agent = Agent( +@eval_task(TracedHandler()) +def task_function(case: ChaosCase): + return Agent( tools=[_search_tool], plugins=[chaos_plugin], - callback_handler=None + callback_handler=None, + trace_attributes={"session.id": case.session_id}, ) - response = agent(case.input) - return {"output": str(response)} experiment = ChaosExperiment( cases=chaos_cases, diff --git a/site/src/content/docs/user-guide/evals-sdk/evaluators/failure_communication_evaluator.mdx b/site/src/content/docs/user-guide/evals-sdk/evaluators/failure_communication_evaluator.mdx index 537c62c9c8..eefb8e4861 100644 --- a/site/src/content/docs/user-guide/evals-sdk/evaluators/failure_communication_evaluator.mdx +++ 
b/site/src/content/docs/user-guide/evals-sdk/evaluators/failure_communication_evaluator.mdx @@ -34,7 +34,7 @@ This evaluator operates at the **SESSION_LEVEL**, evaluating the full conversati ## Parameters ### `model` (optional) -- **Type**: `Union[Model, str, None]` +- **Type**: `Model | str | None` - **Default**: `None` (uses default Bedrock model) - **Description**: The model to use as the judge. @@ -61,16 +61,11 @@ from typing import Any from pydantic import BaseModel, Field from strands import Agent -from strands_evals import StrandsEvalsTelemetry -from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout -from strands_evals.chaos import NetworkError +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout, NetworkError from strands_evals.evaluators.chaos import FailureCommunicationEvaluator -from strands_evals.mappers import StrandsInMemorySessionMapper +from strands_evals.eval_task_handler import TracedHandler, eval_task from strands_evals.simulation import ToolSimulator -telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() -memory_exporter = telemetry.in_memory_exporter - tool_simulator = ToolSimulator() class FlightSearchResponse(BaseModel): @@ -83,6 +78,7 @@ def search_flights(origin: str, destination: str, date: str) -> dict[str, Any]: pass chaos_plugin = ChaosPlugin() +_search_tool = tool_simulator.get_tool("search_flights") chaos_cases = [ ChaosCase( @@ -97,10 +93,9 @@ chaos_cases = [ ), ] -_search_tool = tool_simulator.get_tool("search_flights") - -def task_function(case: ChaosCase) -> dict: - agent = Agent( +@eval_task(TracedHandler()) +def task_function(case: ChaosCase): + return Agent( system_prompt="You are a travel booking assistant.", tools=[_search_tool], plugins=[chaos_plugin], @@ -108,15 +103,6 @@ def task_function(case: ChaosCase) -> dict: trace_attributes={"session.id": case.session_id}, ) - memory_exporter.clear() - response = agent(case.input) - - finished_spans = 
memory_exporter.get_finished_spans() - mapper = StrandsInMemorySessionMapper() - session = mapper.map_to_session(finished_spans, session_id=case.session_id) - - return {"output": str(response), "trajectory": session} - experiment = ChaosExperiment( cases=chaos_cases, evaluators=[FailureCommunicationEvaluator()], diff --git a/site/src/content/docs/user-guide/evals-sdk/evaluators/partial_completion_evaluator.mdx b/site/src/content/docs/user-guide/evals-sdk/evaluators/partial_completion_evaluator.mdx index aaaa61dbea..9133d8a5e9 100644 --- a/site/src/content/docs/user-guide/evals-sdk/evaluators/partial_completion_evaluator.mdx +++ b/site/src/content/docs/user-guide/evals-sdk/evaluators/partial_completion_evaluator.mdx @@ -34,7 +34,7 @@ This evaluator operates at the **SESSION_LEVEL**, evaluating the full conversati ## Parameters ### `model` (optional) -- **Type**: `Union[Model, str, None]` +- **Type**: `Model | str | None` - **Default**: `None` (uses default Bedrock model) - **Description**: The model to use as the judge. 
@@ -61,16 +61,11 @@ from typing import Any from pydantic import BaseModel, Field from strands import Agent -from strands_evals import StrandsEvalsTelemetry -from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, TruncateFields -from strands_evals.chaos import NetworkError +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, NetworkError, TruncateFields from strands_evals.evaluators.chaos import PartialCompletionEvaluator -from strands_evals.mappers import StrandsInMemorySessionMapper +from strands_evals.eval_task_handler import TracedHandler, eval_task from strands_evals.simulation import ToolSimulator -telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() -memory_exporter = telemetry.in_memory_exporter - tool_simulator = ToolSimulator() class FlightSearchResponse(BaseModel): @@ -92,6 +87,8 @@ def book_flight(flight_id: str) -> dict[str, Any]: pass chaos_plugin = ChaosPlugin() +_search_tool = tool_simulator.get_tool("search_flights") +_book_tool = tool_simulator.get_tool("book_flight") # Search works (degraded) but booking fails: partial completion expected chaos_cases = [ @@ -107,11 +104,9 @@ chaos_cases = [ ), ] -_search_tool = tool_simulator.get_tool("search_flights") -_book_tool = tool_simulator.get_tool("book_flight") - -def task_function(case: ChaosCase) -> dict: - agent = Agent( +@eval_task(TracedHandler()) +def task_function(case: ChaosCase): + return Agent( system_prompt="You are a travel booking assistant.", tools=[_search_tool, _book_tool], plugins=[chaos_plugin], @@ -119,15 +114,6 @@ def task_function(case: ChaosCase) -> dict: trace_attributes={"session.id": case.session_id}, ) - memory_exporter.clear() - response = agent(case.input) - - finished_spans = memory_exporter.get_finished_spans() - mapper = StrandsInMemorySessionMapper() - session = mapper.map_to_session(finished_spans, session_id=case.session_id) - - return {"output": str(response), "trajectory": session} - experiment = ChaosExperiment( 
cases=chaos_cases, evaluators=[PartialCompletionEvaluator()], diff --git a/site/src/content/docs/user-guide/evals-sdk/evaluators/recovery_strategy_evaluator.mdx b/site/src/content/docs/user-guide/evals-sdk/evaluators/recovery_strategy_evaluator.mdx index 23e6869305..db2268c5cf 100644 --- a/site/src/content/docs/user-guide/evals-sdk/evaluators/recovery_strategy_evaluator.mdx +++ b/site/src/content/docs/user-guide/evals-sdk/evaluators/recovery_strategy_evaluator.mdx @@ -34,7 +34,7 @@ This evaluator operates at the **SESSION_LEVEL**, evaluating the full conversati ## Parameters ### `model` (optional) -- **Type**: `Union[Model, str, None]` +- **Type**: `Model | str | None` - **Default**: `None` (uses default Bedrock model) - **Description**: The model to use as the judge. @@ -61,16 +61,11 @@ from typing import Any from pydantic import BaseModel, Field from strands import Agent -from strands_evals import StrandsEvalsTelemetry -from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, Timeout -from strands_evals.chaos import ExecutionError +from strands_evals.chaos import ChaosCase, ChaosExperiment, ChaosPlugin, ExecutionError, Timeout from strands_evals.evaluators.chaos import RecoveryStrategyEvaluator -from strands_evals.mappers import StrandsInMemorySessionMapper +from strands_evals.eval_task_handler import TracedHandler, eval_task from strands_evals.simulation import ToolSimulator -telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter() -memory_exporter = telemetry.in_memory_exporter - tool_simulator = ToolSimulator() class FlightSearchResponse(BaseModel): @@ -92,6 +87,8 @@ def search_hotels(city: str, check_in: str, check_out: str) -> dict[str, Any]: pass chaos_plugin = ChaosPlugin() +_flights_tool = tool_simulator.get_tool("search_flights") +_hotels_tool = tool_simulator.get_tool("search_hotels") # Flight search times out but hotel search works: agent should pivot chaos_cases = [ @@ -107,11 +104,9 @@ chaos_cases = [ ), ] -_flights_tool = 
tool_simulator.get_tool("search_flights") -_hotels_tool = tool_simulator.get_tool("search_hotels") - -def task_function(case: ChaosCase) -> dict: - agent = Agent( +@eval_task(TracedHandler()) +def task_function(case: ChaosCase): + return Agent( system_prompt=( "You are a travel planning assistant. If a tool fails, " "try alternative tools that can partially fulfill the request. " @@ -123,15 +118,6 @@ def task_function(case: ChaosCase) -> dict: trace_attributes={"session.id": case.session_id}, ) - memory_exporter.clear() - response = agent(case.input) - - finished_spans = memory_exporter.get_finished_spans() - mapper = StrandsInMemorySessionMapper() - session = mapper.map_to_session(finished_spans, session_id=case.session_id) - - return {"output": str(response), "trajectory": session} - experiment = ChaosExperiment( cases=chaos_cases, evaluators=[RecoveryStrategyEvaluator()], From c906fdaa51a828c9beb2cf003074d118d98335b2 Mon Sep 17 00:00:00 2001 From: Darren Wang Date: Fri, 12 Jun 2026 18:17:33 +0000 Subject: [PATCH 6/6] polish chaos testing doc and snippets --- .../user-guide/evals-sdk/chaos_testing.mdx | 33 +++++-------------- 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/site/src/content/docs/user-guide/evals-sdk/chaos_testing.mdx b/site/src/content/docs/user-guide/evals-sdk/chaos_testing.mdx index 6d8b09bb13..eb076ae93c 100644 --- a/site/src/content/docs/user-guide/evals-sdk/chaos_testing.mdx +++ b/site/src/content/docs/user-guide/evals-sdk/chaos_testing.mdx @@ -101,6 +101,7 @@ chaos_plugin = ChaosPlugin() def task_function(case: ChaosCase): return Agent( system_prompt="You are a helpful weather assistant.", + tools=[get_weather], plugins=[chaos_plugin], callback_handler=None, trace_attributes={"session.id": case.session_id}, @@ -171,6 +172,8 @@ effect_maps = { Target multiple tools in a single case to simulate cascading failures: ```python +from strands_evals.chaos import ChaosCase, Timeout, NetworkError, CorruptValues + chaos_case = ChaosCase( 
name="total_chaos", input="Book me a flight to Paris", @@ -292,24 +295,6 @@ Chaos testing ships with three specialized evaluators designed to assess agent b | [PartialCompletionEvaluator](/docs/user-guide/evals-sdk/evaluators/partial_completion_evaluator/) | Fraction of user goal achieved despite failures | Continuous (0.0 to 1.0) | ~1.0 when task completes fully | | [RecoveryStrategyEvaluator](/docs/user-guide/evals-sdk/evaluators/recovery_strategy_evaluator/) | Quality of recovery actions: exploration breadth, retry discipline, approach variation | Five-level (0.0, 0.25, 0.5, 0.75, 1.0) | 0.5 when no failures occur | -```python -from strands_evals.evaluators.chaos import ( - FailureCommunicationEvaluator, - PartialCompletionEvaluator, - RecoveryStrategyEvaluator, -) - -evaluators = [ - PartialCompletionEvaluator(), # How much was completed? - FailureCommunicationEvaluator(), # Did the agent tell the user? - RecoveryStrategyEvaluator(), # Did the agent try alternatives? -] - -experiment = ChaosExperiment(cases=chaos_cases, evaluators=evaluators) -report = experiment.run_evaluations(task=task_function) -report.run_display() -``` - ### Interpreting Results When reviewing evaluation outputs, look at evaluator scores together to identify patterns in your agent's failure-handling behavior: @@ -379,8 +364,6 @@ experiment = ChaosExperiment( cases=sweep_cases, evaluators=[GoalSuccessRateEvaluator(), PartialCompletionEvaluator()] ) -report = experiment.run_evaluations(task=task_function) -report.run_display() # Analyze: at what ratio does goal success drop below 0.5? 
# Gradual degradation = resilient agent; cliff-edge = fragile agent @@ -391,7 +374,9 @@ report.run_display() Combine chaos testing with user simulation for multi-turn resilience evaluation: ```python +from strands import Agent from strands_evals import ActorSimulator +from strands_evals.chaos import ChaosCase, ChaosPlugin def task_function(case: ChaosCase) -> dict: user_sim = ActorSimulator.from_case_for_user_simulator( @@ -423,19 +408,19 @@ Always include a no-effect baseline to compare agent performance with and withou chaos_cases = ChaosCase.expand(cases, effect_maps, include_no_effect_baseline=True) ``` -### 2. Test One Failure at a Time First +### 2. Gradually Increase Chaos Severity -Start with your most critical tool against all effect types before expanding to multi-tool scenarios. Single-tool tests reveal individual tool robustness; compound tests reveal pipeline fragility: +Start with single-tool failures to understand how your agent handles each failure point in isolation. Once you understand the baseline behavior, move to compound failures (multiple tools failing simultaneously) and then to advanced patterns like degradation sweeps. When a compound test fails, single-tool results tell you which tool failure is responsible: ```python -# Single failure first +# Start simple: one tool, one effect single_case = ChaosCase( name="search_fails", input="Find flights to Paris", effects={"tool_effects": {"search": [Timeout()]}}, ) -# Compound (test after single failures are understood) +# Then escalate: multiple tools failing together compound_case = ChaosCase( name="total_chaos", input="Find flights to Paris",