diff --git a/src/google/adk/evaluation/eval_case.py b/src/google/adk/evaluation/eval_case.py
index 4902b3561..1885e0eb3 100644
--- a/src/google/adk/evaluation/eval_case.py
+++ b/src/google/adk/evaluation/eval_case.py
@@ -31,12 +31,25 @@ class EvalBaseModel(BaseModel):
   )
 
 
+class ToolUseWithResponse(EvalBaseModel):
+  """Represents a tool use with its response."""
+
+  function_call: genai_types.FunctionCall
+  """The function call."""
+
+  function_response: Optional[genai_types.FunctionResponse] = None
+  """The function response."""
+
+
 class IntermediateData(EvalBaseModel):
   """Container for intermediate data that an agent would generate as it responds with a final answer."""
 
   tool_uses: list[genai_types.FunctionCall] = []
   """Tool use trajectory in chronological order."""
 
+  tool_uses_with_responses: list[ToolUseWithResponse] = []
+  """Tool use trajectory with responses in chronological order."""
+
   intermediate_responses: list[tuple[str, list[genai_types.Part]]] = []
   """Intermediate responses generated by sub-agents to convey progress or
   status in a multi-agent system, distinct from the final response.
diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py
index 1359967bc..5901c8628 100644
--- a/src/google/adk/evaluation/evaluation_generator.py
+++ b/src/google/adk/evaluation/evaluation_generator.py
@@ -32,6 +32,7 @@
 from .eval_case import IntermediateData
 from .eval_case import Invocation
 from .eval_case import SessionInput
+from .eval_case import ToolUseWithResponse
 from .eval_set import EvalSet
@@ -180,6 +181,7 @@ async def _generate_inferences_from_root_agent(
       final_response = None
       user_content = invocation.user_content
       tool_uses = []
+      tool_uses_with_responses_by_id = {}
       invocation_id = ""
 
       async for event in runner.run_async(
@@ -191,16 +193,29 @@ async def _generate_inferences_from_root_agent(
         if event.is_final_response() and event.content and event.content.parts:
           final_response = event.content
-        elif event.get_function_calls():
-          for call in event.get_function_calls():
-            tool_uses.append(call)
+        else:
+          if event.get_function_calls():
+            for call in event.get_function_calls():
+              tool_uses.append(call)
+              tool_uses_with_responses_by_id[call.id] = ToolUseWithResponse(
+                  function_call=call
+              )
+          elif event.get_function_responses():
+            for response in event.get_function_responses():
+              if response.id in tool_uses_with_responses_by_id:
+                tool_uses_with_responses_by_id[
+                    response.id
+                ].function_response = response
 
       response_invocations.append(
           Invocation(
              invocation_id=invocation_id,
              user_content=user_content,
              final_response=final_response,
-              intermediate_data=IntermediateData(tool_uses=tool_uses),
+              intermediate_data=IntermediateData(
+                  tool_uses=tool_uses,
+                  tool_uses_with_responses=tool_uses_with_responses_by_id.values(),
+              ),
          )
       )
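Reviewer note: the event-processing change above pairs each FunctionCall with the FunctionResponse carrying the same `id`, so a call that never receives a response keeps `function_response=None`, and a later call reusing an id would overwrite the earlier entry. A minimal, self-contained sketch of that pairing rule (a plain dataclass stands in for the ToolUseWithResponse pydantic model; the underscore-prefixed names and the sample weather call are illustrative, not part of this change):

from dataclasses import dataclass
from typing import Optional

from google.genai import types as genai_types


@dataclass
class _ToolUseWithResponse:  # illustrative stand-in for eval_case.ToolUseWithResponse
  function_call: genai_types.FunctionCall
  function_response: Optional[genai_types.FunctionResponse] = None


def _pair_calls_with_responses(
    calls: list[genai_types.FunctionCall],
    responses: list[genai_types.FunctionResponse],
) -> list[_ToolUseWithResponse]:
  """Pairs each call with the response that shares its id, if any."""
  by_id = {call.id: _ToolUseWithResponse(function_call=call) for call in calls}
  for response in responses:
    if response.id in by_id:
      by_id[response.id].function_response = response
  return list(by_id.values())


calls = [
    genai_types.FunctionCall(id="call-1", name="get_weather", args={"city": "Paris"})
]
responses = [
    genai_types.FunctionResponse(id="call-1", name="get_weather", response={"temp_c": 21})
]
paired = _pair_calls_with_responses(calls, responses)
assert paired[0].function_response.response == {"temp_c": 21}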
diff --git a/src/google/adk/evaluation/hallucinations_v1.py b/src/google/adk/evaluation/hallucinations_v1.py
new file mode 100644
index 000000000..7722ee7b3
--- /dev/null
+++ b/src/google/adk/evaluation/hallucinations_v1.py
@@ -0,0 +1,280 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+from typing import Optional
+
+from google.genai import types as genai_types
+from typing_extensions import override
+
+from ..models.llm_response import LlmResponse
+from ..utils.feature_decorator import working_in_progress
+from .eval_case import Invocation
+from .eval_case import ToolUseWithResponse
+from .eval_metrics import EvalMetric
+from .evaluator import EvalStatus
+from .evaluator import EvaluationResult
+from .evaluator import PerInvocationResult
+from .llm_as_judge import LlmAsJudge
+from .llm_as_judge_utils import get_eval_status
+from .llm_as_judge_utils import get_text_from_content
+
+logger = logging.getLogger("google_adk." + __name__)
+
+
+HALLUCINATIONS_V1_PROMPT = """
+You are a helpful and harmless AI assistant. You will be provided with a textual context and a model-generated response.
+Your task is to analyze the response sentence by sentence and classify each sentence according to its relationship with the provided context.
+
+**Instructions:**
+
+1. **Decompose the response into individual sentences.**
+2. **For each sentence, assign one of the following labels:**
+    * **`supported`**: The sentence is entailed by the given context. Provide a supporting excerpt from the context. The supporting excerpt must *fully* entail the sentence.
+    * **`unsupported`**: The sentence is not entailed by the given context. No excerpt is needed for this label.
+    * **`contradictory`**: The sentence is falsified by the given context. Provide a contradicting excerpt from the context.
+    * **`disputed`**: The given context contains both supporting and contradicting information. Provide both a supporting and a contradicting excerpt from the context.
+    * **`no_rad`**: The sentence does not require factual attribution (e.g., opinions, planning steps, greetings, questions, disclaimers, mathematical calculation).
+3. **For each label, provide a short rationale explaining your decision.** The rationale should be separate from the excerpt.
+4. **Be very strict with your `supported`, `contradictory` and `disputed` decisions.** Unless you can find straightforward, indisputable evidence excerpts *in the context* that a sentence is `supported`, `contradictory` or `disputed`, consider it `unsupported`. You should not employ world knowledge unless it is truly trivial.
+5. "tool_outputs" blocks contain code execution results of the "tool_code" blocks immediately above them. If any sentence is based on "tool_outputs" results, first analyze if the corresponding "tool_code" is supported and if the results are error-free. Only if the "tool_code" block is supported can you treat code execution results as correct.
+6. If you need to cite multiple supporting excerpts, simply concatenate them. An excerpt may be a summary of the context if the context is too long.
+
+**Input Format:**
+
+The input will consist of two parts, clearly separated:
+
+* **Context:** The textual context used to generate the response.
+* **Response:** The model-generated response to be analyzed.
+
+**Output Format:**
+
+For each sentence in the response, output a block of text with the following fields:
+
+* sentence: The sentence being analyzed.
+* label: One of `supported`, `unsupported`, `contradictory`, `disputed` or `no_rad`.
+* rationale: A brief explanation for the assessment.
+* supporting_excerpt: A relevant excerpt from the context that supports the sentence. Only required for `supported` and `disputed` labels.
+* contradicting_excerpt: A relevant excerpt from the context that contradicts the sentence. Only required for `contradictory` and `disputed` labels.
+
+**Example:**
+
+**Input:**
+
+**Context Begin**
+Apples are red fruits. Bananas are yellow fruits. Pears are purple fruits. Pears are blue fruits.
+**Context End**
+
+**Response Begin**
+Apples are red. Bananas are green. Pears are purple. Bananas are cheaper than apples. Enjoy your fruit!
+**Response End**
+
+**Output:**
+sentence: Apples are red.
+label: supported
+rationale: The context explicitly states that apples are red.
+supporting_excerpt: Apples are red fruits.
+contradicting_excerpt: null
+
+sentence: Bananas are green.
+label: contradictory
+rationale: The context states that bananas are yellow, not green.
+supporting_excerpt: null
+contradicting_excerpt: Bananas are yellow fruits.
+
+sentence: Pears are purple.
+label: disputed
+rationale: The context states that pears are purple but it also states that pears are blue.
+supporting_excerpt: Pears are purple fruits
+contradicting_excerpt: Pears are blue fruits
+
+sentence: Bananas are cheaper than apples.
+label: unsupported
+rationale: The context does not mention the price of bananas or apples.
+supporting_excerpt: null
+contradicting_excerpt: null
+
+sentence: Enjoy your fruit!
+label: no_rad
+rationale: This is a general expression and does not require factual attribution.
+supporting_excerpt: null
+contradicting_excerpt: null
+
+**Now, please analyze the following context and response:**
+
+**Input:**
+
+**Context Begin**
+{context}
+**Context End**
+
+**Response Begin**
+{response}
+**Response End**
+
+**Output:**
+""".strip()
+
+
+def _format_function_call(function_call: genai_types.FunctionCall) -> str:
+  """Formats a function call as a string."""
+  formatted_function_call = f"Function call\nName: {function_call.name}\n"
+  formatted_function_call += f"Args: {json.dumps(function_call.args)}"
+  return formatted_function_call
+
+
+def _format_function_response(
+    function_response: genai_types.FunctionResponse,
+) -> str:
+  """Formats a function response as a string."""
+  formatted_function_response = (
+      f"Function response\nName: {function_response.name}\n"
+  )
+  formatted_function_response += (
+      f"Response: {json.dumps(function_response.response)}"
+  )
+  return formatted_function_response
+
+
+def _format_tool_use_with_response(
+    tool_use_with_response: ToolUseWithResponse,
+) -> str:
+  """Formats a tool use with its response as a string."""
+  formatted_tool_use = {
+      "name": tool_use_with_response.function_call.name,
+      "args": tool_use_with_response.function_call.args,
+  }
+  if tool_use_with_response.function_response:
+    formatted_tool_use["response"] = (
+        tool_use_with_response.function_response.response
+    )
+  return json.dumps(formatted_tool_use)
+
+
+def _extract_labels_from_critique(response: str) -> list[str]:
+  """Extracts the labels from the LLM critique."""
+  label_matches = re.findall(
+      r"label: (supported|unsupported|contradictory|disputed|no_rad)", response
+  )
+  return label_matches
+
+
+@working_in_progress
+class HallucinationsV1Evaluator(LlmAsJudge):
+  """LLM-based evaluator to judge factuality and whether the response contains hallucinations."""
+
+  def __init__(
+      self,
+      eval_metric: EvalMetric,
+  ):
+    super().__init__(eval_metric)
+    self._auto_rater_prompt_template = HALLUCINATIONS_V1_PROMPT
+
+  @override
+  def format_auto_rater_prompt(
+      self, actual_invocation: Invocation, expected_invocation: Invocation
+  ) -> str:
+    response = get_text_from_content(actual_invocation.final_response)
+    user_content = get_text_from_content(expected_invocation.user_content)
+    context = [f"User prompt: {user_content}"]
+    if not actual_invocation.intermediate_data:
+      return self._auto_rater_prompt_template.format(
+          context=context[0], response=response
+      )
+    # Only support text and function calls/tool uses for now.
+    for (
+        tool_use
+    ) in actual_invocation.intermediate_data.tool_uses_with_responses:
+      context.append(f"Tool use:\n{_format_tool_use_with_response(tool_use)}")
+    for (
+        author,
+        parts,
+    ) in actual_invocation.intermediate_data.intermediate_responses:
+      parts_formatted = []
+      for part in parts:
+        if part.text:
+          parts_formatted.append(part.text)
+        elif part.function_call:
+          parts_formatted.append(_format_function_call(part.function_call))
+        elif part.function_response:
+          parts_formatted.append(
+              _format_function_response(part.function_response)
+          )
+      parts_formatted = "\n".join(parts_formatted)
+      context.append(f"Sub-agent {author} response:\n{parts_formatted}")
+    return self._auto_rater_prompt_template.format(
+        context="\n\n".join(context), response=response
+    )
+
+  @override
+  def convert_auto_rater_response_to_score(
+      self, llm_response: LlmResponse
+  ) -> Optional[float]:
+    text = get_text_from_content(llm_response.content)
+    labels = _extract_labels_from_critique(text)
+    if not labels:
+      return None
+    final_score = 0.0
+    # If the label is 'disputed' or 'no_rad', the score is not affected.
+    for label in labels:
+      if label == "supported":
+        final_score += 1.0
+      elif label in ["unsupported", "contradictory"]:
+        final_score -= 1.0
+    return final_score / len(labels)
+
+  @override
+  def aggregate_per_invocation_samples(
+      self, per_invocation_samples: list[PerInvocationResult]
+  ) -> PerInvocationResult:
+    """Averages the scores of the samples that were actually evaluated."""
+    final_score = 0.0
+    num_evaluated = 0
+    for sample in per_invocation_samples:
+      if sample.score is None or sample.eval_status == EvalStatus.NOT_EVALUATED:
+        continue
+      num_evaluated += 1
+      final_score += sample.score
+    final_score /= num_evaluated
+    return PerInvocationResult(
+        actual_invocation=per_invocation_samples[0].actual_invocation,
+        expected_invocation=per_invocation_samples[0].expected_invocation,
+        score=final_score,
+        eval_status=get_eval_status(final_score, self._eval_metric.threshold),
+    )
+
+  @override
+  def aggregate_invocation_results(
+      self, per_invocation_results: list[PerInvocationResult]
+  ) -> EvaluationResult:
+    """Averages the scores of the invocations that were actually evaluated."""
+    final_score = 0.0
+    num_evaluated = 0
+    for result in per_invocation_results:
+      if result.score is None or result.eval_status == EvalStatus.NOT_EVALUATED:
+        continue
+      num_evaluated += 1
+      final_score += result.score
+    final_score /= num_evaluated
+    return EvaluationResult(
+        overall_score=final_score,
+        overall_eval_status=get_eval_status(
+            final_score, self._eval_metric.threshold
+        ),
+        per_invocation_results=per_invocation_results,
+    )
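Reviewer note: convert_auto_rater_response_to_score reduces the critique to a single number: each `supported` sentence counts +1, each `unsupported` or `contradictory` sentence counts -1, `disputed` and `no_rad` count 0, and the sum is divided by the total number of labels, so an all-`no_rad` critique scores 0.0 while a critique with no parseable labels yields None. A small standalone sketch of that rule, reusing the same regex (the helper name and sample critique string are invented for illustration):

import re
from typing import Optional

# Same pattern used by _extract_labels_from_critique above.
_LABEL_RE = r"label: (supported|unsupported|contradictory|disputed|no_rad)"


def _score_critique(critique: str) -> Optional[float]:
  """Averages +1 for supported, -1 for unsupported/contradictory, 0 otherwise."""
  labels = re.findall(_LABEL_RE, critique)
  if not labels:
    return None
  points = {"supported": 1.0, "unsupported": -1.0, "contradictory": -1.0}
  return sum(points.get(label, 0.0) for label in labels) / len(labels)


# Invented critique: three sentences labelled supported, no_rad, unsupported.
critique = "label: supported\nlabel: no_rad\nlabel: unsupported\n"
assert _score_critique(critique) == 0.0  # (1.0 + 0.0 - 1.0) / 3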
diff --git a/tests/unittests/evaluation/test_hallucinations_v1.py b/tests/unittests/evaluation/test_hallucinations_v1.py
new file mode 100644
index 000000000..b63c2e58d
--- /dev/null
+++ b/tests/unittests/evaluation/test_hallucinations_v1.py
@@ -0,0 +1,233 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from google.adk.evaluation.eval_case import Invocation
+from google.adk.evaluation.eval_case import ToolUseWithResponse
+from google.adk.evaluation.eval_metrics import EvalMetric
+from google.adk.evaluation.eval_metrics import JudgeModelOptions
+from google.adk.evaluation.evaluator import EvalStatus
+from google.adk.evaluation.evaluator import PerInvocationResult
+from google.adk.evaluation.hallucinations_v1 import _format_function_call
+from google.adk.evaluation.hallucinations_v1 import _format_function_response
+from google.adk.evaluation.hallucinations_v1 import _format_tool_use_with_response
+from google.adk.evaluation.hallucinations_v1 import HallucinationsV1Evaluator
+from google.adk.models.llm_response import LlmResponse
+from google.genai import types as genai_types
+import pytest
+
+
+def test_format_function_call():
+  function_call = genai_types.FunctionCall(
+      id="test_function_id",
+      name="test_function",
+      args={"arg1": "arg1_value", "arg2": "arg2_value"},
+  )
+  formatted_function_call = _format_function_call(function_call)
+  assert formatted_function_call == """Function call
+Name: test_function
+Args: {"arg1": "arg1_value", "arg2": "arg2_value"}"""
+
+
+def test_format_function_response():
+  function_response = genai_types.FunctionResponse(
+      id="test_function_id",
+      name="test_function",
+      response={"result": "return_value"},
+  )
+  formatted_function_response = _format_function_response(function_response)
+  assert formatted_function_response == """Function response
+Name: test_function
+Response: {"result": "return_value"}"""
+
+
+def test_format_tool_use_with_response():
+  tool_use_with_response = ToolUseWithResponse(
+      function_call=genai_types.FunctionCall(
+          id="test_function_id",
+          name="test_function",
+          args={"arg1": "arg1_value", "arg2": "arg2_value"},
+      ),
+      function_response=genai_types.FunctionResponse(
+          id="test_function_id",
+          name="test_function",
+          response={"result": "return_value"},
+      ),
+  )
+  formatted_tool_use_with_response = _format_tool_use_with_response(
+      tool_use_with_response
+  )
+  assert formatted_tool_use_with_response == (
+      """{"name": "test_function", "args": {"arg1": "arg1_value", "arg2": "arg2_value"}, "response": {"result": "return_value"}}"""
+  )
+
+
+def _create_test_evaluator_gemini(
+    threshold: float,
+) -> HallucinationsV1Evaluator:
+  evaluator = HallucinationsV1Evaluator(
+      EvalMetric(
+          metric_name="hallucinations_v1",
+          threshold=threshold,
+          judge_model_options=JudgeModelOptions(
+              judge_model="gemini-2.5-flash",
+              num_samples=5,
+          ),
+      ),
+  )
+  return evaluator
role="model", + ) + ) + evaluator = _create_test_evaluator_gemini(threshold=0.8) + score = evaluator.convert_auto_rater_response_to_score(llm_response) + # The score is the average of the labels, with 1.0 for supported and -1.0 for + # unsupported and contradictory. + assert score == -1 / 5 + + +def test_aggregate_per_invocation_samples(): + per_invocation_samples = [ + PerInvocationResult( + actual_invocation=Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="This is a test query.")], + role="user", + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="This is a test response.")], + role="model", + ), + ), + expected_invocation=Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="This is a test query.")], + role="user", + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="This is a test response.")], + role="model", + ), + ), + score=0.5, + eval_status=EvalStatus.FAILED, + ), + PerInvocationResult( + actual_invocation=Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="This is a test query.")], + role="user", + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="This is a test response.")], + role="model", + ), + ), + expected_invocation=Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="This is a test query.")], + role="user", + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="This is a test response.")], + role="model", + ), + ), + score=1.0, + eval_status=EvalStatus.PASSED, + ), + ] + evaluator = _create_test_evaluator_gemini(threshold=0.8) + result = evaluator.aggregate_per_invocation_samples(per_invocation_samples) + assert result.score == 0.75 + assert result.eval_status == EvalStatus.FAILED + + +def test_aggregate_invocation_results(): + per_invocation_samples = [ + PerInvocationResult( + actual_invocation=Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="This is a test query.")], + role="user", + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="This is a test response.")], + role="model", + ), + ), + expected_invocation=Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="This is a test query.")], + role="user", + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="This is a test response.")], + role="model", + ), + ), + score=0.5, + eval_status=EvalStatus.FAILED, + ), + PerInvocationResult( + actual_invocation=Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="This is a test query.")], + role="user", + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="This is a test response.")], + role="model", + ), + ), + expected_invocation=Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="This is a test query.")], + role="user", + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="This is a test response.")], + role="model", + ), + ), + score=1.0, + eval_status=EvalStatus.PASSED, + ), + ] + evaluator = _create_test_evaluator_gemini(threshold=0.7) + result = evaluator.aggregate_per_invocation_samples(per_invocation_samples) + assert result.score == 0.75 + assert result.eval_status == EvalStatus.PASSED