feat: Implement hallucinations_v1 auto-rater #1844

Open · wants to merge 1 commit into main
13 changes: 13 additions & 0 deletions src/google/adk/evaluation/eval_case.py
@@ -31,12 +31,25 @@ class EvalBaseModel(BaseModel):
)


class ToolUseWithResponse(EvalBaseModel):
"""Represents a tool use with its response."""

function_call: genai_types.FunctionCall
"""The function call."""

function_response: Optional[genai_types.FunctionResponse] = None
"""The function response."""


class IntermediateData(EvalBaseModel):
"""Container for intermediate data that an agent would generate as it responds with a final answer."""

tool_uses: list[genai_types.FunctionCall] = []
"""Tool use trajectory in chronological order."""

tool_uses_with_responses: list[ToolUseWithResponse] = []
"""Tool use trajectory with responses in chronological order."""

intermediate_responses: list[tuple[str, list[genai_types.Part]]] = []
"""Intermediate responses generated by sub-agents to convey progress or status
in a multi-agent system, distinct from the final response.
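For reference, a minimal sketch (not part of the diff) of how the new model pairs a call with its response; the tool name, ids, and payloads are hypothetical:

from google.genai import types as genai_types

from google.adk.evaluation.eval_case import IntermediateData
from google.adk.evaluation.eval_case import ToolUseWithResponse

# Hypothetical tool interaction: a weather lookup and its result, linked by id.
call = genai_types.FunctionCall(id="call-1", name="get_weather", args={"city": "Paris"})
result = genai_types.FunctionResponse(id="call-1", name="get_weather", response={"temp_c": 21})

intermediate = IntermediateData(
    tool_uses=[call],
    tool_uses_with_responses=[
        ToolUseWithResponse(function_call=call, function_response=result)
    ],
)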
23 changes: 19 additions & 4 deletions src/google/adk/evaluation/evaluation_generator.py
@@ -32,6 +32,7 @@
from .eval_case import IntermediateData
from .eval_case import Invocation
from .eval_case import SessionInput
from .eval_case import ToolUseWithResponse
from .eval_set import EvalSet


@@ -180,6 +181,7 @@ async def _generate_inferences_from_root_agent(
final_response = None
user_content = invocation.user_content
tool_uses = []
tool_uses_with_responses_by_id = {}
invocation_id = ""

async for event in runner.run_async(
@@ -191,16 +193,29 @@

if event.is_final_response() and event.content and event.content.parts:
final_response = event.content
elif event.get_function_calls():
for call in event.get_function_calls():
tool_uses.append(call)
else:
if event.get_function_calls():
for call in event.get_function_calls():
tool_uses.append(call)
tool_uses_with_responses_by_id[call.id] = ToolUseWithResponse(
function_call=call
)
elif event.get_function_responses():
for response in event.get_function_responses():
if response.id in tool_uses_with_responses_by_id:
tool_uses_with_responses_by_id[
response.id
].function_response = response

response_invocations.append(
Invocation(
invocation_id=invocation_id,
user_content=user_content,
final_response=final_response,
intermediate_data=IntermediateData(tool_uses=tool_uses),
intermediate_data=IntermediateData(
tool_uses=tool_uses,
tool_uses_with_responses=list(tool_uses_with_responses_by_id.values()),
),
)
)

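The dictionary keyed by `call.id` is used because, with parallel tool calls, function responses can arrive in later events and in a different order than the calls; a condensed sketch of that pairing (hypothetical ids and tools, not part of the diff):

from google.genai import types as genai_types

from google.adk.evaluation.eval_case import ToolUseWithResponse

calls = [
    genai_types.FunctionCall(id="a", name="get_weather", args={"city": "Paris"}),
    genai_types.FunctionCall(id="b", name="get_time", args={"tz": "UTC"}),
]
responses = [  # arrive out of order relative to the calls
    genai_types.FunctionResponse(id="b", name="get_time", response={"time": "12:00"}),
    genai_types.FunctionResponse(id="a", name="get_weather", response={"temp_c": 21}),
]
by_id = {c.id: ToolUseWithResponse(function_call=c) for c in calls}
for r in responses:
  if r.id in by_id:
    by_id[r.id].function_response = r
assert by_id["a"].function_response.response == {"temp_c": 21}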
280 changes: 280 additions & 0 deletions src/google/adk/evaluation/hallucinations_v1.py
@@ -0,0 +1,280 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import json
import logging
import re
from typing import Optional

from google.genai import types as genai_types
from typing_extensions import override

from ..models.llm_response import LlmResponse
from ..utils.feature_decorator import working_in_progress
from .eval_case import Invocation
from .eval_case import ToolUseWithResponse
from .eval_metrics import EvalMetric
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import PerInvocationResult
from .llm_as_judge import LlmAsJudge
from .llm_as_judge_utils import get_eval_status
from .llm_as_judge_utils import get_text_from_content

logger = logging.getLogger("google_adk." + __name__)


HALLUCINATIONS_V1_PROMPT = """
You are a helpful and harmless AI assistant. You will be provided with a textual context and a model-generated response.
Your task is to analyze the response sentence by sentence and classify each sentence according to its relationship with the provided context.

**Instructions:**

1. **Decompose the response into individual sentences.**
2. **For each sentence, assign one of the following labels:**
* **`supported`**: The sentence is entailed by the given context. Provide a supporting excerpt from the context. The supporting excerpt must *fully* entail the sentence.
* **`unsupported`**: The sentence is not entailed by the given context. No excerpt is needed for this label.
* **`contradictory`**: The sentence is falsified by the given context. Provide a contradicting excerpt from the context.
* **`disputed`**: The given context contains both supporting and contradicting information. Provide both supporting and contradicting excerpts from the context.
* **`no_rad`**: The sentence does not require factual attribution (e.g., opinions, planning steps, greetings, questions, disclaimers, mathematical calculation).
3. **For each label, provide a short rationale explaining your decision.** The rationale should be separate from the excerpt.
4. **Be very strict with your `supported`, `contradictory` and `disputed` decisions.** Unless you can find straightforward, indisputable evidence excerpts *in the context* that a sentence is `supported`, `contradictory` or `disputed`, consider it `unsupported`. You should not employ world knowledge unless it is truly trivial.
5. "tool_outputs" blocks contain code execution results of the "tool_code" blocks immediately above them. If any sentence is based on "tool_outputs" results, first analyze if the corresponding "tool_code" is supported and if the results are error-free. Only if the "tool_code" block is supported, you can treat code execution results as correct.
6. If you need to cite multiple supporting excerpts, simply concatenate them. The excerpt may be a summary of the context if it is too long.

**Input Format:**

The input will consist of two parts, clearly separated:

* **Context:** The textual context used to generate the response.
* **Response:** The model-generated response to be analyzed.

**Output Format:**

For each sentence in the response, output a block of text with the following fields:

* sentence: The sentence being analyzed.
* label: One of `supported`, `unsupported`, `contradictory`, `disputed` or `no_rad`.
* rationale: A brief explanation for the assessment.
* supporting_excerpt: A relevant excerpt from the context that supports the sentence. Only required for `supported` and `disputed` labels.
* contradicting_excerpt: A relevant excerpt from the context that contradicts the sentence. Only required for `contradictory` and `disputed` labels.

**Example:**

**Input:**

**Context Begin**
Apples are red fruits. Bananas are yellow fruits. Pears are purple fruits. Pears are blue fruits.
**Context End**

**Response Begin**
Apples are red. Bananas are green. Pears are purple. Bananas are cheaper than apples. Enjoy your fruit!
**Response End**

**Output:**
sentence: Apples are red.
label: supported
rationale: The context explicitly states that apples are red.
supporting_excerpt: Apples are red fruits.
contradicting_excerpt: null

sentence: Bananas are green.
label: contradictory
rationale: The context states that bananas are yellow, not green.
supporting_excerpt: null
contradicting_excerpt: Bananas are yellow fruits.

sentence: Pears are purple.
label: disputed
rationale: The context states that pears are purple but it also states that pears are blue.
supporting_excerpt: Pears are purple fruits
contradicting_excerpt: Pears are blue fruits

sentence: Bananas are cheaper than apples.
label: unsupported
rationale: The context does not mention the price of bananas or apples.
supporting_excerpt: null
contradicting_excerpt: null

sentence: Enjoy your fruit!
label: no_rad
rationale: This is a general expression and does not require factual attribution.
supporting_excerpt: null
contradicting_excerpt: null

**Now, please analyze the following context and response:**

**Input:**

**Context Begin**
{context}
**Context End**

**Response Begin**
{response}
**Response End**

**Output:**
""".strip()


def _format_function_call(function_call: genai_types.FunctionCall) -> str:
"""Formats a function call as a string."""
formatted_function_call = f"Function call\nName: {function_call.name}\n"
formatted_function_call += f"Args: {json.dumps(function_call.args)}"
return formatted_function_call


def _format_function_response(
function_response: genai_types.FunctionResponse,
) -> str:
"""Formats a function response as a string."""
formatted_function_response = (
f"Function response\nName: {function_response.name}\n"
)
formatted_function_response += (
f"Response: {json.dumps(function_response.response)}"
)
return formatted_function_response


def _format_tool_use_with_response(
tool_use_with_response: ToolUseWithResponse,
) -> str:
"""Formats a tool use with response as a string."""
formatted_tool_use = {
"name": tool_use_with_response.function_call.name,
"args": tool_use_with_response.function_call.args,
}
if tool_use_with_response.function_response:
formatted_tool_use["response"] = (
tool_use_with_response.function_response.response
)
return json.dumps(formatted_tool_use)
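For a hypothetical call/response pair, the formatter above serializes to a single JSON line (illustrative, not part of the file):

example = ToolUseWithResponse(
    function_call=genai_types.FunctionCall(
        id="call-1", name="get_weather", args={"city": "Paris"}
    ),
    function_response=genai_types.FunctionResponse(
        id="call-1", name="get_weather", response={"temp_c": 21}
    ),
)
print(_format_tool_use_with_response(example))
# {"name": "get_weather", "args": {"city": "Paris"}, "response": {"temp_c": 21}}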


def _extract_labels_from_critique(response: str) -> list[str]:
"""Extracts the label from the LLM critique."""
label_matches = re.findall(
r"label: (supported|unsupported|contradictory|disputed|no_rad)", response
)
return label_matches
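A quick illustration of the extraction (hypothetical critique text, not part of the file): the regex returns the labels in order of appearance, one per analyzed sentence.

critique = (
    "sentence: Apples are red.\n"
    "label: supported\n"
    "rationale: ...\n"
    "sentence: Bananas are green.\n"
    "label: contradictory\n"
)
assert _extract_labels_from_critique(critique) == ["supported", "contradictory"]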


@working_in_progress
class HallucinationsV1Evaluator(LlmAsJudge):
"""LLM-based evaluator to judge factuality and whether the response contains hallucinations."""

def __init__(
self,
eval_metric: EvalMetric,
):
super().__init__(eval_metric)
self._auto_rater_prompt_template = HALLUCINATIONS_V1_PROMPT

@override
def format_auto_rater_prompt(
self, actual_invocation: Invocation, expected_invocation: Invocation
) -> str:
response = get_text_from_content(actual_invocation.final_response)
user_content = get_text_from_content(expected_invocation.user_content)
context = [f"User prompt: {user_content}"]
if not actual_invocation.intermediate_data:
return self._auto_rater_prompt_template.format(
context=context[0], response=response
)
# Only support text and function calls/tool uses for now.
for (
tool_use
) in actual_invocation.intermediate_data.tool_uses_with_responses:
context.append(f"Tool use:\n{_format_tool_use_with_response(tool_use)}")
for (
author,
parts,
) in actual_invocation.intermediate_data.intermediate_responses:
parts_formatted = []
for part in parts:
if part.text:
parts_formatted.append(part.text)
elif part.function_call:
parts_formatted.append(_format_function_call(part.function_call))
elif part.function_response:
parts_formatted.append(
_format_function_response(part.function_response)
)
parts_formatted = "\n".join(parts_formatted)
context.append(f"Sub-agent {author} response:\n{parts_formatted}")
return self._auto_rater_prompt_template.format(
context="\n\n".join(context), response=response
)
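For a hypothetical invocation with one answered tool call and one sub-agent text part, the {context} block assembled by this method would look roughly like the following (illustrative values only):

User prompt: What's the weather in Paris?

Tool use:
{"name": "get_weather", "args": {"city": "Paris"}, "response": {"temp_c": 21}}

Sub-agent weather_agent response:
It is 21 degrees Celsius in Paris.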

@override
def convert_auto_rater_response_to_score(
self, llm_response: LlmResponse
) -> Optional[float]:
text = get_text_from_content(llm_response.content)
labels = _extract_labels_from_critique(text)
if not labels:
return None
final_score = 0.0
# 'disputed' and 'no_rad' labels leave the numerator unchanged but still
# count toward the denominator (len(labels)).
for label in labels:
if label == "supported":
final_score += 1.0
elif label in ["unsupported", "contradictory"]:
final_score -= 1.0
return final_score / len(labels)
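A worked example of the scoring (illustrative labels, not part of the file): three `supported`, one `unsupported`, and one `no_rad` sentence give (3 - 1) / 5 = 0.4.

labels = ["supported", "supported", "supported", "unsupported", "no_rad"]
numerator = sum(
    1.0 if label == "supported"
    else -1.0 if label in ("unsupported", "contradictory")
    else 0.0
    for label in labels
)
assert numerator / len(labels) == 0.4  # (3 - 1) / 5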

@override
def aggregate_per_invocation_samples(
self, per_invocation_samples: list[PerInvocationResult]
) -> PerInvocationResult:
"""Computes the fraction of invocation samples that are valid."""
final_score = 0.0
num_evaluated = 0
for sample in per_invocation_samples:
if sample.score is None or sample.eval_status == EvalStatus.NOT_EVALUATED:
continue
num_evaluated += 1
final_score += sample.score
final_score /= num_evaluated
return PerInvocationResult(
actual_invocation=per_invocation_samples[0].actual_invocation,
expected_invocation=per_invocation_samples[0].expected_invocation,
score=final_score,
eval_status=get_eval_status(final_score, self._eval_metric.threshold),
)

@override
def aggregate_invocation_results(
self, per_invocation_results: list[PerInvocationResult]
) -> EvaluationResult:
"""Computes the fraction of invocation results that are valid."""
final_score = 0.0
num_evaluated = 0
for result in per_invocation_results:
if result.score is None or result.eval_status == EvalStatus.NOT_EVALUATED:
continue
num_evaluated += 1
final_score += result.score
final_score /= num_evaluated
return EvaluationResult(
overall_score=final_score,
overall_eval_status=get_eval_status(
final_score, self._eval_metric.threshold
),
per_invocation_results=per_invocation_results,
)
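Both aggregators follow the same pattern; for example (illustrative numbers, not part of the file), an un-scored sample is skipped before averaging:

scores = [0.75, None, 0.25]  # one sample was not evaluated
evaluated = [s for s in scores if s is not None]
overall = sum(evaluated) / len(evaluated)
assert overall == 0.5  # (0.75 + 0.25) / 2, then compared against the metric threshold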