19 changes: 18 additions & 1 deletion .vscode/launch.json
@@ -27,7 +27,24 @@
},
"request": "launch",
"program": "evalbench/client/eval_client.py",
"args": ["--insecure"],
"args": [
"--insecure"
],
"console": "integratedTerminal"
},
{
"name": "Standalone",
"type": "debugpy",
"cwd": "${workspaceFolder}",
"env": {
"PYTHONPATH": "./evalbench:./evalbench/evalproto"
},
"request": "launch",
"program": "evalbench/evalbench.py",
"args": [
"--experiment_config",
"${workspaceFolder}/datasets/adk-tools/example_run_config.yaml"
],
"console": "integratedTerminal"
}
]
8 changes: 8 additions & 0 deletions datasets/adk-tools/example_run_config.yaml
@@ -0,0 +1,8 @@
############################################################
### Dataset / Eval Items
############################################################
dataset_config: datasets/adk-tools/toolbox.evalset.json
dataset_format: adk-format
setup_directory: datasets/adk-tools/setup
agent_name: myagent
agent_module: myagent
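For reference, a minimal sketch of how these keys are consumed by the code in this PR (PyYAML is used here purely for illustration; how evalbench.py actually parses the config is not shown in these hunks): dataset_config and dataset_format are read by load_dataset_from_json, while agent_name, agent_module, and an optional initial_session_file are read by ADKEvaluator.

import yaml

with open("datasets/adk-tools/example_run_config.yaml") as f:
    config = yaml.safe_load(f)

print(config["dataset_config"])   # datasets/adk-tools/toolbox.evalset.json
print(config["dataset_format"])   # "adk-format" routes loading to load_adk_json
print(config["agent_module"])     # module name handed to AgentEvaluator._get_agent_for_eval
# "initial_session_file" is optional; ADKEvaluator falls back to None when it is absent.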
324 changes: 324 additions & 0 deletions datasets/adk-tools/toolbox.evalset.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions evalbench/agent/myagent/__init__.py
@@ -0,0 +1 @@
from . import agent
20 changes: 20 additions & 0 deletions evalbench/agent/myagent/agent.py
@@ -0,0 +1,20 @@
from google.adk import Agent
from google.adk.apps import App
from toolbox_core import ToolboxSyncClient, auth_methods

URL = "https://toolbox-zxwxgw5sma-uc.a.run.app"
auth_token_provider = auth_methods.aget_google_id_token(URL) # can also use sync method

client = ToolboxSyncClient(
URL,
client_headers={"Authorization": auth_token_provider},
)

root_agent = Agent(
name="root_agent",
model="gemini-2.5-flash",
instruction="You are a helpful AI assistant designed to provide accurate and useful information.",
tools=client.load_toolset(),
)

app = App(root_agent=root_agent, name="myagent")
26 changes: 24 additions & 2 deletions evalbench/dataset/dataset.py
@@ -6,6 +6,7 @@
from collections.abc import Sequence
from dataset.evalinput import EvalInputRequest
from dataset.evalinteractinput import EvalInteractInputRequest
from dataset.evaladkinput import EvalADKRequest
from itertools import chain


@@ -90,6 +91,19 @@ def load_bird_interact_dataset(json_file_path, config):
return input_items


def load_adk_json(json_file_path):
all_items = []
with open(json_file_path, "r") as json_file:
json_item = json_file.read()
item = json.loads(json_item)
eval_input = EvalADKRequest(
id=item["eval_set_id"],
payload=json_item,
)
all_items.append(eval_input)
return all_items


def load_json(json_file_path):
all_items = []
with open(json_file_path, "r") as json_file:
@@ -102,6 +116,8 @@ def load_dataset_from_json(json_file_path, config):
dataset_format = config.get("dataset_format", "evalbench-standard-format")
if dataset_format == "bird-interact-format":
all_items = load_bird_interact_dataset(json_file_path, config)
elif dataset_format == "adk-format":
all_items = load_adk_json(json_file_path)
else:
all_items = load_json(json_file_path)

@@ -114,11 +130,15 @@ def load_dataset_from_json(json_file_path, config):
elif dataset_format == "bird-interact-format":
config["orchestrator"] = "interact"
input_items = all_items
elif dataset_format == "adk-format":
config["orchestrator"] = "adk"
input_items = all_items
else:
raise ValueError("Dataset not in any of the recognised formats")

totalEntries = sum(len(input_items.get(q, [])) for q in ["dql", "dml", "ddl"])
logging.info(f"Converted {totalEntries} entries to EvalInput.")
if dataset_format != "adk-format":
totalEntries = sum(len(input_items.get(q, [])) for q in ["dql", "dml", "ddl"])
logging.info(f"Converted {totalEntries} entries to EvalInput.")
return input_items


@@ -249,4 +269,6 @@ def breakdown_datasets(total_dataset: list[EvalInputRequest]):


def flatten_dataset(dataset: dict[str, list]):
if isinstance(dataset, list):
return dataset
return list(chain.from_iterable(dataset.values()))
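A small illustration of the flatten_dataset change above (module path as in the existing dataset imports): adk-format datasets arrive as a flat list of EvalADKRequest objects rather than a dict keyed by statement type, so lists are now passed through unchanged.

from dataset.dataset import flatten_dataset

# adk-format path: already a flat list, returned as-is.
assert flatten_dataset([{"id": "a"}, {"id": "b"}]) == [{"id": "a"}, {"id": "b"}]

# existing formats: dict of lists keyed by "dql"/"dml"/"ddl", flattened in order.
assert flatten_dataset({"dql": [1], "dml": [2], "ddl": []}) == [1, 2]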
60 changes: 60 additions & 0 deletions evalbench/dataset/evaladkinput.py
@@ -0,0 +1,60 @@
from google.protobuf.json_format import MessageToDict
import copy
from enum import Enum

try:
from evalproto import eval_request_pb2

PROTO_IMPORTED = True
except ImportError:
PROTO_IMPORTED = False


class EvalADKRequest:
def __init__(
self,
id: str,
payload: dict,
job_id: str = "",
trace_id: str = "",
):
"""Initializes an EvalInputRequest object with all required fields.

See eval_request_pb2 for types
"""
self.id = id
self.payload = payload
self.job_id = job_id
self.trace_id = trace_id

if PROTO_IMPORTED:

@classmethod
def init_from_proto(cls, proto: eval_request_pb2.EvalInputRequest):  # type: ignore
"""Initializes an EvalADKRequest from an eval_request_pb2.EvalInputRequest proto."""

request = MessageToDict(proto)
return cls(
id=str(request.get("id") or -1),
payload=request.get("payload") or {},
job_id=request.get("jobId") or "",
trace_id=request.get("traceId") or "",
)

def to_proto(self): # type: ignore
return eval_request_pb2.EvalInputRequest( # type: ignore
id=int(self.id),
payload=self.payload,
)

else:

@classmethod
def init_from_proto(cls, proto):
raise ImportError("protobuf module not available")

def to_proto(self):
raise ImportError("protobuf module not available")

def copy(self):
return copy.deepcopy(self)
9 changes: 9 additions & 0 deletions evalbench/dataset/evaladkoutput.py
@@ -0,0 +1,9 @@
from dataset.evaladkinput import EvalADKRequest


class EvalADKOutput(dict):
def __init__(
self,
evaladkinput: EvalADKRequest,
):
self.update(evaladkinput.__dict__)
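A brief usage sketch of the two new dataset classes (the id, payload, and job_id values are placeholders): EvalADKOutput starts as a plain dict copy of the request's fields and is then extended with run metadata by the evaluator.

from dataset.evaladkinput import EvalADKRequest
from dataset.evaladkoutput import EvalADKOutput

request = EvalADKRequest(id="toolbox_eval_set", payload="{}")  # placeholder values
output = EvalADKOutput(request)
output["job_id"] = "local-debug"        # set later by ADKEvaluator._evaluate
print(output["id"], output["job_id"])   # toolbox_eval_set local-debug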
3 changes: 3 additions & 0 deletions evalbench/evaluator/__init__.py
@@ -1,6 +1,7 @@
from evaluator.orchestrator import Orchestrator
from evaluator.oneshotorchestrator import OneShotOrchestrator
from evaluator.interactorchestrator import InteractOrchestrator
from evaluator.adkorchestrator import ADKOrchestrator
import logging


@@ -11,5 +12,7 @@ def get_orchestrator(config, db_configs, setup_config, report_progress=False):
return OneShotOrchestrator(config, db_configs, setup_config, report_progress)
elif orchestrator_type == "interact":
return InteractOrchestrator(config, db_configs, setup_config, report_progress)
elif orchestrator_type == "adk":
return ADKOrchestrator(config, db_configs, setup_config, report_progress)
else:
return Orchestrator(config, db_configs, setup_config, report_progress)
164 changes: 164 additions & 0 deletions evalbench/evaluator/adkevaluator.py
@@ -0,0 +1,164 @@
from typing import Any, List, Optional
import datetime
from dataset.evaladkinput import EvalADKRequest
from dataset.evaladkoutput import EvalADKOutput
from queue import Queue
from databases import DB
import logging
import asyncio
from rich.console import Console
from google.genai.types import Part
from google.adk.evaluation.agent_evaluator import (
AgentEvaluator,
EvalConfig,
EvalSet,
get_eval_metrics_from_config,
UserSimulatorProvider,
EvalStatus,
)
from google.adk.evaluation.eval_result import EvalCaseResult
from dotenv import load_dotenv

console = Console()


class ADKEvaluator:
def __init__(
self,
config,
):
self.config = config
self.agent_name = config["agent_name"]
self.agent_module = config["agent_module"]
self.eval_dataset_file = config["dataset_config"]
if "initial_session_file" in config:
self.initial_session_file = config["initial_session_file"]
else:
self.initial_session_file = None

def evaluate(
self,
dataset: List[EvalADKRequest],
job_id: str,
run_time: datetime.datetime,
):
eval_outputs: List[Any] = []
scoring_results: List[Any] = []
logging.info("Running ADK evaluation")
for eval_input in dataset:
self._evaluate(eval_input, eval_outputs, job_id, run_time, scoring_results)
return eval_outputs, scoring_results

def _evaluate(
self,
eval_input: EvalADKRequest,
eval_outputs: List[Any],
job_id: str,
run_time: datetime.datetime,
scoring_results: List[Any],
num_runs: int = 1,
):
test_files = [self.eval_dataset_file]

initial_session = AgentEvaluator._get_initial_session(self.initial_session_file)

for test_file in test_files:
eval_output = EvalADKOutput(eval_input)
eval_output["job_id"] = job_id
eval_output["run_time"] = run_time
eval_output["test_file"] = test_file
eval_config = AgentEvaluator.find_config_for_test_file(test_file)
eval_set = AgentEvaluator._load_eval_set_from_file(
test_file, eval_config, initial_session
)
eval_output["eval_set"] = eval_set
asyncio.run(
self.evaluate_eval_set(
eval_output,
scoring_results,
agent_module=self.agent_module,
eval_set=eval_set,
eval_config=eval_config,
num_runs=num_runs,
agent_name=self.agent_name,
)
)
eval_outputs.append(eval_output)

async def evaluate_eval_set(
self,
eval_output: EvalADKOutput,
scoring_results: List[Any],
agent_module: str,
eval_set: EvalSet,
criteria: Optional[dict[str, float]] = None,
eval_config: Optional[EvalConfig] = None,
num_runs: int = 1,
agent_name: Optional[str] = None,
):
agent_for_eval = await AgentEvaluator._get_agent_for_eval(
module_name=agent_module, agent_name=None
)
eval_metrics = get_eval_metrics_from_config(eval_config)

user_simulator_provider = UserSimulatorProvider(
user_simulator_config=eval_config.user_simulator_config
)

eval_results_by_eval_id = await AgentEvaluator._get_eval_results_by_eval_id(
agent_for_eval=agent_for_eval,
eval_set=eval_set,
eval_metrics=eval_metrics,
num_runs=num_runs,
user_simulator_provider=user_simulator_provider,
)
self.process_results(eval_results_by_eval_id)
eval_output["eval_results_by_eval_id"] = eval_results_by_eval_id
return eval_results_by_eval_id

def merge_parts(self, parts: list[Part]):
return "".join(part.text or "" for part in parts)

def process_eval_result(
self,
eval_result: EvalCaseResult,
):
for eval_metric_result in eval_result.overall_eval_metric_results:
console.print(
f"{eval_metric_result.metric_name}: {eval_metric_result.eval_status}: {eval_metric_result.score}"
)

for (
eval_metric_result_per_invocation
) in eval_result.eval_metric_result_per_invocation:
console.print(
"*************************************************************************************\n"
)
console.print(
f"USER: {self.merge_parts(eval_metric_result_per_invocation.actual_invocation.user_content.parts)}"
)
console.print(
f"ASSISTANT: {self.merge_parts(eval_metric_result_per_invocation.actual_invocation.final_response.parts)}"
)
for (
eval_metric_result
) in eval_metric_result_per_invocation.eval_metric_results:
console.print(
f"{eval_metric_result.metric_name}: {eval_metric_result.eval_status}: {eval_metric_result.score}"
)
console.print(
"*************************************************************************************\n"
)

def process_results(
self,
eval_results_by_eval_id: dict[str, list],
):
for eval_id, eval_results in eval_results_by_eval_id.items():
console.print(f"Processing eval results for eval id: {eval_id}")
for eval_result in eval_results:
self.process_eval_result(eval_result)
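Since adkorchestrator.py is not among the hunks shown here, the exact call path from ADKOrchestrator into this evaluator may differ; the sketch below drives ADKEvaluator directly with the example run config, purely to show how the pieces in this diff fit together (job_id is a placeholder).

import datetime
import yaml

from dataset.dataset import load_dataset_from_json
from evaluator.adkevaluator import ADKEvaluator

with open("datasets/adk-tools/example_run_config.yaml") as f:
    config = yaml.safe_load(f)

# For adk-format this returns a flat list of EvalADKRequest objects and
# sets config["orchestrator"] = "adk" as a side effect.
dataset = load_dataset_from_json(config["dataset_config"], config)

evaluator = ADKEvaluator(config)
eval_outputs, scoring_results = evaluator.evaluate(
    dataset,
    job_id="local-debug",
    run_time=datetime.datetime.now(),
)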