Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 8 additions & 26 deletions documentation/examples/anagrams_with_genai.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,11 @@
# title: Anagrams Task with Genai/ OpenAI Api
# ---
# %%
from kaggle_benchmarks import assertions, chats, task
from kaggle_benchmarks.kaggle import model_proxy

llm_with_openai_api = model_proxy.ModelProxy(
model="google/gemini-2.5-flash",
api="openai",
)
llm_with_genai_api = model_proxy.ModelProxy(
model="google/gemini-2.5-pro",
api="genai",
)
from kaggle_benchmarks import assertions, chats, kaggle, task

llm_with_openai_api = kaggle.load_model("google/gemini-2.5-flash", api="openai")
llm_with_genai_api = kaggle.load_model("google/gemini-2.5-flash", api="genai")


def is_anagram(x: str, y: str) -> bool:
Expand All @@ -54,23 +48,11 @@ def write_anagrams(llm, word: str) -> int:
for msg in reversed(non_streaming_result.chat.messages)
if msg.sender is llm_with_genai_api
)
metadata = llm_response_message._meta

assert "input_tokens" in metadata, "Metadata is missing 'input_tokens' key"
assert "output_tokens" in metadata, "Metadata is missing 'output_tokens' key"
# %%
llm_with_genai_api.stream_responses = True
usage = llm_response_message.usage

streaming_result = write_anagrams.run(llm_with_genai_api, "creative")

llm_response_message_stream = next(
msg
for msg in reversed(streaming_result.chat.messages)
if msg.sender is llm_with_genai_api
)
metadata_stream = llm_response_message_stream._meta
assert "input_tokens" in metadata_stream
assert "output_tokens" in metadata_stream
assert usage, "Metadata is missing 'usage' attribute"
assert usage.input_tokens > 0, "usage is missing 'input_tokens' key"
assert usage.output_tokens > 0, "usage is missing 'output_tokens' key"


# %%
Expand Down
74 changes: 74 additions & 0 deletions documentation/examples/guess_the_number.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Copyright 2026 Kaggle Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# %% [markdown]
# ---
# title: Example of a game that requires tool use.
# ---

# %%
import random

import kaggle_benchmarks as kbench
from kaggle_benchmarks.kaggle import models

# Target value for the game. random.randint includes both endpoints, so the
# secret lies in 1..10 — the same range announced in the game prompt.
SECRET_NUMBER = random.randint(1, 10)


def guess_number(guess: int) -> str:
    """Make a guess in the number guessing game."""
    # NOTE(review): this function is handed to the model via `tools=[...]`, so
    # the docstring above is presumably surfaced as the tool description and is
    # kept word-for-word — confirm against the library before rewording it.
    #
    # Returns a hint relative to SECRET_NUMBER: "Higher" when the guess is too
    # low, "Lower" when it is too high, and "Correct!" otherwise.
    if guess < SECRET_NUMBER:
        return "Higher"
    return "Lower" if guess > SECRET_NUMBER else "Correct!"


@kbench.task(name="guess-the-number-game")
def play_game(llm):
    # Task body: ask `llm` for an integer guess with `guess_number` offered as a
    # tool, re-prompt a bounded number of times, then assert that the last
    # response equals SECRET_NUMBER.
    # (Documented with comments rather than a docstring so the decorated
    # function's metadata stays exactly as it was.)
    opening = "I'm thinking of a number between 1 and 10. Can you guess it?"
    response = llm.prompt(opening, schema=int, tools=[guess_number])

    # At most four follow-up prompts; stop as soon as the response matches.
    # NOTE(review): the previous response itself is passed back as the next
    # prompt — confirm this is intended rather than a textual hint.
    follow_ups_left = 4
    while follow_ups_left > 0 and response != SECRET_NUMBER:
        response = llm.prompt(response, schema=int, tools=[guess_number])
        follow_ups_left -= 1

    kbench.assertions.assert_equal(
        SECRET_NUMBER,
        response,
        expectation=f"LLM should have guessed the secret number. The secret number was {SECRET_NUMBER}",
    )


# %%

# Load the model whose name is kbench.llm.name through the "genai" API and
# run the guessing-game task against it.
llm_with_genai_api = models.load_model(
model_name=kbench.llm.name,
api="genai",
)

play_game.run(llm=llm_with_genai_api)

# %%

# Load the model whose name is kbench.llm.name through the "openai" API and
# run the guessing-game task against it.
llm_with_openai_api = models.load_model(
model_name=kbench.llm.name,
api="openai",
)

# The model is passed positionally here.
play_game.run(llm_with_openai_api)

# %%
86 changes: 0 additions & 86 deletions documentation/examples/prompt_with_tools.py

This file was deleted.

74 changes: 12 additions & 62 deletions documentation/examples/use_calculator_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,16 @@

# %% [markdown]
# ---
# title: Manual Calculator Tool Calling
# title: Calculator Tool
# ---
# %%
import json

from kaggle_benchmarks import actors, assertions, llm, messages, task
from kaggle_benchmarks import actors, assertions, llm, task

tool = actors.Actor(name="Tool", role="tool", avatar="🛠️")


def run_simple_calculator(a: float, b: float, operator: str) -> float:
"""Calculates the result of an arithmetic operation like +, -, *, or /."""
if operator == "+":
return a + b
if operator == "-":
Expand All @@ -37,72 +36,23 @@ def run_simple_calculator(a: float, b: float, operator: str) -> float:


@task("Calculator Tool Use")
def use_calculator(
llm, problem: str, expected_answer: float, stream_mode: bool = False
) -> None:
calculator_tool = {
"type": "function",
"function": {
"name": "simple_calculator",
"description": "Calculates the result of an arithmetic operation.",
"parameters": {
"type": "object",
"properties": {
"a": {"type": "number", "description": "The first number."},
"b": {"type": "number", "description": "The second number."},
"operator": {
"type": "string",
"description": "The operator (+, -, *, /).",
},
},
"required": ["a", "b", "operator"],
},
},
}
llm.stream_responses = stream_mode

actors.user.send(problem)

tool_call_msg = llm.respond(tools=[calculator_tool])
tool_calls = tool_call_msg.tool_calls
assertions.assert_true(
bool(tool_calls), "LLM was expected to call a tool, but it did not."
)

tool_call = tool_calls[0]
function_args = json.loads(tool_call["function"]["arguments"])
# Removes 'signature' parameter in thinking mode.
function_args.pop("signature", None)
tool_result = ""
try:
tool_result = run_simple_calculator(**function_args)
except Exception as e:
tool_result = f"Error executing tool: {type(e).__name__} - {e}"

tool.send(
messages.Message(
sender=tool,
content=str(tool_result),
_meta={"tool_call_id": tool_call["id"]},
)
def use_calculator(llm, problem: str, expected_answer: float) -> None:
final_answer = llm.prompt(problem, tools=[run_simple_calculator])
assertions.assert_tool_was_invoked(
run_simple_calculator, "LLM was expected to call a tool, but it did not."
)

final_answer_msg = llm.respond()
final_answer = final_answer_msg.content

assertions.assert_true(
str(expected_answer) in final_answer,
f"Expected '{expected_answer}' to be in the final answer, but got '{final_answer}'.",
str(expected_answer) in answer,
f"Expected '{expected_answer}' to be in the final answer, but got '{answer}'.",
)


# %%

problem = "What is 485 multiplied by 12?"
expected = 485 * 12

# %%
use_calculator.run(llm, problem=problem, expected_answer=expected, stream_mode=True)

# %%
use_calculator.run(llm, problem=problem, expected_answer=expected, stream_mode=False)
use_calculator.run(llm, problem=problem, expected_answer=expected)

# %%
6 changes: 5 additions & 1 deletion golden_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,11 @@ def module_report_fixture(request):
model_report = report.setdefault(
f"{api}://{llm.name}",
{
"config": {"structured_output": llm.support_structured_outputs},
"config": {
"structured_output": llm.support_structured_outputs,
"tools": llm.support_tool_calling,
"vision": llm.support_vision,
},
"tests": {},
},
)
Expand Down
Loading