Kaggle · s-alexey · Apr 1, 2026 · Feb 14, 2026
diff --git a/documentation/examples/anagrams_with_genai.py b/documentation/examples/anagrams_with_genai.py
@@ -17,17 +17,11 @@
 # title: Anagrams Task with Genai/ OpenAI Api
 # ---
 # %%
-from kaggle_benchmarks import assertions, chats, task
-from kaggle_benchmarks.kaggle import model_proxy
 
-llm_with_openai_api = model_proxy.ModelProxy(
-    model="google/gemini-2.5-flash",
-    api="openai",
-)
-llm_with_genai_api = model_proxy.ModelProxy(
-    model="google/gemini-2.5-pro",
-    api="genai",
-)
+from kaggle_benchmarks import assertions, chats, kaggle, task
+
+llm_with_openai_api = kaggle.load_model("google/gemini-2.5-flash", api="openai")
+llm_with_genai_api = kaggle.load_model("google/gemini-2.5-flash", api="genai")
 
 
 def is_anagram(x: str, y: str) -> bool:
@@ -54,23 +48,11 @@ def write_anagrams(llm, word: str) -> int:
     for msg in reversed(non_streaming_result.chat.messages)
     if msg.sender is llm_with_genai_api
 )
-metadata = llm_response_message._meta
-
-assert "input_tokens" in metadata, "Metadata is missing 'input_tokens' key"
-assert "output_tokens" in metadata, "Metadata is missing 'output_tokens' key"
-# %%
-llm_with_genai_api.stream_responses = True
+usage = llm_response_message.usage
 
-streaming_result = write_anagrams.run(llm_with_genai_api, "creative")
-
-llm_response_message_stream = next(
-    msg
-    for msg in reversed(streaming_result.chat.messages)
-    if msg.sender is llm_with_genai_api
-)
-metadata_stream = llm_response_message_stream._meta
-assert "input_tokens" in metadata_stream
-assert "output_tokens" in metadata_stream
+assert usage, "LLM response message has no 'usage' data"
+assert usage.input_tokens > 0, "usage.input_tokens should be positive"
+assert usage.output_tokens > 0, "usage.output_tokens should be positive"
 
 
 # %%

diff --git a/documentation/examples/guess_the_number.py b/documentation/examples/guess_the_number.py
@@ -0,0 +1,74 @@
+# Copyright 2026 Kaggle Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# %% [markdown]
+# ---
+# title: Example of a game that requires tool use.
+# ---
+
+# %%
+import random
+
+import kaggle_benchmarks as kbench
+from kaggle_benchmarks.kaggle import models
+
+SECRET_NUMBER = random.randint(1, 10)
+
+
+def guess_number(guess: int) -> str:
+    """Make a guess in the number guessing game."""
+    if guess < SECRET_NUMBER:
+        return "Higher"
+    if guess > SECRET_NUMBER:
+        return "Lower"
+    # Neither below nor above the secret number.
+    return "Correct!"
+
+
+@kbench.task(name="guess-the-number-game")
+def play_game(llm):
+    # NOTE(review): retries pass the previous guess itself as the next prompt.
+    opening = "I'm thinking of a number between 1 and 10. Can you guess it?"
+    response = llm.prompt(opening, schema=int, tools=[guess_number])
+    retries_left = 4
+    while retries_left and response != SECRET_NUMBER:
+        response = llm.prompt(response, schema=int, tools=[guess_number])
+        retries_left -= 1
+
+    kbench.assertions.assert_equal(
+        SECRET_NUMBER,
+        response,
+        expectation=f"LLM should have guessed the secret number. The secret number was {SECRET_NUMBER}",
+    )
+
+
+# %%
+# Run the game with kbench.llm's model loaded through api="genai".
+llm_with_genai_api = models.load_model(
+    model_name=kbench.llm.name,
+    api="genai",
+)
+
+play_game.run(llm_with_genai_api)
+
+# %%
+# Run the same game with the model loaded through api="openai".
+llm_with_openai_api = models.load_model(
+    model_name=kbench.llm.name,
+    api="openai",
+)
+
+play_game.run(llm_with_openai_api)
+
+# %%
diff --git a/documentation/examples/prompt_with_tools.py b/documentation/examples/prompt_with_tools.py
diff --git a/documentation/examples/use_calculator_tool.py b/documentation/examples/use_calculator_tool.py
@@ -14,17 +14,16 @@
 
 # %% [markdown]
 # ---
-# title: Manual Calculator Tool Calling
+# title: Calculator Tool
 # ---
 # %%
-import json
-
-from kaggle_benchmarks import actors, assertions, llm, messages, task
+from kaggle_benchmarks import actors, assertions, llm, task
 
 tool = actors.Actor(name="Tool", role="tool", avatar="🛠️")
 
 
 def run_simple_calculator(a: float, b: float, operator: str) -> float:
+    """Calculates the result of an arithmetic operation like +, -, *, or /."""
     if operator == "+":
         return a + b
     if operator == "-":
@@ -37,72 +36,23 @@ def run_simple_calculator(a: float, b: float, operator: str) -> float:
 
 
 @task("Calculator Tool Use")
-def use_calculator(
-    llm, problem: str, expected_answer: float, stream_mode: bool = False
-) -> None:
-    calculator_tool = {
-        "type": "function",
-        "function": {
-            "name": "simple_calculator",
-            "description": "Calculates the result of an arithmetic operation.",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "a": {"type": "number", "description": "The first number."},
-                    "b": {"type": "number", "description": "The second number."},
-                    "operator": {
-                        "type": "string",
-                        "description": "The operator (+, -, *, /).",
-                    },
-                },
-                "required": ["a", "b", "operator"],
-            },
-        },
-    }
-    llm.stream_responses = stream_mode
-
-    actors.user.send(problem)
-
-    tool_call_msg = llm.respond(tools=[calculator_tool])
-    tool_calls = tool_call_msg.tool_calls
-    assertions.assert_true(
-        bool(tool_calls), "LLM was expected to call a tool, but it did not."
-    )
-
-    tool_call = tool_calls[0]
-    function_args = json.loads(tool_call["function"]["arguments"])
-    # Removes 'signature' parameter in thinking mode.
-    function_args.pop("signature", None)
-    tool_result = ""
-    try:
-        tool_result = run_simple_calculator(**function_args)
-    except Exception as e:
-        tool_result = f"Error executing tool: {type(e).__name__} - {e}"
-
-    tool.send(
-        messages.Message(
-            sender=tool,
-            content=str(tool_result),
-            _meta={"tool_call_id": tool_call["id"]},
-        )
+def use_calculator(llm, problem: str, expected_answer: float) -> None:
+    final_answer = llm.prompt(problem, tools=[run_simple_calculator])
+    assertions.assert_tool_was_invoked(
+        run_simple_calculator, "LLM was expected to call a tool, but it did not."
     )
 
-    final_answer_msg = llm.respond()
-    final_answer = final_answer_msg.content
-
     assertions.assert_true(
         str(expected_answer) in final_answer,
         f"Expected '{expected_answer}' to be in the final answer, but got '{final_answer}'.",
     )
 
 
+# %%
+
 problem = "What is 485 multiplied by 12?"
 expected = 485 * 12
 
-# %%
-use_calculator.run(llm, problem=problem, expected_answer=expected, stream_mode=True)
-
-# %%
-use_calculator.run(llm, problem=problem, expected_answer=expected, stream_mode=False)
+use_calculator.run(llm, problem=problem, expected_answer=expected)
 
 # %%
diff --git a/golden_tests/conftest.py b/golden_tests/conftest.py
@@ -54,7 +54,11 @@ def module_report_fixture(request):
             model_report = report.setdefault(
                 f"{api}://{llm.name}",
                 {
-                    "config": {"structured_output": llm.support_structured_outputs},
+                    "config": {
+                        "structured_output": llm.support_structured_outputs,
+                        "tools": llm.support_tool_calling,
+                        "vision": llm.support_vision,
+                    },
                     "tests": {},
                 },
             )