From 52a5b2d8249edb0b0b7459f2ecdaccc4752df83d Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Tue, 17 Feb 2026 21:03:09 -0500 Subject: [PATCH 01/18] yutori templates: update model to n1-latest, viewport to 1280, role to user - Model: n1-preview-2025-11 -> n1-latest - Viewport width: 1200 -> 1280 (Yutori's recommended resolution) - Message role: observation -> user for screenshots and tool results - Clean up JSDoc/docstring comments referencing old defaults Co-authored-by: Cursor --- .../python/yutori-computer-use/loop.py | 21 +++++++------- .../python/yutori-computer-use/main.py | 2 +- .../python/yutori-computer-use/session.py | 2 +- .../yutori-computer-use/tools/computer.py | 2 +- .../typescript/yutori-computer-use/index.ts | 2 +- .../typescript/yutori-computer-use/loop.ts | 28 +++++++++---------- .../typescript/yutori-computer-use/session.ts | 6 ++-- .../yutori-computer-use/tools/computer.ts | 2 +- 8 files changed, 31 insertions(+), 34 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/loop.py b/pkg/templates/python/yutori-computer-use/loop.py index 236d4ad1..45ecece9 100644 --- a/pkg/templates/python/yutori-computer-use/loop.py +++ b/pkg/templates/python/yutori-computer-use/loop.py @@ -3,9 +3,8 @@ Implements the agent loop for Yutori's n1 computer use model. n1 uses an OpenAI-compatible API with specific conventions: -- Screenshots use role: "observation" (not "user") +- Screenshots and tool results are sent with role: "user" - Coordinates are returned in 1000x1000 space and need scaling -- WebP format recommended for screenshots Supports two modes: - computer_use: Uses Kernel's Computer Controls API (full VM screenshots) @@ -41,7 +40,7 @@ async def screenshot(self) -> ToolResult: async def sampling_loop( *, - model: str = "n1-preview-2025-11", + model: str = "n1-latest", task: str, api_key: str, kernel: Kernel, @@ -49,7 +48,7 @@ async def sampling_loop( cdp_ws_url: Optional[str] = None, max_tokens: int = 4096, max_iterations: int = 50, - viewport_width: int = 1200, + viewport_width: int = 1280, viewport_height: int = 800, mode: BrowserMode = "computer_use", ) -> dict[str, Any]: @@ -88,7 +87,7 @@ async def sampling_loop( if initial_screenshot.get("base64_image"): conversation_messages.append({ - "role": "observation", + "role": "user", "content": [ { "type": "image_url", @@ -157,16 +156,16 @@ async def sampling_loop( result = {"error": str(e)} if result.get("base64_image") or result.get("output"): - observation_content = [] + result_content = [] if result.get("output"): - observation_content.append({ + result_content.append({ "type": "text", "text": result["output"], }) if result.get("base64_image"): - observation_content.append({ + result_content.append({ "type": "image_url", "image_url": { "url": f"data:image/png;base64,{result['base64_image']}" @@ -174,12 +173,12 @@ async def sampling_loop( }) conversation_messages.append({ - "role": "observation", - "content": observation_content, + "role": "user", + "content": result_content, }) elif result.get("error"): conversation_messages.append({ - "role": "observation", + "role": "user", "content": [{"type": "text", "text": f"Action failed: {result['error']}"}], }) diff --git a/pkg/templates/python/yutori-computer-use/main.py b/pkg/templates/python/yutori-computer-use/main.py index d909c67f..91633288 100644 --- a/pkg/templates/python/yutori-computer-use/main.py +++ b/pkg/templates/python/yutori-computer-use/main.py @@ -59,7 +59,7 @@ async def cua_task( print("Kernel browser live view url:", session.live_view_url) loop_result = await sampling_loop( - model="n1-preview-2025-11", + model="n1-latest", task=payload["query"], api_key=str(api_key), kernel=session.kernel, diff --git a/pkg/templates/python/yutori-computer-use/session.py b/pkg/templates/python/yutori-computer-use/session.py index 1c449ec9..42dc0177 100644 --- a/pkg/templates/python/yutori-computer-use/session.py +++ b/pkg/templates/python/yutori-computer-use/session.py @@ -32,7 +32,7 @@ class KernelBrowserSession: stealth: bool = True timeout_seconds: int = 300 - viewport_width: int = 1200 + viewport_width: int = 1280 viewport_height: int = 800 # Replay recording options diff --git a/pkg/templates/python/yutori-computer-use/tools/computer.py b/pkg/templates/python/yutori-computer-use/tools/computer.py index 44601616..e72f191a 100644 --- a/pkg/templates/python/yutori-computer-use/tools/computer.py +++ b/pkg/templates/python/yutori-computer-use/tools/computer.py @@ -91,7 +91,7 @@ class N1Action(TypedDict, total=False): class ComputerTool: - def __init__(self, kernel: Kernel, session_id: str, width: int = 1200, height: int = 800): + def __init__(self, kernel: Kernel, session_id: str, width: int = 1280, height: int = 800): self.kernel = kernel self.session_id = session_id self.width = width diff --git a/pkg/templates/typescript/yutori-computer-use/index.ts b/pkg/templates/typescript/yutori-computer-use/index.ts index afe51bab..a4aee2f8 100644 --- a/pkg/templates/typescript/yutori-computer-use/index.ts +++ b/pkg/templates/typescript/yutori-computer-use/index.ts @@ -50,7 +50,7 @@ app.action( // Run the sampling loop const mode = payload.mode ?? 'computer_use'; const { finalAnswer, messages } = await samplingLoop({ - model: 'n1-preview-2025-11', + model: 'n1-latest', task: payload.query, apiKey: YUTORI_API_KEY, kernel, diff --git a/pkg/templates/typescript/yutori-computer-use/loop.ts b/pkg/templates/typescript/yutori-computer-use/loop.ts index 351aa9c1..5e7c44cc 100644 --- a/pkg/templates/typescript/yutori-computer-use/loop.ts +++ b/pkg/templates/typescript/yutori-computer-use/loop.ts @@ -3,9 +3,8 @@ * * Implements the agent loop for Yutori's n1 computer use model. * n1 uses an OpenAI-compatible API with specific conventions: - * - Screenshots use role: "observation" (not "user") + * - Screenshots and tool results are sent with role: "user" * - Coordinates are returned in 1000x1000 space and need scaling - * - WebP format recommended for screenshots * * Supports two modes: * - computer_use: Uses Kernel's Computer Controls API (full VM screenshots) @@ -31,7 +30,7 @@ interface N1ComputerTool { // Per docs: "we generally do not recommend providing custom system prompts" interface Message { - role: 'user' | 'assistant' | 'observation'; + role: 'user' | 'assistant'; content: string | MessageContent[]; } @@ -53,9 +52,9 @@ interface SamplingLoopOptions { cdpWsUrl?: string; maxTokens?: number; maxIterations?: number; - /** Viewport width for coordinate scaling (default: 1200, closest to Yutori's 1280 recommendation) */ + /** Viewport width for coordinate scaling */ viewportWidth?: number; - /** Viewport height for coordinate scaling (default: 800 per Yutori recommendation) */ + /** Viewport height for coordinate scaling */ viewportHeight?: number; /** * Browser interaction mode: @@ -72,7 +71,7 @@ interface SamplingLoopResult { } export async function samplingLoop({ - model = 'n1-preview-2025-11', + model = 'n1-latest', task, apiKey, kernel, @@ -80,8 +79,7 @@ export async function samplingLoop({ cdpWsUrl, maxTokens = 4096, maxIterations = 50, - // Default viewport: 1200x800 (closest Kernel-supported size to Yutori's recommended 1280x800) - viewportWidth = 1200, + viewportWidth = 1280, viewportHeight = 800, mode = 'computer_use', }: SamplingLoopOptions): Promise { @@ -121,7 +119,7 @@ export async function samplingLoop({ if (initialScreenshot.base64Image) { conversationMessages.push({ - role: 'observation', + role: 'user', content: [ { type: 'image_url', @@ -200,17 +198,17 @@ export async function samplingLoop({ } if (result.base64Image || result.output) { - const observationContent: MessageContent[] = []; + const resultContent: MessageContent[] = []; if (result.output) { - observationContent.push({ + resultContent.push({ type: 'text', text: result.output, }); } if (result.base64Image) { - observationContent.push({ + resultContent.push({ type: 'image_url', image_url: { url: `data:image/png;base64,${result.base64Image}`, @@ -219,12 +217,12 @@ export async function samplingLoop({ } conversationMessages.push({ - role: 'observation', - content: observationContent, + role: 'user', + content: resultContent, }); } else if (result.error) { conversationMessages.push({ - role: 'observation', + role: 'user', content: [{ type: 'text', text: `Action failed: ${result.error}` }], }); } diff --git a/pkg/templates/typescript/yutori-computer-use/session.ts b/pkg/templates/typescript/yutori-computer-use/session.ts index 2ba59697..d3324f0a 100644 --- a/pkg/templates/typescript/yutori-computer-use/session.ts +++ b/pkg/templates/typescript/yutori-computer-use/session.ts @@ -16,9 +16,9 @@ export interface SessionOptions { recordReplay?: boolean; /** Grace period in seconds before stopping replay */ replayGracePeriod?: number; - /** Viewport width (default: 1200, closest to Yutori's 1280 recommendation) */ + /** Viewport width */ viewportWidth?: number; - /** Viewport height (default: 800 per Yutori recommendation) */ + /** Viewport height */ viewportHeight?: number; } @@ -37,7 +37,7 @@ const DEFAULT_OPTIONS: Required = { timeoutSeconds: 300, recordReplay: false, replayGracePeriod: 5.0, - viewportWidth: 1200, + viewportWidth: 1280, viewportHeight: 800, }; diff --git a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts index 46fd76ef..e9cdaf35 100644 --- a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts @@ -98,7 +98,7 @@ export class ComputerTool { private width: number; private height: number; - constructor(kernel: Kernel, sessionId: string, width = 1200, height = 800) { + constructor(kernel: Kernel, sessionId: string, width = 1280, height = 800) { this.kernel = kernel; this.sessionId = sessionId; this.width = width; From 9cd6181f3e094a56e60bbc210b4e13888e4b4b99 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Tue, 17 Feb 2026 21:05:48 -0500 Subject: [PATCH 02/18] yutori templates: remove Playwright mode Simplify templates to only support Kernel's Computer Controls API. Removes playwright-computer.ts/py, BrowserMode type, cdpWsUrl/mode from loop options and entrypoint payloads, and playwright deps. Co-authored-by: Cursor --- .../python/yutori-computer-use/loop.py | 256 ++++++------ .../python/yutori-computer-use/main.py | 9 +- .../python/yutori-computer-use/pyproject.toml | 1 - .../yutori-computer-use/tools/__init__.py | 2 - .../tools/playwright_computer.py | 307 --------------- .../typescript/yutori-computer-use/index.ts | 11 +- .../typescript/yutori-computer-use/loop.ts | 254 +++++------- .../yutori-computer-use/package.json | 3 +- .../tools/playwright-computer.ts | 363 ------------------ 9 files changed, 215 insertions(+), 991 deletions(-) delete mode 100644 pkg/templates/python/yutori-computer-use/tools/playwright_computer.py delete mode 100644 pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts diff --git a/pkg/templates/python/yutori-computer-use/loop.py b/pkg/templates/python/yutori-computer-use/loop.py index 45ecece9..55a76695 100644 --- a/pkg/templates/python/yutori-computer-use/loop.py +++ b/pkg/templates/python/yutori-computer-use/loop.py @@ -6,36 +6,17 @@ - Screenshots and tool results are sent with role: "user" - Coordinates are returned in 1000x1000 space and need scaling -Supports two modes: -- computer_use: Uses Kernel's Computer Controls API (full VM screenshots) -- playwright: Uses Playwright via CDP (viewport-only screenshots, optimized for n1) - @see https://docs.yutori.com/reference/n1 """ import json import re -from typing import Any, Literal, Optional, Protocol +from typing import Any, Optional from kernel import Kernel from openai import OpenAI from tools import ComputerTool, N1Action, ToolResult -from tools.playwright_computer import PlaywrightComputerTool - -# Mode for browser interaction -BrowserMode = Literal["computer_use", "playwright"] - - -class N1ComputerToolProtocol(Protocol): - async def execute(self, action: N1Action) -> ToolResult: - ... - - async def screenshot(self) -> ToolResult: - ... - -# n1 uses its own system prompt - custom prompts may degrade performance -# Per docs: "we generally do not recommend providing custom system prompts" async def sampling_loop( @@ -45,12 +26,10 @@ async def sampling_loop( api_key: str, kernel: Kernel, session_id: str, - cdp_ws_url: Optional[str] = None, max_tokens: int = 4096, max_iterations: int = 50, viewport_width: int = 1280, viewport_height: int = 800, - mode: BrowserMode = "computer_use", ) -> dict[str, Any]: """Run the n1 sampling loop until the model returns a stop action or max iterations.""" client = OpenAI( @@ -58,140 +37,121 @@ async def sampling_loop( base_url="https://api.yutori.com/v1", ) - computer_tool: N1ComputerToolProtocol - playwright_tool: Optional[PlaywrightComputerTool] = None - - print(f"Mode requested: {mode!r}, cdp_ws_url available: {cdp_ws_url is not None}") + computer_tool = ComputerTool(kernel, session_id, viewport_width, viewport_height) - if mode == "playwright": - if not cdp_ws_url: - raise ValueError("cdp_ws_url is required for playwright mode") - print(f"Connecting to CDP WebSocket: {cdp_ws_url[:50]}...") - playwright_tool = PlaywrightComputerTool(cdp_ws_url, viewport_width, viewport_height) - await playwright_tool.connect() - computer_tool = playwright_tool - print("Using playwright mode (viewport-only screenshots)") - else: - computer_tool = ComputerTool(kernel, session_id, viewport_width, viewport_height) - print("Using computer_use mode (Computer Controls API)") + initial_screenshot = await computer_tool.screenshot() - try: - initial_screenshot = await computer_tool.screenshot() - - conversation_messages: list[dict[str, Any]] = [ - { - "role": "user", - "content": [{"type": "text", "text": task}], - } - ] + conversation_messages: list[dict[str, Any]] = [ + { + "role": "user", + "content": [{"type": "text", "text": task}], + } + ] + + if initial_screenshot.get("base64_image"): + conversation_messages.append({ + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{initial_screenshot['base64_image']}" + }, + } + ], + }) + + iteration = 0 + final_answer: Optional[str] = None + + while iteration < max_iterations: + iteration += 1 + print(f"\n=== Iteration {iteration} ===") + + try: + response = client.chat.completions.create( + model=model, + messages=conversation_messages, + max_tokens=max_tokens, + temperature=0.3, + ) + except Exception as api_error: + print(f"API call failed: {api_error}") + raise api_error + + if not response.choices or len(response.choices) == 0: + print(f"No choices in response: {response}") + raise ValueError("No choices in API response") + + assistant_message = response.choices[0].message + if not assistant_message: + raise ValueError("No response from model") + + response_content = assistant_message.content or "" + print("Assistant response:", response_content) + + conversation_messages.append({ + "role": "assistant", + "content": response_content, + }) + + parsed = _parse_n1_response(response_content) + + if not parsed or not parsed.get("actions"): + print("No actions found in response, ending loop") + break + + for action in parsed["actions"]: + print(f"Executing action: {action.get('action_type')}", action) + + if action.get("action_type") == "stop": + final_answer = action.get("answer") + print(f"Stop action received, final answer: {final_answer}") + return {"messages": conversation_messages, "final_answer": final_answer} + + scaled_action = _scale_coordinates(action, viewport_width, viewport_height) + + result: ToolResult + try: + result = await computer_tool.execute(scaled_action) + except Exception as e: + print(f"Action failed: {e}") + result = {"error": str(e)} + + if result.get("base64_image") or result.get("output"): + result_content = [] + + if result.get("output"): + result_content.append({ + "type": "text", + "text": result["output"], + }) - if initial_screenshot.get("base64_image"): - conversation_messages.append({ - "role": "user", - "content": [ - { + if result.get("base64_image"): + result_content.append({ "type": "image_url", "image_url": { - "url": f"data:image/png;base64,{initial_screenshot['base64_image']}" + "url": f"data:image/png;base64,{result['base64_image']}" }, - } - ], - }) - - iteration = 0 - final_answer: Optional[str] = None - - while iteration < max_iterations: - iteration += 1 - print(f"\n=== Iteration {iteration} ===") - - try: - response = client.chat.completions.create( - model=model, - messages=conversation_messages, - max_tokens=max_tokens, - temperature=0.3, - ) - except Exception as api_error: - print(f"API call failed: {api_error}") - raise api_error - - if not response.choices or len(response.choices) == 0: - print(f"No choices in response: {response}") - raise ValueError("No choices in API response") - - assistant_message = response.choices[0].message - if not assistant_message: - raise ValueError("No response from model") - - response_content = assistant_message.content or "" - print("Assistant response:", response_content) - - conversation_messages.append({ - "role": "assistant", - "content": response_content, - }) - - parsed = _parse_n1_response(response_content) - - if not parsed or not parsed.get("actions"): - print("No actions found in response, ending loop") - break - - for action in parsed["actions"]: - print(f"Executing action: {action.get('action_type')}", action) - - if action.get("action_type") == "stop": - final_answer = action.get("answer") - print(f"Stop action received, final answer: {final_answer}") - return {"messages": conversation_messages, "final_answer": final_answer} - - scaled_action = _scale_coordinates(action, viewport_width, viewport_height) - - result: ToolResult - try: - result = await computer_tool.execute(scaled_action) - except Exception as e: - print(f"Action failed: {e}") - result = {"error": str(e)} - - if result.get("base64_image") or result.get("output"): - result_content = [] - - if result.get("output"): - result_content.append({ - "type": "text", - "text": result["output"], - }) - - if result.get("base64_image"): - result_content.append({ - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{result['base64_image']}" - }, - }) - - conversation_messages.append({ - "role": "user", - "content": result_content, - }) - elif result.get("error"): - conversation_messages.append({ - "role": "user", - "content": [{"type": "text", "text": f"Action failed: {result['error']}"}], }) - if iteration >= max_iterations: - print("Max iterations reached") - - return { - "messages": conversation_messages, - "final_answer": final_answer, - } - finally: - if playwright_tool: - await playwright_tool.disconnect() + conversation_messages.append({ + "role": "user", + "content": result_content, + }) + elif result.get("error"): + conversation_messages.append({ + "role": "user", + "content": [{"type": "text", "text": f"Action failed: {result['error']}"}], + }) + + if iteration >= max_iterations: + print("Max iterations reached") + + return { + "messages": conversation_messages, + "final_answer": final_answer, + } def _parse_n1_response(content: str) -> Optional[dict[str, Any]]: diff --git a/pkg/templates/python/yutori-computer-use/main.py b/pkg/templates/python/yutori-computer-use/main.py index 91633288..a86e3a63 100644 --- a/pkg/templates/python/yutori-computer-use/main.py +++ b/pkg/templates/python/yutori-computer-use/main.py @@ -2,17 +2,13 @@ from typing import Optional, TypedDict import kernel -from loop import sampling_loop, BrowserMode +from loop import sampling_loop from session import KernelBrowserSession class QueryInput(TypedDict): query: str record_replay: Optional[bool] - # Browser interaction mode: - # - computer_use: Uses Kernel's Computer Controls API (full VM screenshots) - default - # - playwright: Uses Playwright via CDP (viewport-only screenshots, optimized for n1) - mode: Optional[BrowserMode] class QueryOutput(TypedDict): @@ -50,7 +46,6 @@ async def cua_task( raise ValueError("Query is required") record_replay = payload.get("record_replay", False) - mode: BrowserMode = payload.get("mode") or "computer_use" async with KernelBrowserSession( stealth=True, @@ -64,10 +59,8 @@ async def cua_task( api_key=str(api_key), kernel=session.kernel, session_id=str(session.session_id), - cdp_ws_url=session.cdp_ws_url, viewport_width=session.viewport_width, viewport_height=session.viewport_height, - mode=mode, ) final_answer = loop_result.get("final_answer") diff --git a/pkg/templates/python/yutori-computer-use/pyproject.toml b/pkg/templates/python/yutori-computer-use/pyproject.toml index cca32fc7..31876ef5 100644 --- a/pkg/templates/python/yutori-computer-use/pyproject.toml +++ b/pkg/templates/python/yutori-computer-use/pyproject.toml @@ -7,5 +7,4 @@ dependencies = [ "openai>=1.58.0", "kernel>=0.24.0", "python-dotenv>=1.2.1", - "playwright>=1.52.0", ] diff --git a/pkg/templates/python/yutori-computer-use/tools/__init__.py b/pkg/templates/python/yutori-computer-use/tools/__init__.py index b01c1a2b..63da5188 100644 --- a/pkg/templates/python/yutori-computer-use/tools/__init__.py +++ b/pkg/templates/python/yutori-computer-use/tools/__init__.py @@ -2,12 +2,10 @@ from .base import ToolError, ToolResult from .computer import ComputerTool, N1Action -from .playwright_computer import PlaywrightComputerTool __all__ = [ "ToolError", "ToolResult", "ComputerTool", "N1Action", - "PlaywrightComputerTool", ] diff --git a/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py b/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py deleted file mode 100644 index df98628a..00000000 --- a/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py +++ /dev/null @@ -1,307 +0,0 @@ -""" -Yutori n1 Playwright Computer Tool - -Maps n1 action format to Playwright methods via CDP WebSocket connection. -Uses viewport-only screenshots optimized for Yutori n1's training data. - -See: https://docs.yutori.com/reference/n1#screenshot-requirements -""" - -import asyncio -import base64 -import json -from typing import Optional - -from playwright.async_api import async_playwright, Browser, BrowserContext, Page - -from .base import ToolError, ToolResult -from .computer import N1Action - -# Delay after actions before taking screenshot (in seconds for asyncio.sleep) -# Matches TypeScript SCREENSHOT_DELAY_MS = 300 (300ms = 0.3s) -SCREENSHOT_DELAY_S = 0.3 - -# Key mappings from n1 output format to Playwright format -KEY_MAP = { - "Return": "Enter", - "BackSpace": "Backspace", - "Page_Up": "PageUp", - "Page_Down": "PageDown", -} - -MODIFIER_MAP = { - "ctrl": "Control", - "super": "Meta", - "command": "Meta", - "cmd": "Meta", -} - - -class PlaywrightComputerTool: - def __init__(self, cdp_ws_url: str, width: int = 1200, height: int = 800): - self.cdp_ws_url = cdp_ws_url - self.width = width - self.height = height - self._playwright = None - self._browser: Optional[Browser] = None - self._context: Optional[BrowserContext] = None - self._page: Optional[Page] = None - - async def connect(self) -> None: - if self._browser: - return # Already connected - - self._playwright = await async_playwright().start() - self._browser = await self._playwright.chromium.connect_over_cdp(self.cdp_ws_url) - - # Get existing context or create new one - contexts = self._browser.contexts - self._context = contexts[0] if contexts else await self._browser.new_context() - - # Handle new page events - self._context.on("page", self._handle_new_page) - - # Get existing page or create new one - pages = self._context.pages - self._page = pages[0] if pages else await self._context.new_page() - - # Set viewport size to Yutori's recommended dimensions - await self._page.set_viewport_size({"width": self.width, "height": self.height}) - self._page.on("close", self._handle_page_close) - - async def disconnect(self) -> None: - if self._playwright: - await self._playwright.stop() - self._playwright = None - self._browser = None - self._context = None - self._page = None - - def _handle_new_page(self, page: Page) -> None: - print("New page created") - self._page = page - page.on("close", self._handle_page_close) - - def _handle_page_close(self, closed_page: Page) -> None: - print("Page closed") - if self._page == closed_page and self._context: - pages = self._context.pages - if pages: - self._page = pages[-1] - else: - print("Warning: All pages have been closed.") - self._page = None - - def _assert_page(self) -> Page: - if not self._page: - raise ToolError("Page not available. Did you call connect()?") - return self._page - - async def execute(self, action: N1Action) -> ToolResult: - action_type = action.get("action_type") - - handlers = { - "click": self._handle_click, - "scroll": self._handle_scroll, - "type": self._handle_type, - "key_press": self._handle_key_press, - "hover": self._handle_hover, - "drag": self._handle_drag, - "wait": self._handle_wait, - "refresh": self._handle_refresh, - "go_back": self._handle_go_back, - "goto_url": self._handle_goto_url, - "read_texts_and_links": self._handle_read_texts_and_links, - "stop": self._handle_stop, - } - - handler = handlers.get(action_type) - if not handler: - raise ToolError(f"Unknown action type: {action_type}") - - return await handler(action) - - async def _handle_click(self, action: N1Action) -> ToolResult: - page = self._assert_page() - coords = self._get_coordinates(action.get("center_coordinates")) - - await page.mouse.click(coords["x"], coords["y"]) - await asyncio.sleep(SCREENSHOT_DELAY_S) - return await self.screenshot() - - async def _handle_scroll(self, action: N1Action) -> ToolResult: - page = self._assert_page() - coords = self._get_coordinates(action.get("center_coordinates")) - direction = action.get("direction") - amount = action.get("amount", 3) - - if direction not in ("up", "down", "left", "right"): - raise ToolError(f"Invalid scroll direction: {direction}") - - scroll_delta = amount * 100 - - await page.mouse.move(coords["x"], coords["y"]) - - delta_x = 0 - delta_y = 0 - - if direction == "up": - delta_y = -scroll_delta - elif direction == "down": - delta_y = scroll_delta - elif direction == "left": - delta_x = -scroll_delta - elif direction == "right": - delta_x = scroll_delta - - await page.mouse.wheel(delta_x, delta_y) - await asyncio.sleep(SCREENSHOT_DELAY_S) - return await self.screenshot() - - async def _handle_type(self, action: N1Action) -> ToolResult: - page = self._assert_page() - text = action.get("text") - if not text: - raise ToolError("text is required for type action") - - if action.get("clear_before_typing"): - await page.keyboard.press("Control+a") - await asyncio.sleep(0.1) - await page.keyboard.press("Backspace") - await asyncio.sleep(0.1) - - await page.keyboard.type(text) - - if action.get("press_enter_after"): - await asyncio.sleep(0.1) - await page.keyboard.press("Enter") - - await asyncio.sleep(SCREENSHOT_DELAY_S) - return await self.screenshot() - - async def _handle_key_press(self, action: N1Action) -> ToolResult: - page = self._assert_page() - key_comb = action.get("key_comb") - if not key_comb: - raise ToolError("key_comb is required for key_press action") - - mapped_key = self._map_key_to_playwright(key_comb) - await page.keyboard.press(mapped_key) - - await asyncio.sleep(SCREENSHOT_DELAY_S) - return await self.screenshot() - - async def _handle_hover(self, action: N1Action) -> ToolResult: - page = self._assert_page() - coords = self._get_coordinates(action.get("center_coordinates")) - - await page.mouse.move(coords["x"], coords["y"]) - - await asyncio.sleep(SCREENSHOT_DELAY_S) - return await self.screenshot() - - async def _handle_drag(self, action: N1Action) -> ToolResult: - page = self._assert_page() - start_coords = self._get_coordinates(action.get("start_coordinates")) - end_coords = self._get_coordinates(action.get("center_coordinates")) - - await page.mouse.move(start_coords["x"], start_coords["y"]) - await page.mouse.down() - await asyncio.sleep(0.05) - await page.mouse.move(end_coords["x"], end_coords["y"], steps=12) - await page.mouse.up() - - await asyncio.sleep(0.3) - return await self.screenshot() - - async def _handle_wait(self, action: N1Action) -> ToolResult: - await asyncio.sleep(2) - return await self.screenshot() - - async def _handle_refresh(self, action: N1Action) -> ToolResult: - page = self._assert_page() - await page.reload() - await asyncio.sleep(2) - return await self.screenshot() - - async def _handle_go_back(self, action: N1Action) -> ToolResult: - page = self._assert_page() - await page.go_back() - await asyncio.sleep(1.5) - return await self.screenshot() - - async def _handle_goto_url(self, action: N1Action) -> ToolResult: - page = self._assert_page() - url = action.get("url") - if not url: - raise ToolError("url is required for goto_url action") - - await page.goto(url) - await asyncio.sleep(2) - return await self.screenshot() - - async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: - page = self._assert_page() - try: - snapshot = await page.locator("body").aria_snapshot() - url = page.url - title = await page.title() - - screenshot_result = await self.screenshot() - - return { - "base64_image": screenshot_result.get("base64_image", ""), - "output": json.dumps({"url": url, "title": title, "snapshot": snapshot}, indent=2), - } - except Exception as e: - print(f"read_texts_and_links failed: {e}") - return await self.screenshot() - - async def _handle_stop(self, action: N1Action) -> ToolResult: - return {"output": action.get("answer", "Task completed")} - - async def screenshot(self) -> ToolResult: - page = self._assert_page() - try: - buffer = await page.screenshot(full_page=False) - base64_image = base64.b64encode(buffer).decode("utf-8") - return {"base64_image": base64_image} - except Exception as e: - raise ToolError(f"Failed to take screenshot: {e}") - - def get_current_url(self) -> str: - page = self._assert_page() - return page.url - - def _get_coordinates( - self, coords: tuple[int, int] | list[int] | None - ) -> dict[str, int]: - if coords is None or len(coords) != 2: - # Default to center of viewport - return {"x": self.width // 2, "y": self.height // 2} - - x, y = coords - if not isinstance(x, (int, float)) or not isinstance(y, (int, float)) or x < 0 or y < 0: - raise ToolError(f"Invalid coordinates: {coords}") - - return {"x": int(x), "y": int(y)} - - def _map_key_to_playwright(self, key: str) -> str: - # Handle modifier combinations (e.g., "ctrl+a" -> "Control+a") - if "+" in key: - parts = key.split("+") - mapped_parts = [] - for part in parts: - trimmed = part.strip() - lower = trimmed.lower() - - # Map modifier names - if lower in MODIFIER_MAP: - mapped_parts.append(MODIFIER_MAP[lower]) - else: - # Check KEY_MAP for special keys - mapped_parts.append(KEY_MAP.get(trimmed, trimmed)) - - return "+".join(mapped_parts) - - return KEY_MAP.get(key, key) diff --git a/pkg/templates/typescript/yutori-computer-use/index.ts b/pkg/templates/typescript/yutori-computer-use/index.ts index a4aee2f8..6b02a236 100644 --- a/pkg/templates/typescript/yutori-computer-use/index.ts +++ b/pkg/templates/typescript/yutori-computer-use/index.ts @@ -1,5 +1,5 @@ import { Kernel, type KernelContext } from '@onkernel/sdk'; -import { samplingLoop, type BrowserMode } from './loop'; +import { samplingLoop } from './loop'; import { KernelBrowserSession } from './session'; const kernel = new Kernel(); @@ -9,12 +9,6 @@ const app = kernel.app('ts-yutori-cua'); interface QueryInput { query: string; record_replay?: boolean; - /** - * Browser interaction mode: - * - computer_use: Uses Kernel's Computer Controls API (full VM screenshots) - default - * - playwright: Uses Playwright via CDP (viewport-only screenshots, optimized for n1) - */ - mode?: BrowserMode; } interface QueryOutput { @@ -48,17 +42,14 @@ app.action( try { // Run the sampling loop - const mode = payload.mode ?? 'computer_use'; const { finalAnswer, messages } = await samplingLoop({ model: 'n1-latest', task: payload.query, apiKey: YUTORI_API_KEY, kernel, sessionId: session.sessionId, - cdpWsUrl: session.cdpWsUrl ?? undefined, viewportWidth: session.viewportWidth, viewportHeight: session.viewportHeight, - mode, }); // Extract the result diff --git a/pkg/templates/typescript/yutori-computer-use/loop.ts b/pkg/templates/typescript/yutori-computer-use/loop.ts index 5e7c44cc..b6500d68 100644 --- a/pkg/templates/typescript/yutori-computer-use/loop.ts +++ b/pkg/templates/typescript/yutori-computer-use/loop.ts @@ -6,9 +6,6 @@ * - Screenshots and tool results are sent with role: "user" * - Coordinates are returned in 1000x1000 space and need scaling * - * Supports two modes: - * - computer_use: Uses Kernel's Computer Controls API (full VM screenshots) - * - playwright: Uses Playwright via CDP (viewport-only screenshots, optimized for n1) * * @see https://docs.yutori.com/reference/n1 */ @@ -16,15 +13,6 @@ import OpenAI from 'openai'; import type { Kernel } from '@onkernel/sdk'; import { ComputerTool, type N1Action, type ToolResult } from './tools/computer'; -import { PlaywrightComputerTool } from './tools/playwright-computer'; - -/** Mode for browser interaction */ -export type BrowserMode = 'computer_use' | 'playwright'; - -interface N1ComputerTool { - execute(action: N1Action): Promise; - screenshot(): Promise; -} // n1 uses its own system prompt - custom prompts may degrade performance // Per docs: "we generally do not recommend providing custom system prompts" @@ -48,21 +36,12 @@ interface SamplingLoopOptions { apiKey: string; kernel: Kernel; sessionId: string; - /** CDP WebSocket URL for playwright mode */ - cdpWsUrl?: string; maxTokens?: number; maxIterations?: number; /** Viewport width for coordinate scaling */ viewportWidth?: number; /** Viewport height for coordinate scaling */ viewportHeight?: number; - /** - * Browser interaction mode: - * - computer_use: Uses Kernel's Computer Controls API (full VM screenshots) - * - playwright: Uses Playwright via CDP (viewport-only screenshots, optimized for n1) - * @default 'computer_use' - */ - mode?: BrowserMode; } interface SamplingLoopResult { @@ -76,172 +55,147 @@ export async function samplingLoop({ apiKey, kernel, sessionId, - cdpWsUrl, maxTokens = 4096, maxIterations = 50, viewportWidth = 1280, viewportHeight = 800, - mode = 'computer_use', }: SamplingLoopOptions): Promise { const client = new OpenAI({ apiKey, baseURL: 'https://api.yutori.com/v1', }); - let computerTool: N1ComputerTool; - let playwrightTool: PlaywrightComputerTool | null = null; - - console.log(`Mode requested: '${mode}', cdpWsUrl available: ${cdpWsUrl != null}`); - - if (mode === 'playwright') { - if (!cdpWsUrl) { - throw new Error('cdpWsUrl is required for playwright mode'); - } - console.log(`Connecting to CDP WebSocket: ${cdpWsUrl.substring(0, 50)}...`); - playwrightTool = new PlaywrightComputerTool(cdpWsUrl, viewportWidth, viewportHeight); - await playwrightTool.connect(); - computerTool = playwrightTool; - console.log('Using playwright mode (viewport-only screenshots)'); - } else { - computerTool = new ComputerTool(kernel, sessionId, viewportWidth, viewportHeight); - console.log('Using computer_use mode (Computer Controls API)'); + const computerTool = new ComputerTool(kernel, sessionId, viewportWidth, viewportHeight); + + const initialScreenshot = await computerTool.screenshot(); + + const conversationMessages: Message[] = [ + { + role: 'user', + content: [{ type: 'text', text: task }], + }, + ]; + + if (initialScreenshot.base64Image) { + conversationMessages.push({ + role: 'user', + content: [ + { + type: 'image_url', + image_url: { + url: `data:image/png;base64,${initialScreenshot.base64Image}`, + }, + }, + ], + }); } - try { - const initialScreenshot = await computerTool.screenshot(); + let iteration = 0; + let finalAnswer: string | undefined; - const conversationMessages: Message[] = [ - { - role: 'user', - content: [{ type: 'text', text: task }], - }, - ]; + while (iteration < maxIterations) { + iteration++; + console.log(`\n=== Iteration ${iteration} ===`); - if (initialScreenshot.base64Image) { - conversationMessages.push({ - role: 'user', - content: [ - { - type: 'image_url', - image_url: { - url: `data:image/png;base64,${initialScreenshot.base64Image}`, - }, - }, - ], + let response; + try { + response = await client.chat.completions.create({ + model, + messages: conversationMessages as OpenAI.ChatCompletionMessageParam[], + max_tokens: maxTokens, + temperature: 0.3, }); + } catch (apiError) { + console.error('API call failed:', apiError); + throw apiError; } - let iteration = 0; - let finalAnswer: string | undefined; - - while (iteration < maxIterations) { - iteration++; - console.log(`\n=== Iteration ${iteration} ===`); + if (!response.choices || response.choices.length === 0) { + console.error('No choices in response:', JSON.stringify(response, null, 2)); + throw new Error('No choices in API response'); + } - let response; - try { - response = await client.chat.completions.create({ - model, - messages: conversationMessages as OpenAI.ChatCompletionMessageParam[], - max_tokens: maxTokens, - temperature: 0.3, - }); - } catch (apiError) { - console.error('API call failed:', apiError); - throw apiError; - } + const assistantMessage = response.choices[0]?.message; + if (!assistantMessage) { + throw new Error('No response from model'); + } - if (!response.choices || response.choices.length === 0) { - console.error('No choices in response:', JSON.stringify(response, null, 2)); - throw new Error('No choices in API response'); - } + const responseContent = assistantMessage.content || ''; + console.log('Assistant response:', responseContent); - const assistantMessage = response.choices[0]?.message; - if (!assistantMessage) { - throw new Error('No response from model'); - } + conversationMessages.push({ + role: 'assistant', + content: responseContent, + }); - const responseContent = assistantMessage.content || ''; - console.log('Assistant response:', responseContent); + const parsed = parseN1Response(responseContent); - conversationMessages.push({ - role: 'assistant', - content: responseContent, - }); + if (!parsed || !parsed.actions || parsed.actions.length === 0) { + console.log('No actions found in response, ending loop'); + break; + } - const parsed = parseN1Response(responseContent); + for (const action of parsed.actions) { + console.log('Executing action:', action.action_type, action); - if (!parsed || !parsed.actions || parsed.actions.length === 0) { - console.log('No actions found in response, ending loop'); - break; + if (action.action_type === 'stop') { + finalAnswer = action.answer; + console.log('Stop action received, final answer:', finalAnswer); + return { messages: conversationMessages, finalAnswer }; } - for (const action of parsed.actions) { - console.log('Executing action:', action.action_type, action); + const scaledAction = scaleCoordinates(action, viewportWidth, viewportHeight); - if (action.action_type === 'stop') { - finalAnswer = action.answer; - console.log('Stop action received, final answer:', finalAnswer); - return { messages: conversationMessages, finalAnswer }; - } + let result: ToolResult; + try { + result = await computerTool.execute(scaledAction); + } catch (error) { + console.error('Action failed:', error); + result = { + error: error instanceof Error ? error.message : String(error), + }; + } - const scaledAction = scaleCoordinates(action, viewportWidth, viewportHeight); + if (result.base64Image || result.output) { + const resultContent: MessageContent[] = []; - let result: ToolResult; - try { - result = await computerTool.execute(scaledAction); - } catch (error) { - console.error('Action failed:', error); - result = { - error: error instanceof Error ? error.message : String(error), - }; + if (result.output) { + resultContent.push({ + type: 'text', + text: result.output, + }); } - if (result.base64Image || result.output) { - const resultContent: MessageContent[] = []; - - if (result.output) { - resultContent.push({ - type: 'text', - text: result.output, - }); - } - - if (result.base64Image) { - resultContent.push({ - type: 'image_url', - image_url: { - url: `data:image/png;base64,${result.base64Image}`, - }, - }); - } - - conversationMessages.push({ - role: 'user', - content: resultContent, - }); - } else if (result.error) { - conversationMessages.push({ - role: 'user', - content: [{ type: 'text', text: `Action failed: ${result.error}` }], + if (result.base64Image) { + resultContent.push({ + type: 'image_url', + image_url: { + url: `data:image/png;base64,${result.base64Image}`, + }, }); } - } - } - if (iteration >= maxIterations) { - console.log('Max iterations reached'); + conversationMessages.push({ + role: 'user', + content: resultContent, + }); + } else if (result.error) { + conversationMessages.push({ + role: 'user', + content: [{ type: 'text', text: `Action failed: ${result.error}` }], + }); + } } + } - return { - messages: conversationMessages, - finalAnswer, - }; - } finally { - if (playwrightTool) { - await playwrightTool.disconnect(); - } + if (iteration >= maxIterations) { + console.log('Max iterations reached'); } + + return { + messages: conversationMessages, + finalAnswer, + }; } function parseN1Response(content: string): { thoughts?: string; actions?: N1Action[] } | null { diff --git a/pkg/templates/typescript/yutori-computer-use/package.json b/pkg/templates/typescript/yutori-computer-use/package.json index 2bc4fbe3..c7475198 100644 --- a/pkg/templates/typescript/yutori-computer-use/package.json +++ b/pkg/templates/typescript/yutori-computer-use/package.json @@ -5,8 +5,7 @@ "private": true, "dependencies": { "@onkernel/sdk": "^0.24.0", - "openai": "^4.77.0", - "playwright-core": "^1.52.0" + "openai": "^4.77.0" }, "devDependencies": { "@types/node": "^22.15.17", diff --git a/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts b/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts deleted file mode 100644 index d6ce229d..00000000 --- a/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts +++ /dev/null @@ -1,363 +0,0 @@ -/** - * Yutori n1 Playwright Computer Tool - * - * Maps n1 action format to Playwright methods via CDP WebSocket connection. - * Uses viewport-only screenshots optimized for Yutori n1's training data. - * - * @see https://docs.yutori.com/reference/n1#screenshot-requirements - */ - -import { chromium, type Browser, type BrowserContext, type Page } from 'playwright-core'; -import type { ToolResult, N1Action } from './computer'; -import { ToolError } from './computer'; - -const SCREENSHOT_DELAY_MS = 300; - -// Key mappings from n1 output format to Playwright format -const KEY_MAP: Record = { - 'Return': 'Enter', - 'BackSpace': 'Backspace', - 'Page_Up': 'PageUp', - 'Page_Down': 'PageDown', -}; - -const MODIFIER_MAP: Record = { - 'ctrl': 'Control', - 'super': 'Meta', - 'command': 'Meta', - 'cmd': 'Meta', -}; - -export class PlaywrightComputerTool { - private cdpWsUrl: string; - private width: number; - private height: number; - private browser: Browser | null = null; - private context: BrowserContext | null = null; - private page: Page | null = null; - - constructor(cdpWsUrl: string, width = 1200, height = 800) { - this.cdpWsUrl = cdpWsUrl; - this.width = width; - this.height = height; - } - - async connect(): Promise { - if (this.browser) { - return; // Already connected - } - - this.browser = await chromium.connectOverCDP(this.cdpWsUrl); - - // Get existing context or create new one - this.context = this.browser.contexts()[0]; - if (!this.context) { - this.context = await this.browser.newContext(); - } - - // Handle new page events - this.context.on('page', this.handleNewPage.bind(this)); - - // Get existing page or create new one - this.page = this.context.pages()[0]; - if (!this.page) { - this.page = await this.context.newPage(); - } - - // Set viewport size to Yutori's recommended dimensions - await this.page.setViewportSize({ width: this.width, height: this.height }); - this.page.on('close', this.handlePageClose.bind(this)); - } - - async disconnect(): Promise { - if (this.browser) { - this.browser = null; - this.context = null; - this.page = null; - } - } - - private handleNewPage(page: Page): void { - console.log('New page created'); - this.page = page; - page.on('close', this.handlePageClose.bind(this)); - } - - private handlePageClose(closedPage: Page): void { - console.log('Page closed'); - if (this.page === closedPage && this.context) { - const pages = this.context.pages(); - if (pages.length > 0) { - this.page = pages[pages.length - 1]; - } else { - console.warn('Warning: All pages have been closed.'); - this.page = null; - } - } - } - - private assertPage(): asserts this is { page: Page } { - if (!this.page) { - throw new ToolError('Page not available. Did you call connect()?'); - } - } - - async execute(action: N1Action): Promise { - this.assertPage(); - const { action_type } = action; - - switch (action_type) { - case 'click': - return this.handleClick(action); - case 'scroll': - return this.handleScroll(action); - case 'type': - return this.handleType(action); - case 'key_press': - return this.handleKeyPress(action); - case 'hover': - return this.handleHover(action); - case 'drag': - return this.handleDrag(action); - case 'wait': - return this.handleWait(); - case 'refresh': - return this.handleRefresh(); - case 'go_back': - return this.handleGoBack(); - case 'goto_url': - return this.handleGotoUrl(action); - case 'read_texts_and_links': - return this.handleReadTextsAndLinks(); - case 'stop': - return this.handleStop(action); - default: - throw new ToolError(`Unknown action type: ${action_type}`); - } - } - - private async handleClick(action: N1Action): Promise { - this.assertPage(); - const coords = this.getCoordinates(action.center_coordinates); - - await this.page.mouse.click(coords.x, coords.y); - await this.sleep(SCREENSHOT_DELAY_MS); - return this.screenshot(); - } - - private async handleScroll(action: N1Action): Promise { - this.assertPage(); - const coords = this.getCoordinates(action.center_coordinates); - const direction = action.direction; - const amount = action.amount ?? 3; - - if (!direction || !['up', 'down', 'left', 'right'].includes(direction)) { - throw new ToolError(`Invalid scroll direction: ${direction}`); - } - - const scrollDelta = amount * 100; - - await this.page.mouse.move(coords.x, coords.y); - - let deltaX = 0; - let deltaY = 0; - - switch (direction) { - case 'up': - deltaY = -scrollDelta; - break; - case 'down': - deltaY = scrollDelta; - break; - case 'left': - deltaX = -scrollDelta; - break; - case 'right': - deltaX = scrollDelta; - break; - } - - await this.page.mouse.wheel(deltaX, deltaY); - await this.sleep(SCREENSHOT_DELAY_MS); - return this.screenshot(); - } - - private async handleType(action: N1Action): Promise { - this.assertPage(); - const text = action.text; - if (!text) { - throw new ToolError('text is required for type action'); - } - - if (action.clear_before_typing) { - await this.page.keyboard.press('Control+a'); - await this.sleep(100); - await this.page.keyboard.press('Backspace'); - await this.sleep(100); - } - - await this.page.keyboard.type(text); - - if (action.press_enter_after) { - await this.sleep(100); - await this.page.keyboard.press('Enter'); - } - - await this.sleep(SCREENSHOT_DELAY_MS); - return this.screenshot(); - } - - private async handleKeyPress(action: N1Action): Promise { - this.assertPage(); - const keyComb = action.key_comb; - if (!keyComb) { - throw new ToolError('key_comb is required for key_press action'); - } - - const mappedKey = this.mapKeyToPlaywright(keyComb); - await this.page.keyboard.press(mappedKey); - - await this.sleep(SCREENSHOT_DELAY_MS); - return this.screenshot(); - } - - private async handleHover(action: N1Action): Promise { - this.assertPage(); - const coords = this.getCoordinates(action.center_coordinates); - - await this.page.mouse.move(coords.x, coords.y); - - await this.sleep(SCREENSHOT_DELAY_MS); - return this.screenshot(); - } - - private async handleDrag(action: N1Action): Promise { - this.assertPage(); - const startCoords = this.getCoordinates(action.start_coordinates); - const endCoords = this.getCoordinates(action.center_coordinates); - - await this.page.mouse.move(startCoords.x, startCoords.y); - await this.page.mouse.down(); - await this.sleep(50); - await this.page.mouse.move(endCoords.x, endCoords.y, { steps: 12 }); - await this.page.mouse.up(); - - await this.sleep(300); - return this.screenshot(); - } - - private async handleWait(): Promise { - await this.sleep(2000); - return this.screenshot(); - } - - private async handleRefresh(): Promise { - this.assertPage(); - await this.page.reload(); - await this.sleep(2000); - return this.screenshot(); - } - - private async handleGoBack(): Promise { - this.assertPage(); - await this.page.goBack(); - await this.sleep(1500); - return this.screenshot(); - } - - private async handleGotoUrl(action: N1Action): Promise { - this.assertPage(); - const url = action.url; - if (!url) { - throw new ToolError('url is required for goto_url action'); - } - - await this.page.goto(url); - await this.sleep(2000); - return this.screenshot(); - } - - private async handleReadTextsAndLinks(): Promise { - this.assertPage(); - try { - const snapshot = await this.page.locator('body').ariaSnapshot(); - const url = this.page.url(); - const title = await this.page.title(); - - const screenshotResult = await this.screenshot(); - - return { - base64Image: screenshotResult.base64Image, - output: JSON.stringify({ url, title, snapshot }, null, 2), - }; - } catch (error) { - console.warn('read_texts_and_links failed:', error); - return this.screenshot(); - } - } - - private handleStop(action: N1Action): ToolResult { - // Return the final answer without taking a screenshot - return { - output: action.answer || 'Task completed', - }; - } - - async screenshot(): Promise { - this.assertPage(); - try { - const buffer = await this.page.screenshot({ fullPage: false }); - - return { - base64Image: buffer.toString('base64'), - }; - } catch (error) { - throw new ToolError(`Failed to take screenshot: ${error}`); - } - } - - getCurrentUrl(): string { - this.assertPage(); - return this.page.url(); - } - - private getCoordinates(coords?: [number, number]): { x: number; y: number } { - if (!coords || coords.length !== 2) { - // Default to center of viewport - return { x: this.width / 2, y: this.height / 2 }; - } - - const [x, y] = coords; - if (typeof x !== 'number' || typeof y !== 'number' || x < 0 || y < 0) { - throw new ToolError(`Invalid coordinates: ${JSON.stringify(coords)}`); - } - - return { x, y }; - } - - private mapKeyToPlaywright(key: string): string { - // Handle modifier combinations (e.g., "ctrl+a" -> "Control+a") - if (key.includes('+')) { - const parts = key.split('+'); - const mappedParts = parts.map((part) => { - const trimmed = part.trim(); - const lower = trimmed.toLowerCase(); - - // Map modifier names - if (MODIFIER_MAP[lower]) { - return MODIFIER_MAP[lower]; - } - - // Check KEY_MAP for special keys - return KEY_MAP[trimmed] || trimmed; - }); - return mappedParts.join('+'); - } - - return KEY_MAP[key] || key; - } - - private sleep(ms: number): Promise { - return new Promise((resolve) => setTimeout(resolve, ms)); - } -} From c9866182539579d51c6f799c10631217d9f19acf Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Tue, 17 Feb 2026 21:07:12 -0500 Subject: [PATCH 03/18] yutori templates: update tool mapping for n1-latest API - Rename click -> left_click, add double_click/triple_click/right_click - Rename center_coordinates -> coordinates throughout - Remove stop and read_texts_and_links action handlers - Add WebP screenshot conversion (sharp for TS, Pillow for Python) - Add sharp/Pillow to dependencies Co-authored-by: Cursor --- .../python/yutori-computer-use/pyproject.toml | 1 + .../yutori-computer-use/tools/computer.py | 91 +++++----------- .../yutori-computer-use/package.json | 3 +- .../yutori-computer-use/tools/computer.ts | 103 +++++------------- 4 files changed, 57 insertions(+), 141 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/pyproject.toml b/pkg/templates/python/yutori-computer-use/pyproject.toml index 31876ef5..25471749 100644 --- a/pkg/templates/python/yutori-computer-use/pyproject.toml +++ b/pkg/templates/python/yutori-computer-use/pyproject.toml @@ -7,4 +7,5 @@ dependencies = [ "openai>=1.58.0", "kernel>=0.24.0", "python-dotenv>=1.2.1", + "Pillow>=10.0.0", ] diff --git a/pkg/templates/python/yutori-computer-use/tools/computer.py b/pkg/templates/python/yutori-computer-use/tools/computer.py index e72f191a..8ff3fe9e 100644 --- a/pkg/templates/python/yutori-computer-use/tools/computer.py +++ b/pkg/templates/python/yutori-computer-use/tools/computer.py @@ -1,27 +1,29 @@ """ Yutori n1 Computer Tool -Maps n1 action format to Kernel's Computer Controls API. +Maps n1-latest action format to Kernel's Computer Controls API. +Screenshots are converted to WebP for better compression across multi-step trajectories. """ import asyncio import base64 -import json -from typing import Literal, TypedDict, Optional +from io import BytesIO +from typing import Literal, TypedDict from kernel import Kernel +from PIL import Image from .base import ToolError, ToolResult -TYPING_DELAY_MS = 12 # Typing delay in milliseconds (used by Kernel API) -# Delays in seconds for asyncio.sleep (matches TypeScript 300ms = 0.3s) +TYPING_DELAY_MS = 12 SCREENSHOT_DELAY_S = 0.3 ACTION_DELAY_S = 0.3 - -# n1 action types N1ActionType = Literal[ - "click", + "left_click", + "double_click", + "triple_click", + "right_click", "scroll", "type", "key_press", @@ -31,14 +33,12 @@ "refresh", "go_back", "goto_url", - "read_texts_and_links", - "stop", ] class N1Action(TypedDict, total=False): action_type: N1ActionType - center_coordinates: tuple[int, int] | list[int] + coordinates: tuple[int, int] | list[int] start_coordinates: tuple[int, int] | list[int] direction: Literal["up", "down", "left", "right"] amount: int @@ -47,10 +47,8 @@ class N1Action(TypedDict, total=False): clear_before_typing: bool key_comb: str url: str - answer: str -# Key mappings from Playwright format (n1 output) to xdotool format (Kernel) KEY_MAP = { "Enter": "Return", "Escape": "Escape", @@ -101,7 +99,10 @@ async def execute(self, action: N1Action) -> ToolResult: action_type = action.get("action_type") handlers = { - "click": self._handle_click, + "left_click": lambda a: self._handle_click(a, "left", 1), + "double_click": lambda a: self._handle_click(a, "left", 2), + "triple_click": lambda a: self._handle_click(a, "left", 3), + "right_click": lambda a: self._handle_click(a, "right", 1), "scroll": self._handle_scroll, "type": self._handle_type, "key_press": self._handle_key_press, @@ -111,8 +112,6 @@ async def execute(self, action: N1Action) -> ToolResult: "refresh": self._handle_refresh, "go_back": self._handle_go_back, "goto_url": self._handle_goto_url, - "read_texts_and_links": self._handle_read_texts_and_links, - "stop": self._handle_stop, } handler = handlers.get(action_type) @@ -121,23 +120,23 @@ async def execute(self, action: N1Action) -> ToolResult: return await handler(action) - async def _handle_click(self, action: N1Action) -> ToolResult: - coords = self._get_coordinates(action.get("center_coordinates")) + async def _handle_click(self, action: N1Action, button: str, num_clicks: int) -> ToolResult: + coords = self._get_coordinates(action.get("coordinates")) self.kernel.browsers.computer.click_mouse( self.session_id, x=coords["x"], y=coords["y"], - button="left", + button=button, click_type="click", - num_clicks=1, + num_clicks=num_clicks, ) await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() async def _handle_scroll(self, action: N1Action) -> ToolResult: - coords = self._get_coordinates(action.get("center_coordinates")) + coords = self._get_coordinates(action.get("coordinates")) direction = action.get("direction") amount = action.get("amount", 3) @@ -218,7 +217,7 @@ async def _handle_key_press(self, action: N1Action) -> ToolResult: return await self.screenshot() async def _handle_hover(self, action: N1Action) -> ToolResult: - coords = self._get_coordinates(action.get("center_coordinates")) + coords = self._get_coordinates(action.get("coordinates")) self.kernel.browsers.computer.move_mouse( self.session_id, @@ -231,7 +230,7 @@ async def _handle_hover(self, action: N1Action) -> ToolResult: async def _handle_drag(self, action: N1Action) -> ToolResult: start_coords = self._get_coordinates(action.get("start_coordinates")) - end_coords = self._get_coordinates(action.get("center_coordinates")) + end_coords = self._get_coordinates(action.get("coordinates")) self.kernel.browsers.computer.drag_mouse( self.session_id, @@ -293,48 +292,16 @@ async def _handle_goto_url(self, action: N1Action) -> ToolResult: await asyncio.sleep(2) return await self.screenshot() - async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: - try: - result = self.kernel.browsers.playwright.execute( - self.session_id, - code=""" - const snapshot = await page._snapshotForAI(); - const url = page.url(); - const title = await page.title(); - return { url, title, snapshot }; - """, - timeout_sec=30 - ) - - screenshot_result = await self.screenshot() - - if result.success and result.result: - data = result.result - return { - "base64_image": screenshot_result.get("base64_image", ""), - "output": json.dumps({ - "url": data.get("url"), - "title": data.get("title"), - "snapshot": data.get("snapshot") - }, indent=2) - } - - print("Playwright execution failed, falling back to screenshot only") - return screenshot_result - except Exception as e: - print(f"read_texts_and_links failed: {e}") - return await self.screenshot() - - async def _handle_stop(self, action: N1Action) -> ToolResult: - return {"output": action.get("answer", "Task completed")} - async def screenshot(self) -> ToolResult: try: response = self.kernel.browsers.computer.capture_screenshot( self.session_id ) - image_bytes = response.read() - base64_image = base64.b64encode(image_bytes).decode("utf-8") + png_bytes = response.read() + img = Image.open(BytesIO(png_bytes)) + webp_buf = BytesIO() + img.save(webp_buf, "WEBP", quality=80) + base64_image = base64.b64encode(webp_buf.getvalue()).decode("utf-8") return {"base64_image": base64_image} except Exception as e: raise ToolError(f"Failed to take screenshot: {e}") @@ -343,7 +310,6 @@ def _get_coordinates( self, coords: tuple[int, int] | list[int] | None ) -> dict[str, int]: if coords is None or len(coords) != 2: - # Default to center of screen return {"x": self.width // 2, "y": self.height // 2} x, y = coords @@ -353,7 +319,6 @@ def _get_coordinates( return {"x": int(x), "y": int(y)} def _map_key(self, key: str) -> str: - # Handle modifier combinations (e.g., "Control+a" -> "ctrl+a") if "+" in key: parts = key.split("+") mapped_parts = [] @@ -361,11 +326,9 @@ def _map_key(self, key: str) -> str: trimmed = part.strip() lower = trimmed.lower() - # Map modifier names if lower in MODIFIER_MAP: mapped_parts.append(MODIFIER_MAP[lower]) else: - # Check KEY_MAP for special keys mapped_parts.append(KEY_MAP.get(trimmed, trimmed)) return "+".join(mapped_parts) diff --git a/pkg/templates/typescript/yutori-computer-use/package.json b/pkg/templates/typescript/yutori-computer-use/package.json index c7475198..e3683db3 100644 --- a/pkg/templates/typescript/yutori-computer-use/package.json +++ b/pkg/templates/typescript/yutori-computer-use/package.json @@ -5,7 +5,8 @@ "private": true, "dependencies": { "@onkernel/sdk": "^0.24.0", - "openai": "^4.77.0" + "openai": "^4.77.0", + "sharp": "^0.33.0" }, "devDependencies": { "@types/node": "^22.15.17", diff --git a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts index e9cdaf35..7bdd50aa 100644 --- a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts @@ -1,11 +1,13 @@ /** * Yutori n1 Computer Tool * - * Maps n1 action format to Kernel's Computer Controls API. + * Maps n1-latest action format to Kernel's Computer Controls API. + * Screenshots are converted to WebP for better compression across multi-step trajectories. */ import { Buffer } from 'buffer'; import type { Kernel } from '@onkernel/sdk'; +import sharp from 'sharp'; const TYPING_DELAY_MS = 12; const SCREENSHOT_DELAY_MS = 300; @@ -24,9 +26,11 @@ export class ToolError extends Error { } } -// n1 action types export type N1ActionType = - | 'click' + | 'left_click' + | 'double_click' + | 'triple_click' + | 'right_click' | 'scroll' | 'type' | 'key_press' @@ -35,13 +39,11 @@ export type N1ActionType = | 'wait' | 'refresh' | 'go_back' - | 'goto_url' - | 'read_texts_and_links' - | 'stop'; + | 'goto_url'; export interface N1Action { action_type: N1ActionType; - center_coordinates?: [number, number]; + coordinates?: [number, number]; start_coordinates?: [number, number]; direction?: 'up' | 'down' | 'left' | 'right'; amount?: number; @@ -50,10 +52,8 @@ export interface N1Action { clear_before_typing?: boolean; key_comb?: string; url?: string; - answer?: string; } -// Key mappings from Playwright format (n1 output) to xdotool format (Kernel) const KEY_MAP: Record = { 'Enter': 'Return', 'Escape': 'Escape', @@ -109,8 +109,14 @@ export class ComputerTool { const { action_type } = action; switch (action_type) { - case 'click': - return this.handleClick(action); + case 'left_click': + return this.handleClick(action, 'left', 1); + case 'double_click': + return this.handleClick(action, 'left', 2); + case 'triple_click': + return this.handleClick(action, 'left', 3); + case 'right_click': + return this.handleClick(action, 'right', 1); case 'scroll': return this.handleScroll(action); case 'type': @@ -129,24 +135,20 @@ export class ComputerTool { return this.handleGoBack(); case 'goto_url': return this.handleGotoUrl(action); - case 'read_texts_and_links': - return this.handleReadTextsAndLinks(); - case 'stop': - return this.handleStop(action); default: throw new ToolError(`Unknown action type: ${action_type}`); } } - private async handleClick(action: N1Action): Promise { - const coords = this.getCoordinates(action.center_coordinates); + private async handleClick(action: N1Action, button: 'left' | 'right', numClicks: number): Promise { + const coords = this.getCoordinates(action.coordinates); await this.kernel.browsers.computer.clickMouse(this.sessionId, { x: coords.x, y: coords.y, - button: 'left', + button, click_type: 'click', - num_clicks: 1, + num_clicks: numClicks, }); await this.sleep(SCREENSHOT_DELAY_MS); @@ -154,7 +156,7 @@ export class ComputerTool { } private async handleScroll(action: N1Action): Promise { - const coords = this.getCoordinates(action.center_coordinates); + const coords = this.getCoordinates(action.coordinates); const direction = action.direction; const amount = action.amount ?? 3; @@ -243,7 +245,7 @@ export class ComputerTool { } private async handleHover(action: N1Action): Promise { - const coords = this.getCoordinates(action.center_coordinates); + const coords = this.getCoordinates(action.coordinates); await this.kernel.browsers.computer.moveMouse(this.sessionId, { x: coords.x, @@ -256,7 +258,7 @@ export class ComputerTool { private async handleDrag(action: N1Action): Promise { const startCoords = this.getCoordinates(action.start_coordinates); - const endCoords = this.getCoordinates(action.center_coordinates); + const endCoords = this.getCoordinates(action.coordinates); await this.kernel.browsers.computer.dragMouse(this.sessionId, { path: [[startCoords.x, startCoords.y], [endCoords.x, endCoords.y]], @@ -320,63 +322,16 @@ export class ComputerTool { return this.screenshot(); } - private async handleReadTextsAndLinks(): Promise { - try { - // Get AI snapshot via Playwright Execution API - const result = await this.kernel.browsers.playwright.execute( - this.sessionId, - { - code: ` - const snapshot = await page._snapshotForAI(); - const url = page.url(); - const title = await page.title(); - return { url, title, snapshot }; - `, - timeout_sec: 30 - } - ); - - // Get screenshot via Computer Controls API - const screenshotResult = await this.screenshot(); - - if (result.success && result.result) { - const { url, title, snapshot } = result.result as { - url: string; - title: string; - snapshot: string; - }; - - return { - base64Image: screenshotResult.base64Image, - output: JSON.stringify({ url, title, snapshot }, null, 2) - }; - } - - // Fallback to just screenshot if Playwright execution fails - console.warn('Playwright execution failed, falling back to screenshot only'); - return screenshotResult; - } catch (error) { - console.warn('read_texts_and_links failed:', error); - return this.screenshot(); - } - } - - private handleStop(action: N1Action): ToolResult { - // Return the final answer without taking a screenshot - return { - output: action.answer || 'Task completed', - }; - } - async screenshot(): Promise { try { const response = await this.kernel.browsers.computer.captureScreenshot(this.sessionId); const blob = await response.blob(); const arrayBuffer = await blob.arrayBuffer(); - const buffer = Buffer.from(arrayBuffer); + const pngBuffer = Buffer.from(arrayBuffer); + const webpBuffer = await sharp(pngBuffer).webp({ quality: 80 }).toBuffer(); return { - base64Image: buffer.toString('base64'), + base64Image: webpBuffer.toString('base64'), }; } catch (error) { throw new ToolError(`Failed to take screenshot: ${error}`); @@ -385,7 +340,6 @@ export class ComputerTool { private getCoordinates(coords?: [number, number]): { x: number; y: number } { if (!coords || coords.length !== 2) { - // Default to center of screen return { x: this.width / 2, y: this.height / 2 }; } @@ -398,19 +352,16 @@ export class ComputerTool { } private mapKey(key: string): string { - // Handle modifier combinations (e.g., "Control+a" -> "ctrl+a") if (key.includes('+')) { const parts = key.split('+'); const mappedParts = parts.map(part => { const trimmed = part.trim(); const lower = trimmed.toLowerCase(); - // Map modifier names if (MODIFIER_MAP[lower]) { return MODIFIER_MAP[lower]; } - // Check KEY_MAP for special keys return KEY_MAP[trimmed] || trimmed; }); return mappedParts.join('+'); From f262fbc83940f0331cea06e80e0efb61204bdc40 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Tue, 17 Feb 2026 21:08:07 -0500 Subject: [PATCH 04/18] yutori templates: rewrite sampling loop for n1-latest tool_calls format - Parse actions from response.tool_calls instead of JSON in content - Use role: "tool" with tool_call_id for tool results - Combine task text + initial screenshot in single user message - Stop condition: no tool_calls in response (model returns plain content) - Update MIME type to image/webp - Remove parseN1Response / _parse_n1_response JSON parsing - Update scaleCoordinates for coordinates field (was center_coordinates) Co-authored-by: Cursor --- .../python/yutori-computer-use/loop.py | 155 +++++++-------- .../typescript/yutori-computer-use/loop.ts | 177 ++++++++---------- 2 files changed, 153 insertions(+), 179 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/loop.py b/pkg/templates/python/yutori-computer-use/loop.py index 55a76695..1e75b406 100644 --- a/pkg/templates/python/yutori-computer-use/loop.py +++ b/pkg/templates/python/yutori-computer-use/loop.py @@ -1,16 +1,17 @@ """ Yutori n1 Sampling Loop -Implements the agent loop for Yutori's n1 computer use model. -n1 uses an OpenAI-compatible API with specific conventions: -- Screenshots and tool results are sent with role: "user" +Implements the agent loop for Yutori's n1-latest computer use model. +n1-latest uses an OpenAI-compatible API with tool_calls: +- Actions are returned via tool_calls in the assistant message +- Tool results use role: "tool" with matching tool_call_id +- The model stops by returning content without tool_calls - Coordinates are returned in 1000x1000 space and need scaling @see https://docs.yutori.com/reference/n1 """ import json -import re from typing import Any, Optional from kernel import Kernel @@ -31,7 +32,7 @@ async def sampling_loop( viewport_width: int = 1280, viewport_height: int = 800, ) -> dict[str, Any]: - """Run the n1 sampling loop until the model returns a stop action or max iterations.""" + """Run the n1 sampling loop until the model stops calling tools or max iterations.""" client = OpenAI( api_key=api_key, base_url="https://api.yutori.com/v1", @@ -41,26 +42,19 @@ async def sampling_loop( initial_screenshot = await computer_tool.screenshot() - conversation_messages: list[dict[str, Any]] = [ - { - "role": "user", - "content": [{"type": "text", "text": task}], - } - ] - + user_content: list[dict[str, Any]] = [{"type": "text", "text": task}] if initial_screenshot.get("base64_image"): - conversation_messages.append({ - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{initial_screenshot['base64_image']}" - }, - } - ], + user_content.append({ + "type": "image_url", + "image_url": { + "url": f"data:image/webp;base64,{initial_screenshot['base64_image']}" + }, }) + conversation_messages: list[dict[str, Any]] = [ + {"role": "user", "content": user_content} + ] + iteration = 0 final_answer: Optional[str] = None @@ -83,31 +77,55 @@ async def sampling_loop( print(f"No choices in response: {response}") raise ValueError("No choices in API response") - assistant_message = response.choices[0].message + choice = response.choices[0] + assistant_message = choice.message if not assistant_message: raise ValueError("No response from model") - response_content = assistant_message.content or "" - print("Assistant response:", response_content) + print("Assistant content:", assistant_message.content or "(none)") - conversation_messages.append({ + # Preserve full assistant message (including tool_calls) in history + assistant_dict: dict[str, Any] = { "role": "assistant", - "content": response_content, - }) + "content": assistant_message.content or "", + } + if assistant_message.tool_calls: + assistant_dict["tool_calls"] = [ + { + "id": tc.id, + "type": tc.type, + "function": { + "name": tc.function.name, + "arguments": tc.function.arguments, + }, + } + for tc in assistant_message.tool_calls + ] + conversation_messages.append(assistant_dict) - parsed = _parse_n1_response(response_content) + tool_calls = assistant_message.tool_calls - if not parsed or not parsed.get("actions"): - print("No actions found in response, ending loop") + # No tool_calls means the model is done + if not tool_calls: + final_answer = assistant_message.content or None + print(f"No tool_calls, model is done. Final answer: {final_answer}") break - for action in parsed["actions"]: - print(f"Executing action: {action.get('action_type')}", action) + for tc in tool_calls: + action_name = tc.function.name + try: + args = json.loads(tc.function.arguments) + except json.JSONDecodeError: + print(f"Failed to parse tool_call arguments: {tc.function.arguments}") + conversation_messages.append({ + "role": "tool", + "tool_call_id": tc.id, + "content": "Error: failed to parse arguments", + }) + continue - if action.get("action_type") == "stop": - final_answer = action.get("answer") - print(f"Stop action received, final answer: {final_answer}") - return {"messages": conversation_messages, "final_answer": final_answer} + action: N1Action = {"action_type": action_name, **args} + print(f"Executing action: {action_name}", args) scaled_action = _scale_coordinates(action, viewport_width, viewport_height) @@ -118,31 +136,31 @@ async def sampling_loop( print(f"Action failed: {e}") result = {"error": str(e)} - if result.get("base64_image") or result.get("output"): - result_content = [] - - if result.get("output"): - result_content.append({ - "type": "text", - "text": result["output"], - }) - - if result.get("base64_image"): - result_content.append({ - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{result['base64_image']}" - }, - }) - + # Build tool response message + if result.get("base64_image"): conversation_messages.append({ - "role": "user", - "content": result_content, + "role": "tool", + "tool_call_id": tc.id, + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/webp;base64,{result['base64_image']}" + }, + } + ], }) elif result.get("error"): conversation_messages.append({ - "role": "user", - "content": [{"type": "text", "text": f"Action failed: {result['error']}"}], + "role": "tool", + "tool_call_id": tc.id, + "content": f"Action failed: {result['error']}", + }) + else: + conversation_messages.append({ + "role": "tool", + "tool_call_id": tc.id, + "content": result.get("output", "OK"), }) if iteration >= max_iterations: @@ -154,27 +172,12 @@ async def sampling_loop( } -def _parse_n1_response(content: str) -> Optional[dict[str, Any]]: - try: - # The response should be JSON - return json.loads(content) - except json.JSONDecodeError: - # Try to extract JSON from the response if it's wrapped in text - json_match = re.search(r'\{[\s\S]*\}', content) - if json_match: - try: - return json.loads(json_match.group(0)) - except json.JSONDecodeError: - print(f"Failed to parse action JSON: {json_match.group(0)}") - return None - - def _scale_coordinates(action: N1Action, viewport_width: int, viewport_height: int) -> N1Action: scaled = dict(action) - if "center_coordinates" in scaled and scaled["center_coordinates"]: - coords = scaled["center_coordinates"] - scaled["center_coordinates"] = [ + if "coordinates" in scaled and scaled["coordinates"]: + coords = scaled["coordinates"] + scaled["coordinates"] = [ round((coords[0] / 1000) * viewport_width), round((coords[1] / 1000) * viewport_height), ] diff --git a/pkg/templates/typescript/yutori-computer-use/loop.ts b/pkg/templates/typescript/yutori-computer-use/loop.ts index b6500d68..471042cd 100644 --- a/pkg/templates/typescript/yutori-computer-use/loop.ts +++ b/pkg/templates/typescript/yutori-computer-use/loop.ts @@ -1,12 +1,13 @@ /** * Yutori n1 Sampling Loop * - * Implements the agent loop for Yutori's n1 computer use model. - * n1 uses an OpenAI-compatible API with specific conventions: - * - Screenshots and tool results are sent with role: "user" + * Implements the agent loop for Yutori's n1-latest computer use model. + * n1-latest uses an OpenAI-compatible API with tool_calls: + * - Actions are returned via tool_calls in the assistant message + * - Tool results use role: "tool" with matching tool_call_id + * - The model stops by returning content without tool_calls * - Coordinates are returned in 1000x1000 space and need scaling * - * * @see https://docs.yutori.com/reference/n1 */ @@ -14,22 +15,6 @@ import OpenAI from 'openai'; import type { Kernel } from '@onkernel/sdk'; import { ComputerTool, type N1Action, type ToolResult } from './tools/computer'; -// n1 uses its own system prompt - custom prompts may degrade performance -// Per docs: "we generally do not recommend providing custom system prompts" - -interface Message { - role: 'user' | 'assistant'; - content: string | MessageContent[]; -} - -interface MessageContent { - type: 'text' | 'image_url'; - text?: string; - image_url?: { - url: string; - }; -} - interface SamplingLoopOptions { model?: string; task: string; @@ -38,14 +23,12 @@ interface SamplingLoopOptions { sessionId: string; maxTokens?: number; maxIterations?: number; - /** Viewport width for coordinate scaling */ viewportWidth?: number; - /** Viewport height for coordinate scaling */ viewportHeight?: number; } interface SamplingLoopResult { - messages: Message[]; + messages: OpenAI.ChatCompletionMessageParam[]; finalAnswer?: string; } @@ -69,26 +52,22 @@ export async function samplingLoop({ const initialScreenshot = await computerTool.screenshot(); - const conversationMessages: Message[] = [ + const conversationMessages: OpenAI.ChatCompletionMessageParam[] = [ { - role: 'user', - content: [{ type: 'text', text: task }], - }, - ]; - - if (initialScreenshot.base64Image) { - conversationMessages.push({ role: 'user', content: [ - { - type: 'image_url', - image_url: { - url: `data:image/png;base64,${initialScreenshot.base64Image}`, - }, - }, + { type: 'text', text: task }, + ...(initialScreenshot.base64Image + ? [{ + type: 'image_url' as const, + image_url: { + url: `data:image/webp;base64,${initialScreenshot.base64Image}`, + }, + }] + : []), ], - }); - } + }, + ]; let iteration = 0; let finalAnswer: string | undefined; @@ -101,7 +80,7 @@ export async function samplingLoop({ try { response = await client.chat.completions.create({ model, - messages: conversationMessages as OpenAI.ChatCompletionMessageParam[], + messages: conversationMessages, max_tokens: maxTokens, temperature: 0.3, }); @@ -115,35 +94,48 @@ export async function samplingLoop({ throw new Error('No choices in API response'); } - const assistantMessage = response.choices[0]?.message; + const choice = response.choices[0]; + const assistantMessage = choice.message; if (!assistantMessage) { throw new Error('No response from model'); } - const responseContent = assistantMessage.content || ''; - console.log('Assistant response:', responseContent); + console.log('Assistant content:', assistantMessage.content || '(none)'); - conversationMessages.push({ - role: 'assistant', - content: responseContent, - }); + // Preserve full assistant message (including tool_calls) in history + conversationMessages.push(assistantMessage); - const parsed = parseN1Response(responseContent); + const toolCalls = assistantMessage.tool_calls; - if (!parsed || !parsed.actions || parsed.actions.length === 0) { - console.log('No actions found in response, ending loop'); + // No tool_calls means the model is done + if (!toolCalls || toolCalls.length === 0) { + finalAnswer = assistantMessage.content || undefined; + console.log('No tool_calls, model is done. Final answer:', finalAnswer); break; } - for (const action of parsed.actions) { - console.log('Executing action:', action.action_type, action); - - if (action.action_type === 'stop') { - finalAnswer = action.answer; - console.log('Stop action received, final answer:', finalAnswer); - return { messages: conversationMessages, finalAnswer }; + for (const toolCall of toolCalls) { + const actionName = toolCall.function.name; + let args: Record; + try { + args = JSON.parse(toolCall.function.arguments); + } catch { + console.error('Failed to parse tool_call arguments:', toolCall.function.arguments); + conversationMessages.push({ + role: 'tool', + tool_call_id: toolCall.id, + content: 'Error: failed to parse arguments', + }); + continue; } + const action: N1Action = { + action_type: actionName as N1Action['action_type'], + ...args, + }; + + console.log('Executing action:', actionName, args); + const scaledAction = scaleCoordinates(action, viewportWidth, viewportHeight); let result: ToolResult; @@ -156,33 +148,31 @@ export async function samplingLoop({ }; } - if (result.base64Image || result.output) { - const resultContent: MessageContent[] = []; - - if (result.output) { - resultContent.push({ - type: 'text', - text: result.output, - }); - } - - if (result.base64Image) { - resultContent.push({ - type: 'image_url', - image_url: { - url: `data:image/png;base64,${result.base64Image}`, - }, - }); - } - + // Build tool response message + if (result.base64Image) { conversationMessages.push({ - role: 'user', - content: resultContent, + role: 'tool', + tool_call_id: toolCall.id, + content: [ + { + type: 'image_url', + image_url: { + url: `data:image/webp;base64,${result.base64Image}`, + }, + }, + ] as unknown as string, }); } else if (result.error) { conversationMessages.push({ - role: 'user', - content: [{ type: 'text', text: `Action failed: ${result.error}` }], + role: 'tool', + tool_call_id: toolCall.id, + content: `Action failed: ${result.error}`, + }); + } else { + conversationMessages.push({ + role: 'tool', + tool_call_id: toolCall.id, + content: result.output || 'OK', }); } } @@ -198,32 +188,13 @@ export async function samplingLoop({ }; } -function parseN1Response(content: string): { thoughts?: string; actions?: N1Action[] } | null { - try { - // The response should be JSON - const parsed = JSON.parse(content); - return parsed; - } catch { - // Try to extract JSON from the response if it's wrapped in text - const jsonMatch = content.match(/\{[\s\S]*\}/); - if (jsonMatch) { - try { - return JSON.parse(jsonMatch[0]); - } catch { - console.error('Failed to parse action JSON:', jsonMatch[0]); - } - } - return null; - } -} - function scaleCoordinates(action: N1Action, viewportWidth: number, viewportHeight: number): N1Action { const scaled = { ...action }; - if (scaled.center_coordinates) { - scaled.center_coordinates = [ - Math.round((scaled.center_coordinates[0] / 1000) * viewportWidth), - Math.round((scaled.center_coordinates[1] / 1000) * viewportHeight), + if (scaled.coordinates) { + scaled.coordinates = [ + Math.round((scaled.coordinates[0] / 1000) * viewportWidth), + Math.round((scaled.coordinates[1] / 1000) * viewportHeight), ]; } From bed71fa26fa02d4bf96baf9b229ee32b10296758 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Tue, 17 Feb 2026 21:08:37 -0500 Subject: [PATCH 05/18] yutori templates: simplify extractLastAssistantMessage n1-latest returns plain text content (not JSON), so remove JSON parsing from the fallback message extraction in both entrypoints. Co-authored-by: Cursor --- pkg/templates/python/yutori-computer-use/main.py | 12 ++---------- .../typescript/yutori-computer-use/index.ts | 14 ++------------ 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/main.py b/pkg/templates/python/yutori-computer-use/main.py index a86e3a63..fecced28 100644 --- a/pkg/templates/python/yutori-computer-use/main.py +++ b/pkg/templates/python/yutori-computer-use/main.py @@ -79,17 +79,9 @@ async def cua_task( def _extract_last_assistant_message(messages: list) -> str: - import json - for msg in reversed(messages): if msg.get("role") == "assistant": content = msg.get("content") - if isinstance(content, str): - # Try to parse the thoughts from JSON response - try: - parsed = json.loads(content) - if parsed.get("thoughts"): - return parsed["thoughts"] - except json.JSONDecodeError: - return content + if isinstance(content, str) and content: + return content return "Task completed" diff --git a/pkg/templates/typescript/yutori-computer-use/index.ts b/pkg/templates/typescript/yutori-computer-use/index.ts index 6b02a236..7dbf69cc 100644 --- a/pkg/templates/typescript/yutori-computer-use/index.ts +++ b/pkg/templates/typescript/yutori-computer-use/index.ts @@ -73,18 +73,8 @@ app.action( function extractLastAssistantMessage(messages: { role: string; content: string | unknown[] }[]): string { for (let i = messages.length - 1; i >= 0; i--) { const msg = messages[i]; - if (msg.role === 'assistant') { - if (typeof msg.content === 'string') { - // Try to parse the thoughts from JSON response - try { - const parsed = JSON.parse(msg.content); - if (parsed.thoughts) { - return parsed.thoughts; - } - } catch { - return msg.content; - } - } + if (msg.role === 'assistant' && typeof msg.content === 'string' && msg.content) { + return msg.content; } } return 'Task completed'; From 73fc8502255cc74d5e514df8bfadd072a356f5b2 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Tue, 17 Feb 2026 21:09:03 -0500 Subject: [PATCH 06/18] yutori templates: update READMEs for n1-latest - Update viewport docs (now using 1280 directly) - Update action table with new click variants, remove stop - Add WebP screenshot section Co-authored-by: Cursor --- .../python/yutori-computer-use/README.md | 16 +++++++++++----- .../typescript/yutori-computer-use/README.md | 16 +++++++++++----- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/README.md b/pkg/templates/python/yutori-computer-use/README.md index 2f8ec2fa..b3aaba5d 100644 --- a/pkg/templates/python/yutori-computer-use/README.md +++ b/pkg/templates/python/yutori-computer-use/README.md @@ -37,17 +37,24 @@ When enabled, the response will include a `replay_url` field with a link to view ## Viewport Configuration -Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. Kernel's closest supported viewport is **1200×800 at 25Hz**, which this template uses by default. +Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. -> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions. The slight width difference (1200 vs 1280) should have minimal impact on accuracy. +> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions. See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport) for all supported configurations. -## n1 Supported Actions +## Screenshots + +Screenshots are automatically converted to WebP format for better compression across multi-step trajectories, as recommended by Yutori. + +## n1-latest Supported Actions | Action | Description | |--------|-------------| -| `click` | Left mouse click at coordinates | +| `left_click` | Left mouse click at coordinates | +| `double_click` | Double-click at coordinates | +| `triple_click` | Triple-click at coordinates | +| `right_click` | Right mouse click at coordinates | | `scroll` | Scroll page in a direction | | `type` | Type text into focused element | | `key_press` | Send keyboard input | @@ -57,7 +64,6 @@ See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport | `refresh` | Reload current page | | `go_back` | Navigate back in history | | `goto_url` | Navigate to a URL | -| `stop` | End task with final answer | ## Resources diff --git a/pkg/templates/typescript/yutori-computer-use/README.md b/pkg/templates/typescript/yutori-computer-use/README.md index 625c94df..2c3307d9 100644 --- a/pkg/templates/typescript/yutori-computer-use/README.md +++ b/pkg/templates/typescript/yutori-computer-use/README.md @@ -37,17 +37,24 @@ When enabled, the response will include a `replay_url` field with a link to view ## Viewport Configuration -Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. Kernel's closest supported viewport is **1200×800 at 25Hz**, which this template uses by default. +Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. -> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions. The slight width difference (1200 vs 1280) should have minimal impact on accuracy. +> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions. See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport) for all supported configurations. -## n1 Supported Actions +## Screenshots + +Screenshots are automatically converted to WebP format for better compression across multi-step trajectories, as recommended by Yutori. + +## n1-latest Supported Actions | Action | Description | |--------|-------------| -| `click` | Left mouse click at coordinates | +| `left_click` | Left mouse click at coordinates | +| `double_click` | Double-click at coordinates | +| `triple_click` | Triple-click at coordinates | +| `right_click` | Right mouse click at coordinates | | `scroll` | Scroll page in a direction | | `type` | Type text into focused element | | `key_press` | Send keyboard input | @@ -57,7 +64,6 @@ See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport | `refresh` | Reload current page | | `go_back` | Navigate back in history | | `goto_url` | Navigate to a URL | -| `stop` | End task with final answer | ## Resources From 9bc2f02c07fc29ca268be6f7ba252d5867672e04 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Tue, 17 Feb 2026 22:13:09 -0500 Subject: [PATCH 07/18] Ignore .cursor/plans in .gitignore Add a .cursor/plans entry to .gitignore and a 'Cursor' comment header to prevent committing cursor plan files. Also include a blank line for readability. --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 39ae8349..010dc12a 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,9 @@ report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json # Finder (MacOS) folder config .DS_Store + +# Cursor +.cursor/plans/ kernel # QA testing directories From 5f44c7d378f7aa128eebd16c36624c9458308ee5 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Tue, 17 Feb 2026 22:34:07 -0500 Subject: [PATCH 08/18] Rename max_tokens to max_completion_tokens Rename the max_tokens parameter to max_completion_tokens in the yutori-computer-use sampling loop templates (Python and TypeScript). Update function signatures, default values (keeps 4096), interface/property name in TS, and the client.chat.completions.create payload key to use max_completion_tokens. This aligns the parameter name with the completion API field and preserves existing behavior. --- pkg/templates/python/yutori-computer-use/loop.py | 4 ++-- pkg/templates/typescript/yutori-computer-use/loop.ts | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/loop.py b/pkg/templates/python/yutori-computer-use/loop.py index 1e75b406..09e7b6a7 100644 --- a/pkg/templates/python/yutori-computer-use/loop.py +++ b/pkg/templates/python/yutori-computer-use/loop.py @@ -27,7 +27,7 @@ async def sampling_loop( api_key: str, kernel: Kernel, session_id: str, - max_tokens: int = 4096, + max_completion_tokens: int = 4096, max_iterations: int = 50, viewport_width: int = 1280, viewport_height: int = 800, @@ -66,7 +66,7 @@ async def sampling_loop( response = client.chat.completions.create( model=model, messages=conversation_messages, - max_tokens=max_tokens, + max_completion_tokens=max_completion_tokens, temperature=0.3, ) except Exception as api_error: diff --git a/pkg/templates/typescript/yutori-computer-use/loop.ts b/pkg/templates/typescript/yutori-computer-use/loop.ts index 471042cd..6d6d8c22 100644 --- a/pkg/templates/typescript/yutori-computer-use/loop.ts +++ b/pkg/templates/typescript/yutori-computer-use/loop.ts @@ -21,7 +21,7 @@ interface SamplingLoopOptions { apiKey: string; kernel: Kernel; sessionId: string; - maxTokens?: number; + maxCompletionTokens?: number; maxIterations?: number; viewportWidth?: number; viewportHeight?: number; @@ -38,7 +38,7 @@ export async function samplingLoop({ apiKey, kernel, sessionId, - maxTokens = 4096, + maxCompletionTokens = 4096, maxIterations = 50, viewportWidth = 1280, viewportHeight = 800, @@ -81,7 +81,7 @@ export async function samplingLoop({ response = await client.chat.completions.create({ model, messages: conversationMessages, - max_tokens: maxTokens, + max_completion_tokens: maxCompletionTokens, temperature: 0.3, }); } catch (apiError) { From b1a51601640d52e8d536018f29a29c161a4a6ec1 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Tue, 17 Feb 2026 22:45:08 -0500 Subject: [PATCH 09/18] yutori-cua: add kiosk mode option (Step 1) - Payload: optional kiosk (TS) / kiosk (Python) on cua-task - Session: create browser with kiosk_mode when true (TS + Python) - Loop: pass kioskMode/kiosk_mode into sampling loop and ComputerTool - ComputerTool: accept kioskMode/kiosk_mode param (no behavior change yet) - goto_url still uses Computer Controls (Ctrl+L); Playwright path in Step 2 Co-authored-by: Cursor --- pkg/templates/python/yutori-computer-use/loop.py | 3 ++- pkg/templates/python/yutori-computer-use/main.py | 4 ++++ pkg/templates/python/yutori-computer-use/session.py | 4 ++++ .../python/yutori-computer-use/tools/computer.py | 3 ++- pkg/templates/typescript/yutori-computer-use/index.ts | 6 +++++- pkg/templates/typescript/yutori-computer-use/loop.ts | 4 +++- pkg/templates/typescript/yutori-computer-use/session.ts | 8 ++++++++ .../typescript/yutori-computer-use/tools/computer.ts | 4 +++- 8 files changed, 31 insertions(+), 5 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/loop.py b/pkg/templates/python/yutori-computer-use/loop.py index 09e7b6a7..74cdef8b 100644 --- a/pkg/templates/python/yutori-computer-use/loop.py +++ b/pkg/templates/python/yutori-computer-use/loop.py @@ -31,6 +31,7 @@ async def sampling_loop( max_iterations: int = 50, viewport_width: int = 1280, viewport_height: int = 800, + kiosk_mode: bool = False, ) -> dict[str, Any]: """Run the n1 sampling loop until the model stops calling tools or max iterations.""" client = OpenAI( @@ -38,7 +39,7 @@ async def sampling_loop( base_url="https://api.yutori.com/v1", ) - computer_tool = ComputerTool(kernel, session_id, viewport_width, viewport_height) + computer_tool = ComputerTool(kernel, session_id, viewport_width, viewport_height, kiosk_mode=kiosk_mode) initial_screenshot = await computer_tool.screenshot() diff --git a/pkg/templates/python/yutori-computer-use/main.py b/pkg/templates/python/yutori-computer-use/main.py index fecced28..8952af59 100644 --- a/pkg/templates/python/yutori-computer-use/main.py +++ b/pkg/templates/python/yutori-computer-use/main.py @@ -9,6 +9,7 @@ class QueryInput(TypedDict): query: str record_replay: Optional[bool] + kiosk: Optional[bool] class QueryOutput(TypedDict): @@ -46,10 +47,12 @@ async def cua_task( raise ValueError("Query is required") record_replay = payload.get("record_replay", False) + kiosk_mode = payload.get("kiosk", False) async with KernelBrowserSession( stealth=True, record_replay=record_replay, + kiosk_mode=kiosk_mode, ) as session: print("Kernel browser live view url:", session.live_view_url) @@ -61,6 +64,7 @@ async def cua_task( session_id=str(session.session_id), viewport_width=session.viewport_width, viewport_height=session.viewport_height, + kiosk_mode=kiosk_mode, ) final_answer = loop_result.get("final_answer") diff --git a/pkg/templates/python/yutori-computer-use/session.py b/pkg/templates/python/yutori-computer-use/session.py index 42dc0177..74c27deb 100644 --- a/pkg/templates/python/yutori-computer-use/session.py +++ b/pkg/templates/python/yutori-computer-use/session.py @@ -39,6 +39,9 @@ class KernelBrowserSession: record_replay: bool = False replay_grace_period: float = 5.0 # Seconds to wait before stopping replay + # Kiosk mode (hides address bar and tabs in live view) + kiosk_mode: bool = False + # Set after browser creation session_id: Optional[str] = field(default=None, init=False) live_view_url: Optional[str] = field(default=None, init=False) @@ -57,6 +60,7 @@ async def __aenter__(self) -> "KernelBrowserSession": "width": self.viewport_width, "height": self.viewport_height, }, + kiosk_mode=self.kiosk_mode, ) self.session_id = browser.session_id diff --git a/pkg/templates/python/yutori-computer-use/tools/computer.py b/pkg/templates/python/yutori-computer-use/tools/computer.py index 8ff3fe9e..63ad18cb 100644 --- a/pkg/templates/python/yutori-computer-use/tools/computer.py +++ b/pkg/templates/python/yutori-computer-use/tools/computer.py @@ -89,11 +89,12 @@ class N1Action(TypedDict, total=False): class ComputerTool: - def __init__(self, kernel: Kernel, session_id: str, width: int = 1280, height: int = 800): + def __init__(self, kernel: Kernel, session_id: str, width: int = 1280, height: int = 800, kiosk_mode: bool = False): self.kernel = kernel self.session_id = session_id self.width = width self.height = height + self.kiosk_mode = kiosk_mode async def execute(self, action: N1Action) -> ToolResult: action_type = action.get("action_type") diff --git a/pkg/templates/typescript/yutori-computer-use/index.ts b/pkg/templates/typescript/yutori-computer-use/index.ts index 7dbf69cc..b3786f45 100644 --- a/pkg/templates/typescript/yutori-computer-use/index.ts +++ b/pkg/templates/typescript/yutori-computer-use/index.ts @@ -9,6 +9,7 @@ const app = kernel.app('ts-yutori-cua'); interface QueryInput { query: string; record_replay?: boolean; + kiosk?: boolean; } interface QueryOutput { @@ -31,10 +32,12 @@ app.action( throw new Error('Query is required'); } - // Create browser session with optional replay recording + // Create browser session with optional replay recording and kiosk mode + const kioskMode = payload.kiosk ?? false; const session = new KernelBrowserSession(kernel, { stealth: true, recordReplay: payload.record_replay ?? false, + kioskMode, }); await session.start(); @@ -50,6 +53,7 @@ app.action( sessionId: session.sessionId, viewportWidth: session.viewportWidth, viewportHeight: session.viewportHeight, + kioskMode, }); // Extract the result diff --git a/pkg/templates/typescript/yutori-computer-use/loop.ts b/pkg/templates/typescript/yutori-computer-use/loop.ts index 6d6d8c22..d484e36a 100644 --- a/pkg/templates/typescript/yutori-computer-use/loop.ts +++ b/pkg/templates/typescript/yutori-computer-use/loop.ts @@ -25,6 +25,7 @@ interface SamplingLoopOptions { maxIterations?: number; viewportWidth?: number; viewportHeight?: number; + kioskMode?: boolean; } interface SamplingLoopResult { @@ -42,13 +43,14 @@ export async function samplingLoop({ maxIterations = 50, viewportWidth = 1280, viewportHeight = 800, + kioskMode = false, }: SamplingLoopOptions): Promise { const client = new OpenAI({ apiKey, baseURL: 'https://api.yutori.com/v1', }); - const computerTool = new ComputerTool(kernel, sessionId, viewportWidth, viewportHeight); + const computerTool = new ComputerTool(kernel, sessionId, viewportWidth, viewportHeight, kioskMode); const initialScreenshot = await computerTool.screenshot(); diff --git a/pkg/templates/typescript/yutori-computer-use/session.ts b/pkg/templates/typescript/yutori-computer-use/session.ts index d3324f0a..4a0699bd 100644 --- a/pkg/templates/typescript/yutori-computer-use/session.ts +++ b/pkg/templates/typescript/yutori-computer-use/session.ts @@ -20,6 +20,8 @@ export interface SessionOptions { viewportWidth?: number; /** Viewport height */ viewportHeight?: number; + /** Launch browser in kiosk mode (hides address bar and tabs) */ + kioskMode?: boolean; } export interface SessionInfo { @@ -39,6 +41,7 @@ const DEFAULT_OPTIONS: Required = { replayGracePeriod: 5.0, viewportWidth: 1280, viewportHeight: 800, + kioskMode: false, }; /** @@ -98,6 +101,10 @@ export class KernelBrowserSession { return this.options.viewportHeight; } + get kioskMode(): boolean { + return this.options.kioskMode; + } + get info(): SessionInfo { return { sessionId: this.sessionId, @@ -118,6 +125,7 @@ export class KernelBrowserSession { width: this.options.viewportWidth, height: this.options.viewportHeight, }, + kiosk_mode: this.options.kioskMode, }); this._sessionId = browser.session_id; diff --git a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts index 7bdd50aa..8f89fcf9 100644 --- a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts @@ -97,12 +97,14 @@ export class ComputerTool { private sessionId: string; private width: number; private height: number; + private kioskMode: boolean; - constructor(kernel: Kernel, sessionId: string, width = 1280, height = 800) { + constructor(kernel: Kernel, sessionId: string, width = 1280, height = 800, kioskMode = false) { this.kernel = kernel; this.sessionId = sessionId; this.width = width; this.height = height; + this.kioskMode = kioskMode; } async execute(action: N1Action): Promise { From e7b98199d0ba73e58a4efa3f0c48a7e1ff54df68 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Tue, 17 Feb 2026 22:47:46 -0500 Subject: [PATCH 10/18] yutori-cua: use Playwright for goto_url when kiosk mode (Step 2) When kiosk_mode is true, goto_url calls Playwright Execution API (page.goto) instead of Computer Controls so navigation works without the address bar. Non-kiosk unchanged (Ctrl+L + type + Enter). Co-authored-by: Cursor --- .../python/yutori-computer-use/tools/computer.py | 12 ++++++++++++ .../typescript/yutori-computer-use/tools/computer.ts | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/pkg/templates/python/yutori-computer-use/tools/computer.py b/pkg/templates/python/yutori-computer-use/tools/computer.py index 63ad18cb..2d5784ec 100644 --- a/pkg/templates/python/yutori-computer-use/tools/computer.py +++ b/pkg/templates/python/yutori-computer-use/tools/computer.py @@ -7,6 +7,7 @@ import asyncio import base64 +import json from io import BytesIO from typing import Literal, TypedDict @@ -267,6 +268,17 @@ async def _handle_goto_url(self, action: N1Action) -> ToolResult: if not url: raise ToolError("url is required for goto_url action") + if self.kiosk_mode: + response = self.kernel.browsers.playwright.execute( + self.session_id, + code=f"await page.goto({json.dumps(url)});", + timeout_sec=60, + ) + if not response.success: + raise ToolError(response.error or "Playwright goto failed") + await asyncio.sleep(ACTION_DELAY_S) + return await self.screenshot() + self.kernel.browsers.computer.press_key( self.session_id, keys=["ctrl+l"], diff --git a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts index 8f89fcf9..faa966b1 100644 --- a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts @@ -300,6 +300,18 @@ export class ComputerTool { throw new ToolError('url is required for goto_url action'); } + if (this.kioskMode) { + const response = await this.kernel.browsers.playwright.execute(this.sessionId, { + code: `await page.goto(${JSON.stringify(url)});`, + timeout_sec: 60, + }); + if (!response.success) { + return { error: response.error ?? 'Playwright goto failed' }; + } + await this.sleep(ACTION_DELAY_MS); + return this.screenshot(); + } + await this.kernel.browsers.computer.pressKey(this.sessionId, { keys: ['ctrl+l'], }); From 17c73fcee083e0f81f38731c22faf97718d5d3dd Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Tue, 17 Feb 2026 22:54:41 -0500 Subject: [PATCH 11/18] Update Yutori docs and example payloads Clarify Yutori testing instructions by replacing the previous 'computer_use'/'playwright' mode notes with a single guidance to test both the default browser and the 'kiosk: true' (Playwright) option. Update kernel invoke examples (TypeScript and Python) to remove the 'mode' field and use either no mode (default) or 'kiosk': true. Adjust the automated runtime test matrix to list 'default' and 'kiosk: true' entries. These changes simplify the documentation and align examples with the current CLI payload format. --- .cursor/commands/qa.md | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/.cursor/commands/qa.md b/.cursor/commands/qa.md index 7f5a540e..12504189 100644 --- a/.cursor/commands/qa.md +++ b/.cursor/commands/qa.md @@ -60,8 +60,6 @@ Here are all valid language + template combinations: | typescript | claude-agent-sdk | ts-claude-agent-sdk | ts-claude-agent-sdk | Yes | ANTHROPIC_API_KEY | | typescript | yutori-computer-use | ts-yutori-cua | ts-yutori-cua | Yes | YUTORI_API_KEY | -> **Note:** The `yutori-computer-use` template supports two modes: `computer_use` (default, full VM screenshots) and `playwright` (viewport-only screenshots via CDP). Both modes should be tested. - | python | sample-app | py-sample-app | python-basic | No | - | | python | gemini-computer-use | py-gemini-cua | python-gemini-cua | Yes | GOOGLE_API_KEY | | python | captcha-solver | py-captcha-solver | python-captcha-solver | No | - | @@ -72,9 +70,7 @@ Here are all valid language + template combinations: | python | claude-agent-sdk | py-claude-agent-sdk | py-claude-agent-sdk | Yes | ANTHROPIC_API_KEY | | python | yutori-computer-use | py-yutori-cua | python-yutori-cua | Yes | YUTORI_API_KEY | -> **Yutori Modes:** -> - `computer_use` (default): Uses Kernel's Computer Controls API with full VM screenshots -> - `playwright`: Uses Playwright via CDP WebSocket for viewport-only screenshots (optimized for n1 model) +> **Yutori:** Test both default browser and `"kiosk": true` (uses Playwright for goto_url when kiosk is enabled). ### Create Commands @@ -275,8 +271,8 @@ kernel invoke ts-magnitude mag-url-extract --payload '{"url": "https://en.wikipe kernel invoke ts-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}' kernel invoke ts-gemini-cua cua-task --payload '{"query": "Go to https://www.magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved.", "record_replay": true}' kernel invoke ts-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 3 stories"}' -kernel invoke ts-yutori-cua cua-task --payload '{"query": "Go to https://www.magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items.", "record_replay": true, "mode": "computer_use"}' -kernel invoke ts-yutori-cua cua-task --payload '{"query": "Go to https://www.magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items.", "record_replay": true, "mode": "playwright"}' +kernel invoke ts-yutori-cua cua-task --payload '{"query": "Go to https://www.magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items.", "record_replay": true}' +kernel invoke ts-yutori-cua cua-task --payload '{"query": "Go to https://www.magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items.", "record_replay": true, "kiosk": true}' # Python apps kernel invoke python-basic get-page-title --payload '{"url": "https://www.google.com"}' @@ -287,8 +283,8 @@ kernel invoke python-openai-cua cua-task --payload '{"task": "Go to https://news kernel invoke python-openagi-cua openagi-default-task -p '{"instruction": "Navigate to https://agiopen.org and click the What is Computer Use? button"}' kernel invoke py-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 3 stories"}' kernel invoke python-gemini-cua cua-task --payload '{"query": "Go to https://www.magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved.", "record_replay": true}' -kernel invoke python-yutori-cua cua-task --payload '{"query": "Go to https://www.magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items.", "record_replay": true, "mode": "computer_use"}' -kernel invoke python-yutori-cua cua-task --payload '{"query": "Go to https://www.magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items.", "record_replay": true, "mode": "playwright"}' +kernel invoke python-yutori-cua cua-task --payload '{"query": "Go to https://www.magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items.", "record_replay": true}' +kernel invoke python-yutori-cua cua-task --payload '{"query": "Go to https://www.magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items.", "record_replay": true, "kiosk": true}' ``` ## Step 7: Automated Runtime Testing (Optional) @@ -313,8 +309,8 @@ If the human agrees, invoke each template use the Kernel CLI and collect results | ts-openai-cua | ts-openai-cua | | | | ts-gemini-cua | ts-gemini-cua | | | | ts-claude-agent-sdk | ts-claude-agent-sdk | | | -| ts-yutori-cua | ts-yutori-cua | | mode: computer_use | -| ts-yutori-cua | ts-yutori-cua | | mode: playwright | +| ts-yutori-cua | ts-yutori-cua | | default | +| ts-yutori-cua | ts-yutori-cua | | kiosk: true | | py-sample-app | python-basic | | | | py-captcha-solver | python-captcha-solver | | | | py-browser-use | python-bu | | | @@ -323,8 +319,8 @@ If the human agrees, invoke each template use the Kernel CLI and collect results | py-openagi-cua | python-openagi-cua | | | | py-claude-agent-sdk | py-claude-agent-sdk | | | | py-gemini-cua | python-gemini-cua | | | -| py-yutori-cua | python-yutori-cua | | mode: computer_use | -| py-yutori-cua | python-yutori-cua | | mode: playwright | +| py-yutori-cua | python-yutori-cua | | default | +| py-yutori-cua | python-yutori-cua | | kiosk: true | Status values: - **SUCCESS**: App started and returned a result From 5bd582ce4a7cac3612af9bd332df8d1ecf6f8eaa Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Wed, 18 Feb 2026 08:58:26 -0500 Subject: [PATCH 12/18] Add kiosk mode docs and update usage payloads Update Python and TypeScript yutori-computer-use README templates: replace the generic example invoke payload with a concrete Magnitasks kanban drag-and-drop scenario, and add a new "Kiosk mode" section. The new section explains when to use kiosk mode (recording or single-site automation), notes that the agent may still try the address bar which can slow sessions, and provides example invoke commands for default (non-kiosk) and kiosk usage. Changes applied to pkg/templates/python/yutori-computer-use/README.md and pkg/templates/typescript/yutori-computer-use/README.md to clarify usage and improve replay/automation guidance. --- .../python/yutori-computer-use/README.md | 20 ++++++++++++++++++- .../typescript/yutori-computer-use/README.md | 20 ++++++++++++++++++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/README.md b/pkg/templates/python/yutori-computer-use/README.md index b3aaba5d..7523aff2 100644 --- a/pkg/templates/python/yutori-computer-use/README.md +++ b/pkg/templates/python/yutori-computer-use/README.md @@ -20,7 +20,7 @@ kernel deploy main.py --env-file .env ## Usage ```bash -kernel invoke python-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}' +kernel invoke python-yutori-cua cua-task --payload '{"query": "Go to https://www.magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items."}' ``` ## Recording Replays @@ -35,6 +35,24 @@ kernel invoke python-yutori-cua cua-task --payload '{"query": "Navigate to https When enabled, the response will include a `replay_url` field with a link to view the recorded session. +## Kiosk mode + +Prefer **non-kiosk mode** by default and when the agent is expected to switch domains via URL. Use **kiosk (`"kiosk": true`)** when: (1) you're recording sessions and want a cleaner UI in the replay, or (2) you're automating on a single website and the combination of the complex site layout and browser chrome (address bar, tabs) may confuse the agent. + +Note: In kiosk mode the agent may still try to use the address bar to enter URLs; it's not available, so it will eventually use `goto_url`, but those attempts may result in slowdown of the overall session. + +Default (non-kiosk): + +```bash +kernel invoke python-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com, then navigate to ign.com and describe the page"}' +``` + +With kiosk (single-site or recording): + +```bash +kernel invoke python-yutori-cua cua-task --payload '{"query": "Enter https://example.com in the search box and then describe the page.", "kiosk": true}' +``` + ## Viewport Configuration Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. diff --git a/pkg/templates/typescript/yutori-computer-use/README.md b/pkg/templates/typescript/yutori-computer-use/README.md index 2c3307d9..92c009d3 100644 --- a/pkg/templates/typescript/yutori-computer-use/README.md +++ b/pkg/templates/typescript/yutori-computer-use/README.md @@ -20,7 +20,7 @@ kernel deploy index.ts --env-file .env ## Usage ```bash -kernel invoke ts-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}' +kernel invoke ts-yutori-cua cua-task --payload '{"query": "Go to https://www.magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items."}' ``` ## Recording Replays @@ -35,6 +35,24 @@ kernel invoke ts-yutori-cua cua-task --payload '{"query": "Navigate to https://e When enabled, the response will include a `replay_url` field with a link to view the recorded session. +## Kiosk mode + +Prefer **non-kiosk mode** by default and when the agent is expected to switch domains via URL. Use **kiosk (`"kiosk": true`)** when: (1) you're recording sessions and want a cleaner UI in the replay, or (2) you're automating on a single website and the combination of the complex site layout and browser chrome (address bar, tabs) may confuse the agent. + +Note: In kiosk mode the agent may still try to use the address bar to enter URLs; it's not available, so it will eventually use `goto_url`, but those attempts may result in slowdown of the overall session. + +Default (non-kiosk): + +```bash +kernel invoke ts-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com, then navigate to ign.com and describe the page"}' +``` + +With kiosk (single-site or recording): + +```bash +kernel invoke ts-yutori-cua cua-task --payload '{"query": "Enter https://example.com in the search box and then describe the page.", "kiosk": true}' +``` + ## Viewport Configuration Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. From 3541bbb0623939ccd1e4d3689c0cd40d48ddf130 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Wed, 18 Feb 2026 09:36:53 -0500 Subject: [PATCH 13/18] Deslop Remove 'Build tool response message' comments --- pkg/templates/python/yutori-computer-use/loop.py | 1 - pkg/templates/typescript/yutori-computer-use/loop.ts | 1 - 2 files changed, 2 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/loop.py b/pkg/templates/python/yutori-computer-use/loop.py index 74cdef8b..73bb3aab 100644 --- a/pkg/templates/python/yutori-computer-use/loop.py +++ b/pkg/templates/python/yutori-computer-use/loop.py @@ -137,7 +137,6 @@ async def sampling_loop( print(f"Action failed: {e}") result = {"error": str(e)} - # Build tool response message if result.get("base64_image"): conversation_messages.append({ "role": "tool", diff --git a/pkg/templates/typescript/yutori-computer-use/loop.ts b/pkg/templates/typescript/yutori-computer-use/loop.ts index d484e36a..1280e0cb 100644 --- a/pkg/templates/typescript/yutori-computer-use/loop.ts +++ b/pkg/templates/typescript/yutori-computer-use/loop.ts @@ -150,7 +150,6 @@ export async function samplingLoop({ }; } - // Build tool response message if (result.base64Image) { conversationMessages.push({ role: 'tool', From b31643bdf68a726b1ff7c0eeab6b7bf03258affb Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Wed, 18 Feb 2026 09:52:09 -0500 Subject: [PATCH 14/18] Throw ToolError on Playwright goto failure Replace returning an error object with throwing a ToolError when Playwright goto fails in ComputerTool. This standardizes error handling so callers can catch exceptions and preserves the original response error or falls back to 'Playwright goto failed'. --- pkg/templates/typescript/yutori-computer-use/tools/computer.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts index faa966b1..59b60d16 100644 --- a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts @@ -306,7 +306,7 @@ export class ComputerTool { timeout_sec: 60, }); if (!response.success) { - return { error: response.error ?? 'Playwright goto failed' }; + throw new ToolError(response.error ?? 'Playwright goto failed'); } await this.sleep(ACTION_DELAY_MS); return this.screenshot(); From 17222682fd4736ad8d7acf6311a7d276443eab3e Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Wed, 18 Feb 2026 18:09:17 -0500 Subject: [PATCH 15/18] Use bare raise to preserve original traceback in Python loop Co-authored-by: Cursor --- pkg/templates/python/yutori-computer-use/loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/templates/python/yutori-computer-use/loop.py b/pkg/templates/python/yutori-computer-use/loop.py index 73bb3aab..fd72d205 100644 --- a/pkg/templates/python/yutori-computer-use/loop.py +++ b/pkg/templates/python/yutori-computer-use/loop.py @@ -72,7 +72,7 @@ async def sampling_loop( ) except Exception as api_error: print(f"API call failed: {api_error}") - raise api_error + raise if not response.choices or len(response.choices) == 0: print(f"No choices in response: {response}") From 639c0faaa20f45822e047b7e60248df843ee7b73 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Wed, 18 Feb 2026 18:09:34 -0500 Subject: [PATCH 16/18] Replace double type cast with @ts-expect-error for Yutori image content Co-authored-by: Cursor --- pkg/templates/typescript/yutori-computer-use/loop.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/templates/typescript/yutori-computer-use/loop.ts b/pkg/templates/typescript/yutori-computer-use/loop.ts index 1280e0cb..aa1d2957 100644 --- a/pkg/templates/typescript/yutori-computer-use/loop.ts +++ b/pkg/templates/typescript/yutori-computer-use/loop.ts @@ -154,6 +154,7 @@ export async function samplingLoop({ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, + // @ts-expect-error Yutori n1 accepts image content arrays in tool messages content: [ { type: 'image_url', @@ -161,7 +162,7 @@ export async function samplingLoop({ url: `data:image/webp;base64,${result.base64Image}`, }, }, - ] as unknown as string, + ], }); } else if (result.error) { conversationMessages.push({ From 6d878f32854d79f1f811ecf20b8da0146a9d6705 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Wed, 18 Feb 2026 18:09:47 -0500 Subject: [PATCH 17/18] Use model_dump for assistant message serialization in Python loop Co-authored-by: Cursor --- .../python/yutori-computer-use/loop.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/pkg/templates/python/yutori-computer-use/loop.py b/pkg/templates/python/yutori-computer-use/loop.py index fd72d205..066aafb5 100644 --- a/pkg/templates/python/yutori-computer-use/loop.py +++ b/pkg/templates/python/yutori-computer-use/loop.py @@ -85,24 +85,7 @@ async def sampling_loop( print("Assistant content:", assistant_message.content or "(none)") - # Preserve full assistant message (including tool_calls) in history - assistant_dict: dict[str, Any] = { - "role": "assistant", - "content": assistant_message.content or "", - } - if assistant_message.tool_calls: - assistant_dict["tool_calls"] = [ - { - "id": tc.id, - "type": tc.type, - "function": { - "name": tc.function.name, - "arguments": tc.function.arguments, - }, - } - for tc in assistant_message.tool_calls - ] - conversation_messages.append(assistant_dict) + conversation_messages.append(assistant_message.model_dump(exclude_none=True)) tool_calls = assistant_message.tool_calls From c3757abc2710c834019fb1e1ae3b8218f32fabbb Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Wed, 18 Feb 2026 18:16:26 -0500 Subject: [PATCH 18/18] Keep as-unknown-as-string cast with explanatory comment for Yutori image content @ts-expect-error cannot suppress errors reported deep inside multi-line object literals. Restore the working cast and document the API extension. Co-authored-by: Cursor --- pkg/templates/typescript/yutori-computer-use/loop.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/templates/typescript/yutori-computer-use/loop.ts b/pkg/templates/typescript/yutori-computer-use/loop.ts index aa1d2957..e0f94795 100644 --- a/pkg/templates/typescript/yutori-computer-use/loop.ts +++ b/pkg/templates/typescript/yutori-computer-use/loop.ts @@ -154,7 +154,7 @@ export async function samplingLoop({ conversationMessages.push({ role: 'tool', tool_call_id: toolCall.id, - // @ts-expect-error Yutori n1 accepts image content arrays in tool messages + // Yutori n1 accepts image content arrays in tool messages (not yet in OpenAI SDK types) content: [ { type: 'image_url', @@ -162,7 +162,7 @@ export async function samplingLoop({ url: `data:image/webp;base64,${result.base64Image}`, }, }, - ], + ] as unknown as string, }); } else if (result.error) { conversationMessages.push({