diff --git a/agents/__tests__/commander.test.ts b/agents/__tests__/basher.test.ts similarity index 98% rename from agents/__tests__/commander.test.ts rename to agents/__tests__/basher.test.ts index 7db0319f72..282d5571c4 100644 --- a/agents/__tests__/commander.test.ts +++ b/agents/__tests__/basher.test.ts @@ -1,6 +1,6 @@ import { describe, test, expect } from 'bun:test' -import commander from '../commander' +import commander from '../basher' import type { AgentState } from '../types/agent-definition' import type { ToolResultOutput } from '../types/util-types' @@ -19,11 +19,11 @@ describe('commander agent', () => { describe('definition', () => { test('has correct id', () => { - expect(commander.id).toBe('commander') + expect(commander.id).toBe('basher') }) test('has display name', () => { - expect(commander.displayName).toBe('Commander') + expect(commander.displayName).toBe('Basher') }) test('uses flash-lite model', () => { diff --git a/agents/base2/base-deep.ts b/agents/base2/base-deep.ts index ab35b44735..58e780eb55 100644 --- a/agents/base2/base-deep.ts +++ b/agents/base2/base-deep.ts @@ -32,7 +32,7 @@ Use the spawn_agents tool to spawn specialized agents to help you complete the u - Spawn the thinker-gpt after gathering context to solve complex problems or when the user asks you to think about a problem. (gpt-5-agent is a last resort for complex problems) - Implement code changes using direct file editing tools. - Prefer apply_patch for existing-file edits. Use write_file only for creating or replacing entire files when that is simpler. - - Spawn commanders sequentially if the second command depends on the the first. + - Spawn bashers sequentially if the second command depends on the the first. - **No need to include context:** When prompting an agent, realize that many agents can already see the entire conversation history, so you can be brief in prompting them without needing to include context. 
- **Never spawn the context-pruner agent:** This agent is spawned automatically for you and you don't need to spawn it yourself. @@ -199,7 +199,7 @@ Iteratively review until the code is clean: Thoroughly validate the changes: -1. Run any existing unit tests that cover the modified code (spawn commanders in parallel for typechecks, tests, lints as appropriate). +1. Run any existing unit tests that cover the modified code (spawn bashers in parallel for typechecks, tests, lints as appropriate). 2. Write and run additional unit tests for new functionality. Fix any test failures. 3. You MUST attempt end-to-end verification: use tools to run the actual application (or equivalent) and verify the changes work in practice. For example: - For a web app: start the server and check the relevant endpoints @@ -298,7 +298,7 @@ export function createBaseDeep(options?: { 'glob-matcher', 'researcher-web', 'researcher-docs', - 'commander', + 'basher', 'thinker-gpt', 'code-reviewer-gpt', 'gpt-5-agent', diff --git a/agents/base2/base2.ts b/agents/base2/base2.ts index f83ba93495..ba313e1347 100644 --- a/agents/base2/base2.ts +++ b/agents/base2/base2.ts @@ -76,7 +76,7 @@ export function createBase2( isMax && 'file-picker-max', 'researcher-web', 'researcher-docs', - isFree ? 'commander-lite' : 'commander', + 'basher', isDefault && 'thinker', (isDefault || isMax) && ['opus-agent', 'gpt-5-agent'], isMax && 'thinker-best-of-n-opus', @@ -125,7 +125,7 @@ export function createBase2( - Create an impressive demonstration showcasing web development capabilities - **Refactoring Awareness:** Whenever you modify an exported symbol like a function or class or variable, you should find and update all the references to it appropriately using the code_search tool. - **Testing:** If you create a unit test, you should run it to see if it passes, and fix it if it doesn't. 
-- **Package Management:** When adding new packages, use the commander agent to install the package rather than editing the package.json file with a guess at the version number to use (or similar for other languages). This way, you will be sure to have the latest version of the package. Do not install packages globally unless asked by the user (e.g. Don't run \`npm install -g \`). Always try to use the package manager associated with the project (e.g. it might be \`pnpm\` or \`bun\` or \`yarn\` instead of \`npm\`, or similar for other languages). +- **Package Management:** When adding new packages, use the basher agent to install the package rather than editing the package.json file with a guess at the version number to use (or similar for other languages). This way, you will be sure to have the latest version of the package. Do not install packages globally unless asked by the user (e.g. Don't run \`npm install -g \`). Always try to use the package manager associated with the project (e.g. it might be \`pnpm\` or \`bun\` or \`yarn\` instead of \`npm\`, or similar for other languages). - **Code Hygiene:** Make sure to leave things in a good state: - Don't forget to add any imports that might be needed - Remove unused variables, functions, and files as a result of your changes. @@ -152,7 +152,7 @@ Use the spawn_agents tool to spawn specialized agents to help you complete the u '- Implement code changes using the str_replace or write_file tools directly.', isFree && '- Spawn a code-reviewer-lite to review the changes after you have implemented the changes.', - '- Spawn commanders sequentially if the second command depends on the the first.', + '- Spawn bashers sequentially if the second command depends on the the first.', isDefault && '- Spawn a code-reviewer to review the changes after you have implemented the changes.', isMax && @@ -213,12 +213,12 @@ ${isDefault } ${isDefault - ? 
`[ You spawn a code-reviewer, a commander to typecheck the changes, and another commander to run tests, all in parallel ]` + ? `[ You spawn a code-reviewer, a basher to typecheck the changes, and another basher to run tests, all in parallel ]` : isFree - ? `[ You spawn a code-reviewer-lite to review the changes, and a commander to typecheck the changes, and another commander to run tests, all in parallel ]` + ? `[ You spawn a code-reviewer-lite to review the changes, and a basher to typecheck the changes, and another basher to run tests, all in parallel ]` : isMax - ? `[ You spawn a commander to typecheck the changes, and another commander to run tests, in parallel. Then, you spawn a code-reviewer-multi-prompt to review the changes. ]` - : '[ You spawn a commander to typecheck the changes and another commander to run tests, all in parallel ]' + ? `[ You spawn a basher to typecheck the changes, and another basher to run tests, in parallel. Then, you spawn a code-reviewer-multi-prompt to review the changes. ]` + : '[ You spawn a basher to typecheck the changes and another basher to run tests, all in parallel ]' } ${isDefault @@ -227,7 +227,7 @@ ${isDefault ? `[ You fix the issues found by the code-reviewer-lite and type/test errors ]` : isMax ? `[ You fix the issues found by the code-reviewer-multi-prompt and type/test errors ]` - : '[ You fix the issues found by the type/test errors and spawn more commanders to confirm ]' + : '[ You fix the issues found by the type/test errors and spawn more bashers to confirm ]' } [ All tests & typechecks pass -- you write a very short final summary of the changes you made ] @@ -298,7 +298,7 @@ ${PLACEHOLDER.GIT_CHANGES_PROMPT} } } -const EXPLORE_PROMPT = `- Iteratively spawn file pickers, commanders, and web/docs researchers to gather context as needed. Use the code_search, list_directory, and glob tools directly for searching and exploring the codebase. 
The file-picker agent in particular is very useful to find relevant files -- try spawning multiple in parallel (say, 2-5) to explore different parts of the codebase. Use read_subtree if you need to grok a particular part of the codebase. Read all the relevant files using the read_files tool.` +const EXPLORE_PROMPT = `- Iteratively spawn file pickers, bashers, and web/docs researchers to gather context as needed. Use the code_search, list_directory, and glob tools directly for searching and exploring the codebase. The file-picker agent in particular is very useful to find relevant files -- try spawning multiple in parallel (say, 2-5) to explore different parts of the codebase. Use read_subtree if you need to grok a particular part of the codebase. Read all the relevant files using the read_files tool.` function buildImplementationInstructionsPrompt({ isSonnet, diff --git a/agents/commander.ts b/agents/basher.ts similarity index 93% rename from agents/commander.ts rename to agents/basher.ts index 41357ed660..671437bff1 100644 --- a/agents/commander.ts +++ b/agents/basher.ts @@ -5,13 +5,13 @@ import type { AgentStepContext, } from './types/agent-definition' -const commander: AgentDefinition = { - id: 'commander', +const basher: AgentDefinition = { + id: 'basher', publisher, model: 'google/gemini-3.1-flash-lite-preview', - displayName: 'Commander', + displayName: 'Basher', spawnerPrompt: - 'Runs a single terminal command and describes its output using an LLM based on what information is requested.', + 'Runs a single terminal command and describes its output using an LLM. A lightweight shell command executor.', inputSchema: { prompt: { @@ -64,7 +64,7 @@ Do not use any tools! 
Only analyze the output of the command.`, const command = params?.command as string | undefined if (!command) { // Using console.error because agents run in a sandboxed environment without access to structured logger - console.error('Commander agent: missing required "command" parameter') + console.error('Basher agent: missing required "command" parameter') yield { toolName: 'set_output', input: { output: 'Error: Missing required "command" parameter' }, @@ -102,4 +102,4 @@ Do not use any tools! Only analyze the output of the command.`, }, } -export default commander +export default basher diff --git a/agents/commander-lite.ts b/agents/commander-lite.ts deleted file mode 100644 index 87206223ca..0000000000 --- a/agents/commander-lite.ts +++ /dev/null @@ -1,12 +0,0 @@ -import commander from './commander' - -import type { AgentDefinition } from './types/agent-definition' - -const definition: AgentDefinition = { - ...commander, - id: 'commander-lite', - displayName: 'Commander Lite', - model: 'google/gemini-3.1-flash-lite-preview', -} - -export default definition diff --git a/agents/context-pruner.ts b/agents/context-pruner.ts index dbb3c3cc57..bbf495baa1 100644 --- a/agents/context-pruner.ts +++ b/agents/context-pruner.ts @@ -299,8 +299,7 @@ const definition: AgentDefinition = { 'file-picker', 'researcher-web', 'researcher-docs', - 'commander', - 'commander-lite', + 'basher', 'code-reviewer', 'code-reviewer-multi-prompt', ] diff --git a/agents/general-agent/general-agent.ts b/agents/general-agent/general-agent.ts index f13f5f0945..26f2099589 100644 --- a/agents/general-agent/general-agent.ts +++ b/agents/general-agent/general-agent.ts @@ -56,7 +56,7 @@ export const createGeneralAgent = (options: { 'code-searcher', 'directory-lister', 'glob-matcher', - 'commander', + 'basher', 'context-pruner', ), toolNames: [ @@ -69,7 +69,7 @@ export const createGeneralAgent = (options: { instructionsPrompt: buildArray( `Use the spawn_agents tool to spawn agents to help you complete 
the user request.`, - !isGpt5 && `If you need to find more information in the codebase, file-picker is really good at finding relevant files. You should spawn multiple agents in parallel when possible to speed up the process. (e.g. spawn 3 file-pickers + 1 code-searcher + 1 researcher-web in one spawn_agents call or 3 commanders in one spawn_agents call).`, + !isGpt5 && `If you need to find more information in the codebase, file-picker is really good at finding relevant files. You should spawn multiple agents in parallel when possible to speed up the process. (e.g. spawn 3 file-pickers + 1 code-searcher + 1 researcher-web in one spawn_agents call or 3 bashers in one spawn_agents call).`, ).join('\n'), handleSteps: function* ({ params }) { diff --git a/bun.lock b/bun.lock index 46d56e6c72..c9c10fdbe6 100644 --- a/bun.lock +++ b/bun.lock @@ -51,8 +51,8 @@ "dependencies": { "@codebuff/sdk": "workspace:*", "@gravity-ai/api": "^0.1.2", - "@opentui/core": "0.1.74", - "@opentui/react": "0.1.74", + "@opentui/core": "0.1.87", + "@opentui/react": "0.1.87", "@tanstack/react-query": "^5.90.12", "commander": "^14.0.1", "immer": "^10.1.3", @@ -1012,21 +1012,21 @@ "@opentelemetry/semantic-conventions": ["@opentelemetry/semantic-conventions@1.38.0", "", {}, "sha512-kocjix+/sSggfJhwXqClZ3i9Y/MI0fp7b+g7kCRm6psy2dsf8uApTRclwG18h8Avm7C9+fnt+O36PspJ/OzoWg=="], - "@opentui/core": ["@opentui/core@0.1.74", "", { "dependencies": { "bun-ffi-structs": "0.1.2", "diff": "8.0.2", "jimp": "1.6.0", "yoga-layout": "3.2.1" }, "optionalDependencies": { "@dimforge/rapier2d-simd-compat": "^0.17.3", "@opentui/core-darwin-arm64": "0.1.74", "@opentui/core-darwin-x64": "0.1.74", "@opentui/core-linux-arm64": "0.1.74", "@opentui/core-linux-x64": "0.1.74", "@opentui/core-win32-arm64": "0.1.74", "@opentui/core-win32-x64": "0.1.74", "bun-webgpu": "0.1.4", "planck": "^1.4.2", "three": "0.177.0" }, "peerDependencies": { "web-tree-sitter": "0.25.10" } }, 
"sha512-g4W16ymv12JdgZ+9B4t7mpIICvzWy2+eHERfmDf80ALduOQCUedKQdULcBFhVCYUXIkDRtIy6CID5thMAah3FA=="], + "@opentui/core": ["@opentui/core@0.1.87", "", { "dependencies": { "bun-ffi-structs": "0.1.2", "diff": "8.0.2", "jimp": "1.6.0", "marked": "17.0.1", "yoga-layout": "3.2.1" }, "optionalDependencies": { "@dimforge/rapier2d-simd-compat": "^0.17.3", "@opentui/core-darwin-arm64": "0.1.87", "@opentui/core-darwin-x64": "0.1.87", "@opentui/core-linux-arm64": "0.1.87", "@opentui/core-linux-x64": "0.1.87", "@opentui/core-win32-arm64": "0.1.87", "@opentui/core-win32-x64": "0.1.87", "bun-webgpu": "0.1.5", "planck": "^1.4.2", "three": "0.177.0" }, "peerDependencies": { "web-tree-sitter": "0.25.10" } }, "sha512-dhsmMv0IqKftwG7J/pBrLBj2armsYIg5R3LBvciRQI/6X89GufP4l1u0+QTACAx6iR4SYJJNVNQ2tdX8LM9rMw=="], - "@opentui/core-darwin-arm64": ["@opentui/core-darwin-arm64@0.1.74", "", { "os": "darwin", "cpu": "arm64" }, "sha512-rfmlDLtm/u17CnuhJgCxPeYMvOST+A2MOdVOk46IurtHO849bdYqK6iudKNlFRs1FOrymgSKF9GlWBHAOKeRjg=="], + "@opentui/core-darwin-arm64": ["@opentui/core-darwin-arm64@0.1.87", "", { "os": "darwin", "cpu": "arm64" }, "sha512-G8oq85diOfkU6n0T1CxCle7oDmpKxwhcdhZ9khBMU5IrfLx9ZDuCM3F6MsiRQWdvPPCq2oomNbd64bYkPamYgw=="], - "@opentui/core-darwin-x64": ["@opentui/core-darwin-x64@0.1.74", "", { "os": "darwin", "cpu": "x64" }, "sha512-WAD8orsDV0ZdW/5GwjOOB4FY96772xbkz+rcV7WRzEFUVaqoBaC04IuqYzS9d5s+cjkbT5Cpj47hrVYkkVQKng=="], + "@opentui/core-darwin-x64": ["@opentui/core-darwin-x64@0.1.87", "", { "os": "darwin", "cpu": "x64" }, "sha512-MYTFQfOHm6qO7YaY4GHK9u/oJlXY6djaaxl5I+k4p2mk3vvuFIl/AP1ypITwBFjyV5gyp7PRWFp4nGfY9oN8bw=="], - "@opentui/core-linux-arm64": ["@opentui/core-linux-arm64@0.1.74", "", { "os": "linux", "cpu": "arm64" }, "sha512-lgmHzrzLy4e+rgBS+lhtsMLLgIMLbtLNMm6EzVPyYVDlLDGjM7+ulXMem7AtpaRrWrUUl4REiG9BoQUsCFDwYA=="], + "@opentui/core-linux-arm64": ["@opentui/core-linux-arm64@0.1.87", "", { "os": "linux", "cpu": "arm64" }, 
"sha512-he8o1h5M6oskRJ7wE+xKJgmWnv5ZwN6gB3M/Z+SeHtOMPa5cZmi3TefTjG54llEgFfx0F9RcqHof7TJ/GNxRkw=="], - "@opentui/core-linux-x64": ["@opentui/core-linux-x64@0.1.74", "", { "os": "linux", "cpu": "x64" }, "sha512-8Mn2WbdBQ29xCThuPZezjDhd1N3+fXwKkGvCBOdTI0le6h2A/vCNbfUVjwfr/EGZSRXxCG+Yapol34BAULGpOA=="], + "@opentui/core-linux-x64": ["@opentui/core-linux-x64@0.1.87", "", { "os": "linux", "cpu": "x64" }, "sha512-aiUwjPlH4yDcB8/6YDKSmMkaoGAAltL0Xo0AzXyAtJXWK5tkCSaYjEVwzJ/rYRkr4Magnad+Mjth4AQUWdR2AA=="], - "@opentui/core-win32-arm64": ["@opentui/core-win32-arm64@0.1.74", "", { "os": "win32", "cpu": "arm64" }, "sha512-dvYUXz03avnI6ZluyLp00HPmR0UT/IE/6QS97XBsgJlUTtpnbKkBtB5jD1NHwWkElaRj1Qv2QP36ngFoJqbl9g=="], + "@opentui/core-win32-arm64": ["@opentui/core-win32-arm64@0.1.87", "", { "os": "win32", "cpu": "arm64" }, "sha512-cmP0pOyREjWGniHqbDmaMY7U+1AyagrD8VseJbU0cGpNgVpG2/gbrJUGdfdLB0SNb+mzLdx6SOjdxtrElwRCQA=="], - "@opentui/core-win32-x64": ["@opentui/core-win32-x64@0.1.74", "", { "os": "win32", "cpu": "x64" }, "sha512-3wfWXaAKOIlDQz6ZZIESf2M+YGZ7uFHijjTEM8w/STRlLw8Y6+QyGYi1myHSM4d6RSO+/s2EMDxvjDf899W9vQ=="], + "@opentui/core-win32-x64": ["@opentui/core-win32-x64@0.1.87", "", { "os": "win32", "cpu": "x64" }, "sha512-N2GErAAP8iODf2RPp86pilPaVKiD6G4pkpZL5nLGbKsl0bndrVTpSqZcn8+/nQwFZDPD/AsiRTYNOfWOblhzOw=="], - "@opentui/react": ["@opentui/react@0.1.74", "", { "dependencies": { "@opentui/core": "0.1.74", "react-reconciler": "^0.32.0" }, "peerDependencies": { "react": ">=19.0.0", "react-devtools-core": "^7.0.1", "ws": "^8.18.0" } }, "sha512-2wiTVtBcbjNuWJjVDaSNdfVM9x9Cs7U+wCRPMmzVrYYCbWGjYQcA0Ump+XSKJpN+swzZRDBYHIw9xBlgUUnoLw=="], + "@opentui/react": ["@opentui/react@0.1.87", "", { "dependencies": { "@opentui/core": "0.1.87", "react-reconciler": "^0.32.0" }, "peerDependencies": { "react": ">=19.0.0", "react-devtools-core": "^7.0.1", "ws": "^8.18.0" } }, "sha512-FTYYs/L2AbcJbCvezlK9Klsw45AbGkwpyfjNsHP0N3BIxc3QiI5pYFpre6ZSq0feJNODmg+s9UapTCv4LtfROg=="], "@panva/hkdf": 
["@panva/hkdf@1.2.1", "", {}, "sha512-6oclG6Y3PiDFcoyk8srjLfVKyMfVCKJ27JwNPViuXziFpmdz+MZnZN/aKY0JGXgYuO/VghU0jcOAZgWXZ1Dmrw=="], @@ -1598,15 +1598,15 @@ "bun-types": ["bun-types@1.3.5", "", { "dependencies": { "@types/node": "*" } }, "sha512-inmAYe2PFLs0SUbFOWSVD24sg1jFlMPxOjOSSCYqUgn4Hsc3rDc7dFvfVYjFPNHtov6kgUeulV4SxbuIV/stPw=="], - "bun-webgpu": ["bun-webgpu@0.1.4", "", { "dependencies": { "@webgpu/types": "^0.1.60" }, "optionalDependencies": { "bun-webgpu-darwin-arm64": "^0.1.4", "bun-webgpu-darwin-x64": "^0.1.4", "bun-webgpu-linux-x64": "^0.1.4", "bun-webgpu-win32-x64": "^0.1.4" } }, "sha512-Kw+HoXl1PMWJTh9wvh63SSRofTA8vYBFCw0XEP1V1fFdQEDhI8Sgf73sdndE/oDpN/7CMx0Yv/q8FCvO39ROMQ=="], + "bun-webgpu": ["bun-webgpu@0.1.5", "", { "dependencies": { "@webgpu/types": "^0.1.60" }, "optionalDependencies": { "bun-webgpu-darwin-arm64": "^0.1.5", "bun-webgpu-darwin-x64": "^0.1.5", "bun-webgpu-linux-x64": "^0.1.5", "bun-webgpu-win32-x64": "^0.1.5" } }, "sha512-91/K6S5whZKX7CWAm9AylhyKrLGRz6BUiiPiM/kXadSnD4rffljCD/q9cNFftm5YXhx4MvLqw33yEilxogJvwA=="], - "bun-webgpu-darwin-arm64": ["bun-webgpu-darwin-arm64@0.1.4", "", { "os": "darwin", "cpu": "arm64" }, "sha512-eDgLN9teKTfmvrCqgwwmWNsNszxYs7IZdCqk0S1DCarvMhr4wcajoSBlA/nQA0/owwLduPTS8xxCnQp4/N/gDg=="], + "bun-webgpu-darwin-arm64": ["bun-webgpu-darwin-arm64@0.1.5", "", { "os": "darwin", "cpu": "arm64" }, "sha512-qM7W5IaFpWYGPDcNiQ8DOng3noQ97gxpH2MFH1mGsdKwI0T4oy++egSh5Z7s6AQx8WKgc9GzAsTUM4KZkFdacw=="], - "bun-webgpu-darwin-x64": ["bun-webgpu-darwin-x64@0.1.4", "", { "os": "darwin", "cpu": "x64" }, "sha512-X+PjwJUWenUmdQBP8EtdItMyieQ6Nlpn+BH518oaouDiSnWj5+b0Y7DNDZJq7Ezom4EaxmqL/uGYZK3aCQ7CXg=="], + "bun-webgpu-darwin-x64": ["bun-webgpu-darwin-x64@0.1.5", "", { "os": "darwin", "cpu": "x64" }, "sha512-oVoIsme27pcXB68YxnQSAgdNGCa4A3PGWYIBUewOh9VnJaoik4JenGb5Yy+svGE+ETFhQXV9nhHqgMPsDRrO6A=="], - "bun-webgpu-linux-x64": ["bun-webgpu-linux-x64@0.1.4", "", { "os": "linux", "cpu": "x64" }, 
"sha512-zMLs2YIGB+/jxrYFXaFhVKX/GBt05UTF45lc9srcHc9JXGjEj+12CIo1CHLTAWatXMTqt0Jsu6ukWEoWVT/ayA=="], + "bun-webgpu-linux-x64": ["bun-webgpu-linux-x64@0.1.5", "", { "os": "linux", "cpu": "x64" }, "sha512-+SYt09k+xDEl/GfcU7L1zdNgm7IlvAFKV5Xl/auBwuprKG5UwXNhjRlRAWfhTMCUZWN+NDf8E+ZQx0cQi9K2/g=="], - "bun-webgpu-win32-x64": ["bun-webgpu-win32-x64@0.1.4", "", { "os": "win32", "cpu": "x64" }, "sha512-Z5yAK28xrcm8Wb5k7TZ8FJKpOI/r+aVCRdlHYAqI2SDJFN3nD4mJs900X6kNVmG/xFzb5yOuKVYWGg+6ZXWbyA=="], + "bun-webgpu-win32-x64": ["bun-webgpu-win32-x64@0.1.5", "", { "os": "win32", "cpu": "x64" }, "sha512-zvnUl4EAsQbKsmZVu+lEJcH8axQ7MiCfqg2OmnHd6uw1THABmHaX0GbpKiHshdgadNN2Nf+4zDyTJB5YMcAdrA=="], "bundle-name": ["bundle-name@4.1.0", "", { "dependencies": { "run-applescript": "^7.0.0" } }, "sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q=="], @@ -2640,7 +2640,7 @@ "markdown-table": ["markdown-table@3.0.4", "", {}, "sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw=="], - "marked": ["marked@16.4.1", "", { "bin": { "marked": "bin/marked.js" } }, "sha512-ntROs7RaN3EvWfy3EZi14H4YxmT6A5YvywfhO+0pm+cH/dnSQRmdAmoFIc3B9aiwTehyk7pESH4ofyBY+V5hZg=="], + "marked": ["marked@17.0.1", "", { "bin": { "marked": "bin/marked.js" } }, "sha512-boeBdiS0ghpWcSwoNm/jJBwdpFaMnZWRzjA6SkUMYb40SVaN1x7mmfGKp0jvexGcx+7y2La5zRZsYFZI6Qpypg=="], "math-intrinsics": ["math-intrinsics@1.1.0", "", {}, "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g=="], @@ -4066,6 +4066,8 @@ "mdast-util-frontmatter/escape-string-regexp": ["escape-string-regexp@5.0.0", "", {}, "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw=="], + "mermaid/marked": ["marked@16.4.1", "", { "bin": { "marked": "bin/marked.js" } }, "sha512-ntROs7RaN3EvWfy3EZi14H4YxmT6A5YvywfhO+0pm+cH/dnSQRmdAmoFIc3B9aiwTehyk7pESH4ofyBY+V5hZg=="], + "mermaid/uuid": ["uuid@11.1.0", "", { "bin": { 
"uuid": "dist/esm/bin/uuid" } }, "sha512-0/A9rDy9P7cJ+8w1c9WD9V//9Wj15Ce2MPz8Ri6032usz+NfePxx5AcN3bN+r6ZL6jEo066/yNYB3tn4pQEx+A=="], "mlly/pkg-types": ["pkg-types@1.3.1", "", { "dependencies": { "confbox": "^0.1.8", "mlly": "^1.7.4", "pathe": "^2.0.1" } }, "sha512-/Jm5M4RvtBFVkKWRu2BLUTNP8/M2a+UwuAX+ae4770q1qVGtfjG+WTCupoZixokjmHiry8uI+dlY8KXYV5HVVQ=="], diff --git a/cli/package.json b/cli/package.json index 135823c3ef..38a554cdbb 100644 --- a/cli/package.json +++ b/cli/package.json @@ -30,8 +30,8 @@ "dependencies": { "@codebuff/sdk": "workspace:*", "@gravity-ai/api": "^0.1.2", - "@opentui/core": "0.1.74", - "@opentui/react": "0.1.74", + "@opentui/core": "0.1.87", + "@opentui/react": "0.1.87", "@tanstack/react-query": "^5.90.12", "commander": "^14.0.1", "immer": "^10.1.3", diff --git a/cli/src/commands/command-registry.ts b/cli/src/commands/command-registry.ts index 0732ed3b7c..b5b81d5800 100644 --- a/cli/src/commands/command-registry.ts +++ b/cli/src/commands/command-registry.ts @@ -533,6 +533,30 @@ const ALL_COMMANDS: CommandDefinition[] = [ return { openChatHistory: true } }, }), + defineCommandWithArgs({ + name: 'interview', + handler: (params, args) => { + const trimmedArgs = args.trim() + + params.saveToHistory(params.inputValue.trim()) + clearInput(params) + + // If user provided text directly, send it immediately + if (trimmedArgs) { + params.sendMessage({ + content: buildInterviewPrompt(trimmedArgs), + agentMode: params.agentMode, + }) + setTimeout(() => { + params.scrollToLatest() + }, 0) + return + } + + // Otherwise enter interview mode + useChatStore.getState().setInputMode('interview') + }, + }), defineCommandWithArgs({ name: 'plan', handler: (params, args) => { @@ -572,30 +596,6 @@ const ALL_COMMANDS: CommandDefinition[] = [ useChatStore.getState().setInputMode('plan') }, }), - defineCommandWithArgs({ - name: 'interview', - handler: (params, args) => { - const trimmedArgs = args.trim() - - params.saveToHistory(params.inputValue.trim()) - 
clearInput(params) - - // If user provided text directly, send it immediately - if (trimmedArgs) { - params.sendMessage({ - content: buildInterviewPrompt(trimmedArgs), - agentMode: params.agentMode, - }) - setTimeout(() => { - params.scrollToLatest() - }, 0) - return - } - - // Otherwise enter interview mode - useChatStore.getState().setInputMode('interview') - }, - }), defineCommandWithArgs({ name: 'review', handler: (params, args) => { diff --git a/cli/src/components/blocks/agent-branch-item.tsx b/cli/src/components/blocks/agent-branch-item.tsx index 67f6b6d6b5..95a9dafda8 100644 --- a/cli/src/components/blocks/agent-branch-item.tsx +++ b/cli/src/components/blocks/agent-branch-item.tsx @@ -8,6 +8,7 @@ import { MAX_COLLAPSED_LINES, truncateToLines } from '../../utils/strings' import { BORDER_CHARS } from '../../utils/ui-constants' import { Button } from '../button' import { CollapseButton } from '../collapse-button' +import { ShimmerText } from '../shimmer-text' interface AgentBranchItemProps { name: string @@ -286,6 +287,20 @@ export const AgentBranchItem = memo((props: AgentBranchItemProps) => { {onToggle && } )} + {isStreaming && isExpanded && ( + + + + )} ) diff --git a/cli/src/data/slash-commands.ts b/cli/src/data/slash-commands.ts index 283e8195ee..50dd90f0d2 100644 --- a/cli/src/data/slash-commands.ts +++ b/cli/src/data/slash-commands.ts @@ -123,6 +123,11 @@ const ALL_SLASH_COMMANDS: SlashCommand[] = [ description: 'Subscribe to get more usage', aliases: ['strong', 'sub', 'buy-credits'], }, + { + id: 'interview', + label: 'interview', + description: 'AI asks a series of questions to flesh out request into a spec', + }, { id: 'plan', label: 'plan', @@ -133,11 +138,6 @@ const ALL_SLASH_COMMANDS: SlashCommand[] = [ label: 'review', description: 'Review code changes with GPT 5.4', }, - { - id: 'interview', - label: 'interview', - description: 'AI asks a series of questions to flesh out request into a spec', - }, { id: 'new', label: 'new', diff --git 
a/cli/src/utils/constants.ts b/cli/src/utils/constants.ts index faae7ac15c..775778be97 100644 --- a/cli/src/utils/constants.ts +++ b/cli/src/utils/constants.ts @@ -37,8 +37,7 @@ export const COLLAPSED_BY_DEFAULT_AGENT_IDS = [ 'code-reviewer-selector', 'thinker-selector', 'best-of-n-selector', - 'commander', - 'commander-lite', + 'basher', 'code-searcher', 'directory-lister', 'glob-matcher', diff --git a/cli/src/utils/sdk-event-handlers.ts b/cli/src/utils/sdk-event-handlers.ts index 6648cea2b7..6f3b94649d 100644 --- a/cli/src/utils/sdk-event-handlers.ts +++ b/cli/src/utils/sdk-event-handlers.ts @@ -371,7 +371,7 @@ const updateSpawnAgentBlocks = ( if (result?.value) { const { content, hasError } = extractSpawnAgentResultContent(result.value) - // Preserve streamed content (agents like commander stream their output) + // Preserve streamed content (agents like basher stream their output) const hasStreamedContent = block.blocks.length > 0 if (hasError || content || hasStreamedContent) { return { diff --git a/common/src/constants/free-agents.ts b/common/src/constants/free-agents.ts index 90eab2c6bf..2f44ca8a9a 100644 --- a/common/src/constants/free-agents.ts +++ b/common/src/constants/free-agents.ts @@ -30,7 +30,7 @@ export const FREE_MODE_AGENT_MODELS: Record> = { 'researcher-docs': new Set(['google/gemini-3.1-flash-lite-preview']), // Command execution - 'commander-lite': new Set(['google/gemini-3.1-flash-lite-preview']), + 'basher': new Set(['google/gemini-3.1-flash-lite-preview']), // Editor for free mode 'editor-lite': new Set(['minimax/minimax-m2.5']), diff --git a/common/src/tools/params/tool/spawn-agents.ts b/common/src/tools/params/tool/spawn-agents.ts index fd126845ff..6c7f2b16cb 100644 --- a/common/src/tools/params/tool/spawn-agents.ts +++ b/common/src/tools/params/tool/spawn-agents.ts @@ -37,11 +37,11 @@ The prompt field is a simple string, while params is a JSON object that gets val Each agent available is already defined as another tool, or, dynamically 
defined later in the conversation. -**IMPORTANT**: \`agent_type\` must be an actual agent name (e.g., \`commander\`, \`code-searcher\`, \`opus-agent\`), NOT a tool name like \`read_files\`, \`str_replace\`, \`code_search\`, etc. If you need to call a tool, use it directly as a tool call instead of wrapping it in spawn_agents. +**IMPORTANT**: \`agent_type\` must be an actual agent name (e.g., \`basher\`, \`code-searcher\`, \`opus-agent\`), NOT a tool name like \`read_files\`, \`str_replace\`, \`code_search\`, etc. If you need to call a tool, use it directly as a tool call instead of wrapping it in spawn_agents. You can call agents either as direct tool calls (e.g., \`example-agent\`) or use \`spawn_agents\`. Both formats work, but **prefer using spawn_agents** because it allows you to spawn multiple agents in parallel for better performance. Both use the same schema with nested \`prompt\` and \`params\` fields. -**IMPORTANT**: Many agents have REQUIRED fields in their params schema. Check the agent's schema before spawning - if params has required fields, you MUST include them in the params object. For example, code-searcher requires \`searchQueries\`, commander requires \`command\`. +**IMPORTANT**: Many agents have REQUIRED fields in their params schema. Check the agent's schema before spawning - if params has required fields, you MUST include them in the params object. For example, code-searcher requires \`searchQueries\`, basher requires \`command\`. Example: ${$getNativeToolCallExampleString({ @@ -50,7 +50,7 @@ ${$getNativeToolCallExampleString({ input: { agents: [ { - agent_type: 'commander', + agent_type: 'basher', prompt: 'Check if tests pass', params: { command: 'npm test', diff --git a/docs/architecture.md b/docs/architecture.md index 7e2adb3e89..4c60d4ae22 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -92,7 +92,7 @@ Prompt-based and programmatic agent definitions that ship with Codebuff. 
- `reviewer/` — Code review agent with multi-prompt variant - `researcher/` — Web search and docs search agents - `general-agent/` — General-purpose agents (opus-agent, gpt-5-agent) - - `commander.ts` / `commander-lite.ts` — Terminal command execution agents + - `basher.ts` — Terminal command execution agent (id: 'basher', displayName: 'Basher') - `context-pruner.ts` — Conversation summarization to manage context length - **Depends on:** `common` (for agent definition types and tool params) diff --git a/freebuff/e2e/agent/freebuff-tester.ts b/freebuff/e2e/agent/freebuff-tester.ts index a58d6dfb49..e4cf221423 100644 --- a/freebuff/e2e/agent/freebuff-tester.ts +++ b/freebuff/e2e/agent/freebuff-tester.ts @@ -43,7 +43,7 @@ through tmux tools. Follow these steps: Key things to verify: - The CLI starts without errors or crashes -- Branding shows "Freebuff" (not "Codebuff") +- The startup screen has visible content (non-empty output) - Commands work as expected - Error messages are user-friendly diff --git a/freebuff/e2e/tests/ads-behavior.e2e.test.ts b/freebuff/e2e/tests/ads-behavior.e2e.test.ts index 1ba9fe4d4e..5876d51bea 100644 --- a/freebuff/e2e/tests/ads-behavior.e2e.test.ts +++ b/freebuff/e2e/tests/ads-behavior.e2e.test.ts @@ -15,10 +15,11 @@ describe('Freebuff: Ads Behavior', () => { }) test( - 'ads:enable command is not available', + 'ads commands are not available', async () => { const binary = requireFreebuffBinary() - session = await FreebuffSession.start(binary, { waitSeconds: 5 }) + session = await FreebuffSession.start(binary) + await session.waitForReady() // Type "/ads" to check for ads commands in autocomplete await session.send('/ads', { noEnter: true }) @@ -32,46 +33,17 @@ describe('Freebuff: Ads Behavior', () => { ) test( - 'ads:disable command is not available', + 'startup screen does not show ad-related UI', async () => { const binary = requireFreebuffBinary() - session = await FreebuffSession.start(binary, { waitSeconds: 5 }) + session = await 
FreebuffSession.start(binary) + await session.waitForReady() - // Try to send the /ads:disable command - await session.send('/ads:disable') - const output = await session.capture(3) - - // The command should not be recognized - // It should NOT show "Ads disabled" confirmation - expect(output).not.toMatch(/ads disabled/i) - }, - TEST_TIMEOUT, - ) - - test( - 'does not show credits earned from ads', - async () => { - const binary = requireFreebuffBinary() - session = await FreebuffSession.start(binary, { waitSeconds: 5 }) const output = await session.capture() - // In Freebuff, ads don't show "+X credits" because credits don't apply - // Check the startup screen doesn't mention ad credits + // Ads are always enabled in Freebuff — no credits or toggle UI expect(output).not.toMatch(/\+\d+ credits/) - }, - TEST_TIMEOUT, - ) - - test( - 'does not show "Hide ads" option', - async () => { - const binary = requireFreebuffBinary() - session = await FreebuffSession.start(binary, { waitSeconds: 5 }) - const output = await session.capture() - - // In Freebuff, the "Hide ads" link is not shown because ads are mandatory expect(output).not.toContain('Hide ads') - // Also should not mention /ads:enable as a way to re-enable expect(output).not.toContain('/ads:enable') }, TEST_TIMEOUT, diff --git a/freebuff/e2e/tests/agent-startup.e2e.test.ts b/freebuff/e2e/tests/agent-startup.e2e.test.ts index 6d436758a8..04a10e7332 100644 --- a/freebuff/e2e/tests/agent-startup.e2e.test.ts +++ b/freebuff/e2e/tests/agent-startup.e2e.test.ts @@ -60,9 +60,8 @@ describe('Freebuff: Agent-driven E2E', () => { prompt: 'Start Freebuff using the start_freebuff tool. Then capture the output ' + 'with capture_freebuff_output (waitSeconds: 3). Verify that:\n' + - '1. The CLI started without errors\n' + - '2. The output contains "freebuff" (case-insensitive)\n' + - '3. The output does NOT contain "codebuff" (case-insensitive)\n' + + '1. 
The CLI started without errors (no FATAL, panic, or crash messages)\n' + + '2. The output has visible content (not a blank screen)\n' + 'Finally, call stop_freebuff to clean up. Report your findings.', agentDefinitions: [freebuffTesterAgent], customToolDefinitions: tmuxTools.tools, diff --git a/freebuff/e2e/tests/code-edit.e2e.test.ts b/freebuff/e2e/tests/code-edit.e2e.test.ts index 957ccac7f9..e95f09a7cf 100644 --- a/freebuff/e2e/tests/code-edit.e2e.test.ts +++ b/freebuff/e2e/tests/code-edit.e2e.test.ts @@ -52,11 +52,14 @@ describe('Freebuff: Code Edit', () => { initialFiles: { 'index.js': initialContent }, }) + // Wait for the CLI to be fully ready before sending input + await session.waitForReady() + // Verify the file was created expect(session.readFile('index.js')).toBe(initialContent) // Send a prompt asking freebuff to add a console.log - await session.send("Add a console.log('hello world') to index.js") + await session.send('Add console.log("hello world") to index.js') // Wait for the file to be modified with the console.log const finalContent = await session.waitForFileContent( diff --git a/freebuff/e2e/tests/help-command.e2e.test.ts b/freebuff/e2e/tests/help-command.e2e.test.ts index 173a3425b8..7c93d795f1 100644 --- a/freebuff/e2e/tests/help-command.e2e.test.ts +++ b/freebuff/e2e/tests/help-command.e2e.test.ts @@ -47,7 +47,8 @@ describe('Freebuff: /help slash command', () => { 'shows help content when /help is entered', async () => { const binary = requireFreebuffBinary() - session = await FreebuffSession.start(binary, { waitSeconds: 5 }) + session = await FreebuffSession.start(binary) + await session.waitForReady() await session.send('/help') const output = await session.capture(2) @@ -62,7 +63,8 @@ describe('Freebuff: /help slash command', () => { 'does not show subscription commands in help', async () => { const binary = requireFreebuffBinary() - session = await FreebuffSession.start(binary, { waitSeconds: 5 }) + session = await 
FreebuffSession.start(binary) + await session.waitForReady() await session.send('/help') const output = await session.capture(2) diff --git a/freebuff/e2e/tests/knowledge-file.e2e.test.ts b/freebuff/e2e/tests/knowledge-file.e2e.test.ts new file mode 100644 index 0000000000..4d28cebd4b --- /dev/null +++ b/freebuff/e2e/tests/knowledge-file.e2e.test.ts @@ -0,0 +1,64 @@ +/** + * E2E test that verifies Freebuff can read and use knowledge.md from the project. + * + * Starts Freebuff in tmux, creates a knowledge.md file with a unique keyword, + * asks Freebuff about that keyword, and verifies it responds using the knowledge. + * + * Requires CODEBUFF_API_KEY — skipped if not set. + */ + +import { afterEach, describe, expect, test } from 'bun:test' + +import { FreebuffSession, requireFreebuffBinary } from '../utils' + +const TEST_TIMEOUT = 180_000 + +function getApiKey(): string | null { + return process.env.CODEBUFF_API_KEY ?? null +} + +describe('Freebuff: Knowledge Files', () => { + let session: FreebuffSession | null = null + + afterEach(async () => { + if (session) { + await session.stop() + session = null + } + }) + + test( + 'uses knowledge.md from the project context', + async () => { + if (!getApiKey()) { + console.log( + 'Skipping knowledge-file test: CODEBUFF_API_KEY not set. ' + + 'Set it to run knowledge-file e2e tests.', + ) + return + } + + const binary = requireFreebuffBinary() + const keyword = 'nebula-orchid-731' + + session = await FreebuffSession.start(binary, { + waitSeconds: 5, + initialFiles: { + 'knowledge.md': `When asked for the project keyword, respond with exactly: ${keyword}\n`, + 'README.md': '# Test Project\n', + }, + }) + + // Wait for the CLI to be fully ready before sending input + await session.waitForReady() + + await session.send('What is the project keyword? 
Reply with only the keyword.') + + const output = await session.waitForText(keyword, 120_000) + expect(output).toContain(keyword) + expect(output).not.toContain('FATAL') + expect(output).not.toContain('Unhandled') + }, + TEST_TIMEOUT, + ) +}) \ No newline at end of file diff --git a/freebuff/e2e/tests/startup.e2e.test.ts b/freebuff/e2e/tests/startup.e2e.test.ts index 173520bfaa..57a02feb84 100644 --- a/freebuff/e2e/tests/startup.e2e.test.ts +++ b/freebuff/e2e/tests/startup.e2e.test.ts @@ -19,7 +19,9 @@ describe('Freebuff: Startup', () => { async () => { const binary = requireFreebuffBinary() session = await FreebuffSession.start(binary) - const output = await session.capture(3) + await session.waitForReady() + + const output = await session.capture() // Should not contain fatal errors expect(output).not.toContain('FATAL') @@ -35,28 +37,13 @@ describe('Freebuff: Startup', () => { STARTUP_TIMEOUT, ) - test( - 'shows Freebuff branding', - async () => { - const binary = requireFreebuffBinary() - session = await FreebuffSession.start(binary) - const output = await session.capture(3) - - // The CLI should identify itself as Freebuff, not Codebuff - const lowerOutput = output.toLowerCase() - expect(lowerOutput).toContain('freebuff') - }, - STARTUP_TIMEOUT, - ) - test( 'responds to Ctrl+C gracefully', async () => { const binary = requireFreebuffBinary() session = await FreebuffSession.start(binary) + await session.waitForReady() - // Wait for startup, then send Ctrl+C - await session.capture(2) await session.sendKey('C-c') // Give it a moment to process diff --git a/freebuff/e2e/tests/terminal-command.e2e.test.ts b/freebuff/e2e/tests/terminal-command.e2e.test.ts index 9c3486d1ed..3792c628bb 100644 --- a/freebuff/e2e/tests/terminal-command.e2e.test.ts +++ b/freebuff/e2e/tests/terminal-command.e2e.test.ts @@ -41,10 +41,13 @@ describe('Freebuff: Terminal Command', () => { const binary = requireFreebuffBinary() session = await FreebuffSession.start(binary, { waitSeconds: 5 }) 
+ // Wait for the CLI to be fully ready before sending input + await session.waitForReady() + // Ask freebuff to run a shell command whose output can only come from // actual terminal execution (not file-writing tools) await session.send( - 'Use the terminal to run: date +%s > timestamp.txt && echo done', + 'Execute a shell command in the terminal to write the current Unix timestamp in seconds to timestamp.txt', ) // Wait for the file to be created by the terminal command diff --git a/freebuff/e2e/utils/freebuff-session.ts b/freebuff/e2e/utils/freebuff-session.ts index 5521534434..d2c5633086 100644 --- a/freebuff/e2e/utils/freebuff-session.ts +++ b/freebuff/e2e/utils/freebuff-session.ts @@ -103,9 +103,33 @@ export class FreebuffSession { } catch { // ignore } + const terminalOutput = await this.capture() throw new Error( `Timed out after ${timeoutMs}ms waiting for "${pattern}" in ${relativePath}.\n` + - `Last content:\n${finalContent}`, + `Last content:\n${finalContent}\n` + + `Terminal output:\n${terminalOutput}`, + ) + } + + /** + * Wait for the CLI to be fully initialized and ready for input. + * Polls terminal output until enough non-empty lines are visible, + * indicating the TUI has rendered its initial layout. 
+ */ + async waitForReady(timeoutMs = 30_000, minLines = 5): Promise<void> { + const start = Date.now() + while (Date.now() - start < timeoutMs) { + const output = await this.capture() + const nonEmptyLines = output + .split('\n') + .filter((line) => line.trim().length > 0) + if (nonEmptyLines.length >= minLines) return + await new Promise((resolve) => setTimeout(resolve, 250)) + } + const finalOutput = await this.capture() + throw new Error( + `Timed out after ${timeoutMs}ms waiting for CLI to be ready.\n` + + `Last output:\n${finalOutput}`, ) } diff --git a/freebuff/e2e/utils/tmux-custom-tools.ts b/freebuff/e2e/utils/tmux-custom-tools.ts index 92af618934..f37fae014d 100644 --- a/freebuff/e2e/utils/tmux-custom-tools.ts +++ b/freebuff/e2e/utils/tmux-custom-tools.ts @@ -54,7 +54,8 @@ export function createFreebuffTmuxTools(binaryPath: string): { ] } session = await FreebuffSession.start(binaryPath) - const initialOutput = await session.capture(2) + await session.waitForReady() + const initialOutput = await session.capture() return [ { type: 'json', diff --git a/freebuff/package.json b/freebuff/package.json index 03fb9d35e4..8ca95f2f6d 100644 --- a/freebuff/package.json +++ b/freebuff/package.json @@ -14,6 +14,7 @@ "e2e:ads": "bun test e2e/tests/ads-behavior.e2e.test.ts", "e2e:agent": "bun test e2e/tests/agent-startup.e2e.test.ts", "e2e:code-edit": "bun test e2e/tests/code-edit.e2e.test.ts", - "e2e:terminal-command": "bun test e2e/tests/terminal-command.e2e.test.ts" + "e2e:terminal-command": "bun test e2e/tests/terminal-command.e2e.test.ts", + "e2e:knowledge-file": "bun test e2e/tests/knowledge-file.e2e.test.ts" } } diff --git a/packages/agent-runtime/src/tools/handlers/tool/spawn-agent-utils.ts b/packages/agent-runtime/src/tools/handlers/tool/spawn-agent-utils.ts index ae24c9287d..77dac6b366 100644 --- a/packages/agent-runtime/src/tools/handlers/tool/spawn-agent-utils.ts +++ b/packages/agent-runtime/src/tools/handlers/tool/spawn-agent-utils.ts @@ -162,70 +162,6 @@ export
function getMatchingSpawn( return null } -/** - * Synchronously transforms spawn_agents input to use 'commander-lite' instead of 'commander' - * when the parent agent doesn't have access to 'commander' but does have access to 'commander-lite'. - * This should be called BEFORE the tool call is streamed to the UI. - */ -export function transformSpawnAgentsInput( - input: Record, - spawnableAgents: AgentTemplateType[], -): Record { - const agents = input.agents - if (!Array.isArray(agents)) { - return input - } - - let hasTransformation = false - const transformedAgents = agents.map((agent) => { - if (typeof agent !== 'object' || agent === null) { - return agent - } - - const agentEntry = agent as Record - const agentTypeStr = agentEntry.agent_type - if (typeof agentTypeStr !== 'string') { - return agent - } - - // Check if this is 'commander' - const { agentId } = parseAgentId(agentTypeStr) - if (agentId !== 'commander') { - return agent - } - - // Check if 'commander' is available in spawnableAgents - const commanderType = getMatchingSpawn(spawnableAgents, agentTypeStr) - if (commanderType) { - // Commander is available, no transformation needed - return agent - } - - // Check if 'commander-lite' is available as a fallback - const commanderLiteType = getMatchingSpawn(spawnableAgents, 'commander-lite') - if (!commanderLiteType) { - // Neither available, let validation handle the error - return agent - } - - // Transform commander -> commander-lite - hasTransformation = true - return { - ...agentEntry, - agent_type: commanderLiteType, - } - }) - - if (!hasTransformation) { - return input - } - - return { - ...input, - agents: transformedAgents, - } -} - /** * Validates agent template and permissions */ diff --git a/packages/agent-runtime/src/tools/tool-executor.ts b/packages/agent-runtime/src/tools/tool-executor.ts index 23d2e7880d..ad527e0932 100644 --- a/packages/agent-runtime/src/tools/tool-executor.ts +++ b/packages/agent-runtime/src/tools/tool-executor.ts @@ -9,7 
+9,6 @@ import { getAgentShortName } from '../templates/prompts' import { codebuffToolHandlers } from './handlers/list' import { getMatchingSpawn, - transformSpawnAgentsInput, } from './handlers/tool/spawn-agent-utils' import { getAgentTemplate } from '../templates/agent-registry' import { ensureZodSchema } from './prompts' @@ -192,18 +191,11 @@ export async function executeToolCall( return previousToolCallFinished } - // Transform spawn_agents input to use commander-lite fallback before streaming - // This ensures the UI shows the correct agent type from the start - const transformedInput = - toolName === 'spawn_agents' - ? transformSpawnAgentsInput(input, agentTemplate.spawnableAgents) - : input - // TODO: Allow tools to provide a validation function, and move this logic into the spawn_agents validation function. // Pre-validate spawn_agents to filter out non-existent agents before streaming - let effectiveInput = transformedInput + let effectiveInput = input if (toolName === 'spawn_agents') { - const agents = (transformedInput as Record<string, unknown>).agents + const agents = (input as Record<string, unknown>).agents if (Array.isArray(agents)) { const BASE_AGENTS = [ 'base', @@ -284,7 +276,7 @@ } const errorMsg = `Some agents could not be spawned: ${errors.join('; ')}. Proceeding with valid agents only.` onResponseChunk({ type: 'error', message: errorMsg }) - effectiveInput = { ...transformedInput, agents: validAgents } + effectiveInput = { ...input, agents: validAgents } } } } diff --git a/scripts/test-canopywave-long.ts b/scripts/test-canopywave-long.ts new file mode 100644 index 0000000000..154e08ea76 --- /dev/null +++ b/scripts/test-canopywave-long.ts @@ -0,0 +1,391 @@ +#!/usr/bin/env bun + +/** + * Test script to verify CanopyWave prompt caching across a 10-turn conversation. + * + * Uses a very large system prompt (~5k+ input tokens) with low output (max 100 tokens) + * to measure how well CanopyWave caches the shared prefix across turns.
+ * + * Usage: + * bun scripts/test-canopywave-long.ts + */ + +export { } + +const CANOPYWAVE_BASE_URL = 'https://inference.canopywave.io/v1' +const CANOPYWAVE_MODEL = 'minimax/minimax-m2.5' + +// Pricing constants — same model as Fireworks/SiliconFlow +const INPUT_COST_PER_TOKEN = 0.30 / 1_000_000 +const CACHED_INPUT_COST_PER_TOKEN = 0.03 / 1_000_000 +const OUTPUT_COST_PER_TOKEN = 1.20 / 1_000_000 + +const MAX_TOKENS = 100 + +function computeCost(usage: Record<string, unknown>): { cost: number; breakdown: string } { + const inputTokens = typeof usage.prompt_tokens === 'number' ? usage.prompt_tokens : 0 + const outputTokens = typeof usage.completion_tokens === 'number' ? usage.completion_tokens : 0 + const promptDetails = usage.prompt_tokens_details as Record<string, unknown> | undefined + const cachedTokens = typeof promptDetails?.cached_tokens === 'number' ? promptDetails.cached_tokens : 0 + const nonCachedInput = Math.max(0, inputTokens - cachedTokens) + + const inputCost = nonCachedInput * INPUT_COST_PER_TOKEN + const cachedCost = cachedTokens * CACHED_INPUT_COST_PER_TOKEN + const outputCost = outputTokens * OUTPUT_COST_PER_TOKEN + const totalCost = inputCost + cachedCost + outputCost + + const breakdown = [ + `${nonCachedInput} non-cached input × $0.30/M = $${inputCost.toFixed(8)}`, + `${cachedTokens} cached input × $0.03/M = $${cachedCost.toFixed(8)}`, + `${outputTokens} output × $1.20/M = $${outputCost.toFixed(8)}`, + `Total: $${totalCost.toFixed(8)}`, + ].join('\n ') + + return { cost: totalCost, breakdown } +} + +// Very large system prompt to push input tokens to ~5k+ +// Random seed to prevent cache hits on repeated runs +const SEED_STRING = `Seed: ${Math.random().toString(36).slice(2, 10)}` + +const SYSTEM_PROMPT = `You are an expert software architect, technical writer, and senior engineering consultant. +${SEED_STRING} +You always respond with brief, concise answers — one or two sentences at most. +You provide practical advice grounded in real-world engineering experience.
+ +Your areas of expertise include: +- Distributed systems design and architecture patterns (microservices, event-driven, CQRS, saga patterns, choreography vs orchestration, bulkhead pattern, circuit breaker, retry with exponential backoff, sidecar pattern, ambassador pattern, strangler fig pattern, anti-corruption layer) +- Database design and optimization (relational databases including PostgreSQL, MySQL, SQL Server; document databases including MongoDB, CouchDB, DynamoDB; graph databases including Neo4j, ArangoDB, JanusGraph; time-series databases including InfluxDB, TimescaleDB, QuestDB; wide-column stores including Cassandra, ScyllaDB, HBase; sharding strategies including hash-based, range-based, geographic; replication topologies including primary-replica, multi-primary, chain replication; connection pooling with PgBouncer, ProxySQL; query optimization techniques including index selection, query plan analysis, materialized views, covering indexes, partial indexes, expression indexes) +- Cloud infrastructure and deployment (AWS services including EC2, ECS, EKS, Lambda, S3, DynamoDB, RDS, Aurora, ElastiCache, CloudFront, Route53, IAM, VPC, SQS, SNS, Kinesis, Step Functions; GCP services including GKE, Cloud Run, Cloud Functions, BigQuery, Spanner, Pub/Sub, Cloud Storage; Azure services including AKS, Azure Functions, Cosmos DB, Azure SQL; container orchestration with Kubernetes including deployments, stateful sets, daemon sets, jobs, CronJobs, custom resource definitions, operators, Helm charts, Kustomize; infrastructure as code with Terraform, Pulumi, CloudFormation, CDK; service mesh with Istio, Linkerd, Consul Connect; load balancers including ALB, NLB, HAProxy, Nginx, Envoy; auto-scaling including HPA, VPA, KEDA, cluster autoscaler) +- Programming languages and their ecosystems (TypeScript/JavaScript with Node.js, Deno, Bun; Python with FastAPI, Django, Flask, SQLAlchemy, Pydantic; Rust with Tokio, Actix, Axum, Serde; Go with Gin, Echo, GORM; Java with 
Spring Boot, Quarkus, Micronaut, Hibernate; C++ with Boost, gRPC, Abseil; Kotlin with Ktor, Spring; Scala with Akka, ZIO, Cats Effect; Elixir with Phoenix, Ecto, LiveView; Haskell with Servant, Yesod, Persistent) +- API design principles (REST architectural constraints, Richardson Maturity Model, HATEOAS, content negotiation; GraphQL including schema design, resolvers, DataLoader, subscriptions, federation; gRPC including protobuf schema design, streaming patterns, interceptors, deadline propagation; WebSocket patterns for real-time communication; Server-Sent Events for unidirectional streaming; OpenAPI/Swagger specification; API versioning strategies including URL path, header, query parameter; pagination patterns including cursor-based, offset, keyset; rate limiting algorithms including token bucket, leaky bucket, sliding window; API gateway patterns) +- Security best practices (authentication protocols including OAuth 2.0, OIDC, SAML, WebAuthn, FIDO2; authorization models including RBAC, ABAC, ReBAC, PBAC; encryption at rest with AES-256, at transit with TLS 1.3; OWASP Top 10 including injection, broken authentication, sensitive data exposure, XXE, broken access control, security misconfiguration, XSS, insecure deserialization, known vulnerabilities, insufficient logging; Content Security Policy headers; CORS configuration; DDoS mitigation with WAF, rate limiting, geo-blocking; secret management with HashiCorp Vault, AWS Secrets Manager, GCP Secret Manager; certificate management including Let's Encrypt, cert-manager, mTLS; supply chain security with SBOM, Sigstore, dependency scanning) +- Performance optimization and profiling (caching strategies including write-through, write-behind, read-through, cache-aside, refresh-ahead; cache invalidation patterns; CDN configuration with CloudFront, Fastly, Cloudflare; connection pooling for HTTP, database, Redis; async patterns including event loops, worker threads, thread pools, coroutines; WebAssembly for 
compute-intensive operations; JIT compilation optimization; memory profiling with heap snapshots, allocation tracking; CPU profiling with flame graphs, perf, async-profiler; load testing with k6, Locust, Artillery, Gatling; performance budgets and real user monitoring) +- Testing methodologies (unit testing with Jest, Vitest, pytest, Go testing; integration testing with Testcontainers, Docker Compose; end-to-end testing with Playwright, Cypress, Selenium; property-based testing with fast-check, Hypothesis, QuickCheck; mutation testing with Stryker, PITest; snapshot testing; contract testing with Pact, Spring Cloud Contract; chaos engineering with Chaos Monkey, Litmus, Gremlin; load testing; fuzz testing with AFL, LibFuzzer; visual regression testing; accessibility testing) +- CI/CD pipelines and DevOps practices (GitHub Actions workflows, Jenkins pipelines, GitLab CI, CircleCI; ArgoCD for GitOps; deployment strategies including blue-green, canary, rolling update, recreate; feature flag systems with LaunchDarkly, Flagsmith, Unleash; trunk-based development; semantic versioning and conventional commits; artifact management with Artifactory, Nexus, ECR, GCR; infrastructure pipeline including Terraform plan/apply, drift detection; security scanning in CI including SAST, DAST, SCA, secret scanning; release management including changelogs, release notes, semantic-release) +- Monitoring and observability (metrics collection with Prometheus, StatsD, Datadog; visualization with Grafana, Kibana; distributed tracing with Jaeger, Zipkin, Tempo, OpenTelemetry; log aggregation with Elasticsearch, Loki, CloudWatch; alerting with PagerDuty, OpsGenie, VictorOps; SLO/SLI definition and error budgets; synthetic monitoring; real user monitoring; custom business metrics; incident management processes; postmortem culture; runbook automation) +- Data engineering and analytics (stream processing with Apache Kafka, Flink, Spark Streaming, Kinesis; batch processing with Spark, Hadoop, dbt; 
data warehousing with Snowflake, BigQuery, Redshift, ClickHouse; data lake architecture with Delta Lake, Apache Iceberg, Apache Hudi; ETL/ELT patterns; data quality frameworks with Great Expectations, dbt tests; schema evolution and backward compatibility; data governance and lineage tracking; real-time analytics with materialized views, OLAP cubes) +- Machine learning operations (model serving with TensorFlow Serving, TorchServe, Triton; MLOps pipelines with MLflow, Kubeflow, Metaflow; feature stores with Feast, Tecton; model monitoring for drift detection; A/B testing for ML models; experiment tracking; model versioning and registry; GPU cluster management; inference optimization with quantization, pruning, distillation) + +When providing responses, you follow these conventions: +- Keep answers extremely brief — one or two sentences maximum +- Be direct and actionable +- Use concrete examples over abstract advice +- Reference specific tools, libraries, or patterns by name + +Additional context for this conversation: +- We are working on a high-traffic web application that serves 50 million requests per day across 3 regions +- The system needs to handle bursty traffic patterns with 10x spikes during peak hours and flash sales +- Data consistency is important but eventual consistency is acceptable for most read paths with a 5-second staleness budget +- The team is experienced with TypeScript and Node.js but open to other technologies for specific use cases +- We use PostgreSQL 16 as our primary database with logical replication to read replicas and Redis 7 Cluster for caching +- The application is deployed on Kubernetes 1.29 in a multi-region setup across US-East-1, US-West-2, and EU-West-1 +- We need to maintain 99.95% uptime SLA with a target p99 latency of 150ms for API endpoints and 50ms for cached reads +- Cost optimization is a secondary concern after reliability and developer experience, but we spend $2.5M/year on infrastructure +- The codebase is 
approximately 750k lines of TypeScript across 80+ microservices with an additional 200k lines of Python for ML services +- We use an event-driven architecture with Kafka (3 clusters, 500+ topics) for inter-service communication with exactly-once semantics +- All services expose both REST (OpenAPI 3.1) and gRPC (protobuf v3) endpoints with automatic code generation +- We have a comprehensive monitoring stack with Prometheus (50M time series), Grafana (200+ dashboards), Jaeger, and PagerDuty +- Database migrations are managed with Drizzle ORM with automated rollback capabilities and zero-downtime schema changes +- The frontend is a Next.js 15 application with React Server Components, streaming SSR, and partial prerendering +- We use feature flags extensively via LaunchDarkly with 500+ active flags and automated cleanup for stale flags +- The CI/CD pipeline runs 5000+ tests (unit, integration, e2e) with a target of under 8 minutes using distributed execution on BuildKite +- We practice trunk-based development with short-lived feature branches, PR previews, and automated merge queues +- The team consists of 60 engineers across 10 squads, each owning 5-12 services with clear domain boundaries +- We use a mono-repo structure managed with Turborepo and Bun workspaces with remote caching +- All inter-service communication uses Protocol Buffers for serialization with a shared schema registry and backward compatibility enforcement +- We have a custom API gateway built on Envoy that handles authentication, rate limiting, request routing, and observability injection +- The system processes approximately 100TB of data per day through our analytics pipeline (Kafka → Flink → ClickHouse + BigQuery) +- Mobile clients communicate via a BFF (Backend for Frontend) layer with GraphQL federation across 12 subgraphs +- We have a custom feature flag evaluation engine that supports complex targeting rules including percentage rollouts, user segments, and geographic targeting +- The 
deployment pipeline supports multi-region blue-green deployments with automated rollback on SLO violation detection +- We use HashiCorp Vault for secret management with automatic rotation policies for database credentials, API keys, and certificates +- Our observability stack includes custom instrumentation for business metrics including revenue, conversion, engagement, and error rates +- The team follows an RFC process for architectural decisions with ADRs stored in the repo and reviewed by the architecture guild +- We have a dedicated platform team of 8 engineers that maintains shared infrastructure, developer tooling, and internal SDKs +- All services implement health checks (liveness + readiness), graceful shutdown handlers, and circuit breakers via a shared middleware library +- We use PgBouncer in transaction mode for PostgreSQL connection pooling (max 500 connections per region) and Redis Cluster with 6 shards per region +- The system supports multi-tenancy with tenant isolation at the database level using row-level security and per-tenant connection pools +- We have a custom schema registry for Kafka topic schemas with backward/forward compatibility validation and automated consumer migration +- Our error handling follows a structured error taxonomy with 200+ error codes, retry policies, and dead-letter queues for unprocessable messages +- We use structured logging with JSON format, correlation IDs, and trace context propagation across all services via OpenTelemetry +- The frontend uses a design system with 300+ components maintained by a dedicated UI platform team with visual regression testing via Chromatic +- We have automated performance regression testing that runs nightly against production-like data with 10% traffic replay +- Our incident response process includes automated runbook execution, escalation policies, and post-incident review within 48 hours +- We maintain a service catalog with dependency graphs, SLO definitions, on-call schedules, and 
cost attribution per service +- The platform supports A/B testing with Bayesian statistical significance calculations, multi-armed bandit allocation, and segment analysis +- We use GitOps for all infrastructure management with Terraform modules in a dedicated repo and Atlantis for plan/apply workflows +- Our security posture includes weekly penetration testing, continuous dependency scanning with Snyk, SAST with Semgrep, and DAST with OWASP ZAP +- We have a data mesh architecture for analytics with 15 domain-owned data products, each with defined SLAs and data contracts +- The system supports webhook delivery with at-least-once semantics, configurable retry policies (exponential backoff up to 24h), and delivery status tracking +- We use OpenTelemetry Collector for telemetry pipeline with custom processors for PII redaction, sampling, and cost-based routing +- Our caching strategy uses L1 (in-process LRU, 100MB per pod), L2 (Redis Cluster, 500GB), and L3 (CloudFront, 30+ edge locations) with coordinated invalidation +- We maintain backward compatibility for 3 API versions simultaneously with automated deprecation notices, usage tracking, and migration guides +- The platform includes a developer portal with API documentation, SDK generation, sandbox environments, and usage analytics +- We use Temporal for workflow orchestration across 20+ long-running business processes including order fulfillment, payment processing, and user onboarding +- Our ML platform serves 50+ models in production with A/B testing, shadow mode deployment, and automated retraining pipelines +- The search infrastructure uses Elasticsearch clusters with 500M+ documents, custom analyzers, and learning-to-rank models +- We have a notification system that delivers 10M+ messages daily across email, push, SMS, and in-app channels with template management and delivery optimization +- The billing system processes $50M+ in monthly transactions with Stripe integration, usage-based billing, and revenue 
recognition +- We use Crossplane for provisioning cloud resources as Kubernetes custom resources with drift detection and reconciliation +- Our edge computing layer uses Cloudflare Workers for geo-routing, A/B test assignment, and personalization at the edge +- The platform includes a custom query builder for internal dashboards that generates optimized SQL for ClickHouse and PostgreSQL +- We maintain a shared protobuf definition repository with 500+ message types, automated code generation for 6 languages, and breaking change detection` + +const TURN_PROMPTS = [ + 'Give a brief one-sentence answer: What is the single most important principle when designing distributed systems?', + 'Give a brief one-sentence answer: What is the biggest mistake teams make when adopting microservices?', + 'Give a brief one-sentence answer: When should you choose eventual consistency over strong consistency?', + 'Give a brief one-sentence answer: What is the most underrated database optimization technique?', + 'Give a brief one-sentence answer: What is the best approach to handle cascading failures in a microservice architecture?', + 'Give a brief one-sentence answer: When is it better to use gRPC over REST?', + 'Give a brief one-sentence answer: What is the most effective caching strategy for a read-heavy workload?', + 'Give a brief one-sentence answer: What is the key to successful trunk-based development at scale?', + 'Give a brief one-sentence answer: What metric best predicts production reliability?', + 'Give a brief one-sentence answer: What is the most important thing to get right in an observability stack?', +] + +interface ConversationMessage { + role: string + content: string +} + +interface TurnResult { + label: string + usage: Record | null + elapsedMs: number + outputTokens: number + ttftMs?: number + outputTokensPerSec?: number + responseContent: string +} + +async function makeConversationStreamRequest( + label: string, + apiKey: string, + conversationMessages: 
ConversationMessage[], +): Promise<TurnResult> { + console.log(`── ${label} (streaming) ──`) + const startTime = Date.now() + let ttftMs: number | undefined + + const response = await fetch(`${CANOPYWAVE_BASE_URL}/chat/completions`, { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + model: CANOPYWAVE_MODEL, + messages: conversationMessages, + max_tokens: MAX_TOKENS, + stream: true, + stream_options: { include_usage: true }, + }), + }) + + if (!response.ok) { + const errorText = await response.text() + console.error(`❌ CanopyWave streaming API returned ${response.status}: ${errorText}`) + return { label, usage: null, elapsedMs: Date.now() - startTime, outputTokens: 0, responseContent: '' } + } + + const reader = response.body?.getReader() + if (!reader) { + console.error('❌ No response body reader') + return { label, usage: null, elapsedMs: Date.now() - startTime, outputTokens: 0, responseContent: '' } + } + + const decoder = new TextDecoder() + let streamContent = '' + let chunkCount = 0 + let streamUsage: Record<string, unknown> | null = null + let firstContentChunkTime: number | undefined + + let done = false + while (!done) { + const result = await reader.read() + done = result.done + if (done) break + + const text = decoder.decode(result.value, { stream: true }) + const lines = text.split('\n').filter((l) => l.startsWith('data: ')) + + for (const line of lines) { + const raw = line.slice('data: '.length) + if (raw === '[DONE]') continue + + try { + const chunk = JSON.parse(raw) + chunkCount++ + const delta = chunk.choices?.[0]?.delta + if (delta?.content) { + if (firstContentChunkTime === undefined) { + firstContentChunkTime = Date.now() + ttftMs = firstContentChunkTime - startTime + } + streamContent += delta.content + } + if (delta?.reasoning_content) { + // Skip reasoning content for this test + } + if (chunk.usage) streamUsage = chunk.usage + } catch { + // skip non-JSON lines + } + } + } + + const
elapsedMs = Date.now() - startTime + const outputTokens = streamUsage && typeof streamUsage.completion_tokens === 'number' + ? streamUsage.completion_tokens + : 0 + + const generationTimeMs = firstContentChunkTime !== undefined + ? Date.now() - firstContentChunkTime + : elapsedMs + const outputTokensPerSec = generationTimeMs > 0 + ? (outputTokens / (generationTimeMs / 1000)) + : 0 + + // Print compact per-turn stats + const inputTokens = streamUsage && typeof streamUsage.prompt_tokens === 'number' ? streamUsage.prompt_tokens : 0 + const promptDetails = streamUsage?.prompt_tokens_details as Record | undefined + const cachedTokens = typeof promptDetails?.cached_tokens === 'number' ? promptDetails.cached_tokens : 0 + const cacheRate = inputTokens > 0 ? ((cachedTokens / inputTokens) * 100).toFixed(1) : '0.0' + const cost = streamUsage ? `$${computeCost(streamUsage).cost.toFixed(6)}` : 'err' + + console.log(` ✅ ${(elapsedMs / 1000).toFixed(2)}s | TTFT ${ttftMs !== undefined ? (ttftMs / 1000).toFixed(2) + 's' : 'n/a'} | ${inputTokens} in (${cachedTokens} cached, ${cacheRate}%) | ${outputTokens} out @ ${outputTokensPerSec.toFixed(1)} tok/s | ${cost}`) + console.log(` Response: ${streamContent.slice(0, 150)}${streamContent.length > 150 ? '...' : ''}`) + console.log() + + return { label, usage: streamUsage, elapsedMs, outputTokens, ttftMs, outputTokensPerSec, responseContent: streamContent } +} + +async function main() { + const apiKey = process.env.CANOPYWAVE_API_KEY + if (!apiKey) { + console.error('❌ CANOPYWAVE_API_KEY is not set. 
Add it to .env.local or pass it directly.') + process.exit(1) + } + + console.log('🧪 CanopyWave 10-Turn Conversation Caching Test') + console.log('='.repeat(60)) + console.log(`Model: ${CANOPYWAVE_MODEL}`) + console.log(`Base URL: ${CANOPYWAVE_BASE_URL}`) + console.log(`Max tokens: ${MAX_TOKENS} (low output per turn)`) + console.log(`Turns: ${TURN_PROMPTS.length}`) + console.log(`Pricing: $0.30/M input, $0.03/M cached, $1.20/M output`) + console.log('='.repeat(60)) + console.log() + + const conversationHistory: ConversationMessage[] = [ + { role: 'system', content: SYSTEM_PROMPT }, + ] + + const results: TurnResult[] = [] + + for (let i = 0; i < TURN_PROMPTS.length; i++) { + conversationHistory.push({ role: 'user', content: TURN_PROMPTS[i] }) + + const label = `Turn ${i + 1}/${TURN_PROMPTS.length}${i === 0 ? ' (cold)' : ''}` + const result = await makeConversationStreamRequest(label, apiKey, [...conversationHistory]) + results.push(result) + + if (result.responseContent) { + conversationHistory.push({ role: 'assistant', content: result.responseContent }) + } + } + + // ── Summary table ── + console.log('━'.repeat(120)) + console.log('SUMMARY') + console.log('━'.repeat(120)) + console.log() + + console.log(' Turn | Time | TTFT | Input | Cached | Cache% | Output | tok/s | e2e t/s | Cost') + console.log(' ' + '-'.repeat(110)) + + let totalCost = 0 + let totalInputTokens = 0 + let totalCachedTokens = 0 + let totalOutputTokens = 0 + let totalElapsedMs = 0 + + for (const r of results) { + const time = `${(r.elapsedMs / 1000).toFixed(2)}s` + const ttft = r.ttftMs !== undefined ? `${(r.ttftMs / 1000).toFixed(2)}s` : 'n/a' + const tokSec = r.outputTokensPerSec !== undefined ? r.outputTokensPerSec.toFixed(1) : 'n/a' + const e2eTokSec = r.elapsedMs > 0 ? (r.outputTokens / (r.elapsedMs / 1000)).toFixed(1) : 'n/a' + const cost = r.usage ? computeCost(r.usage).cost : 0 + const costStr = r.usage ? 
`$${cost.toFixed(6)}` : 'err' + + const inputTokens = r.usage && typeof r.usage.prompt_tokens === 'number' ? r.usage.prompt_tokens : 0 + const promptDetails = r.usage?.prompt_tokens_details as Record | undefined + const cachedTokens = typeof promptDetails?.cached_tokens === 'number' ? promptDetails.cached_tokens : 0 + const cacheRate = inputTokens > 0 ? `${((cachedTokens / inputTokens) * 100).toFixed(1)}%` : '0.0%' + + totalCost += cost + totalInputTokens += inputTokens + totalCachedTokens += cachedTokens + totalOutputTokens += r.outputTokens + totalElapsedMs += r.elapsedMs + + console.log( + ` ${r.label.padEnd(4).slice(0, 25).padEnd(25)} | ${time.padStart(8)} | ${ttft.padStart(7)} | ${String(inputTokens).padStart(6)} | ${String(cachedTokens).padStart(6)} | ${cacheRate.padStart(7)} | ${String(r.outputTokens).padStart(6)} | ${tokSec.padStart(6)} | ${e2eTokSec.padStart(7)} | ${costStr}`, + ) + } + + console.log(' ' + '-'.repeat(110)) + + const overallCacheRate = totalInputTokens > 0 ? ((totalCachedTokens / totalInputTokens) * 100).toFixed(1) : '0.0' + const totalTimeStr = `${(totalElapsedMs / 1000).toFixed(2)}s` + const overallTokSec = totalElapsedMs > 0 ? (totalOutputTokens / (totalElapsedMs / 1000)).toFixed(1) : 'n/a' + console.log(` ${'TOTAL'.padEnd(25)} | ${totalTimeStr.padStart(8)} | | ${String(totalInputTokens).padStart(6)} | ${String(totalCachedTokens).padStart(6)} | ${(overallCacheRate + '%').padStart(7)} | ${String(totalOutputTokens).padStart(6)} | | ${overallTokSec.padStart(7)} | $${totalCost.toFixed(6)}`) + console.log() + + // ── Cost analysis ── + console.log('━'.repeat(120)) + console.log('COST ANALYSIS') + console.log('━'.repeat(120)) + console.log() + + // What would the cost be without caching? + const costWithoutCaching = totalInputTokens * INPUT_COST_PER_TOKEN + totalOutputTokens * OUTPUT_COST_PER_TOKEN + const savings = costWithoutCaching - totalCost + const savingsPercent = costWithoutCaching > 0 ? 
((savings / costWithoutCaching) * 100).toFixed(1) : '0.0' + + console.log(` Total cost (actual): $${totalCost.toFixed(6)}`) + console.log(` Total cost (no caching): $${costWithoutCaching.toFixed(6)}`) + console.log(` Savings from caching: $${savings.toFixed(6)} (${savingsPercent}%)`) + console.log() + console.log(` Total input tokens: ${totalInputTokens}`) + console.log(` Total cached tokens: ${totalCachedTokens}`) + console.log(` Overall cache hit rate: ${overallCacheRate}%`) + console.log(` Total output tokens: ${totalOutputTokens}`) + console.log() + + // TTFT analysis + const ttfts = results.filter((r) => r.ttftMs !== undefined).map((r) => r.ttftMs!) + if (ttfts.length > 0) { + const avgTtft = ttfts.reduce((a, b) => a + b, 0) / ttfts.length + const minTtft = Math.min(...ttfts) + const maxTtft = Math.max(...ttfts) + console.log(` TTFT — avg: ${(avgTtft / 1000).toFixed(2)}s, min: ${(minTtft / 1000).toFixed(2)}s, max: ${(maxTtft / 1000).toFixed(2)}s`) + + if (results[0].ttftMs !== undefined && ttfts.length > 1) { + const coldTtft = results[0].ttftMs + const warmTtfts = ttfts.slice(1) + const avgWarmTtft = warmTtfts.reduce((a, b) => a + b, 0) / warmTtfts.length + console.log(` TTFT — cold (turn 1): ${(coldTtft / 1000).toFixed(2)}s, avg warm (turns 2-${TURN_PROMPTS.length}): ${(avgWarmTtft / 1000).toFixed(2)}s`) + if (avgWarmTtft < coldTtft) { + console.log(` ✅ Warm TTFT is ${((1 - avgWarmTtft / coldTtft) * 100).toFixed(1)}% faster than cold TTFT`) + } + } + } + + console.log() + console.log('Done!') +} + +main() diff --git a/scripts/test-siliconflow.ts b/scripts/test-siliconflow.ts index 845db4a3cb..c62d9d47c8 100644 --- a/scripts/test-siliconflow.ts +++ b/scripts/test-siliconflow.ts @@ -256,9 +256,9 @@ async function makeConversationStreamRequest( } async function main() { - const apiKey = process.env.SILICONFLOW_API_KEY + const apiKey = process.env.SILICON_FLOW_API_KEY if (!apiKey) { - console.error('❌ SILICONFLOW_API_KEY is not set. 
Add it to .env.local or pass it directly.') + console.error('❌ SILICON_FLOW_API_KEY is not set. Add it to .env.local or pass it directly.') process.exit(1) } diff --git a/web/src/content/advanced/how-does-it-work.mdx b/web/src/content/advanced/how-does-it-work.mdx index d1f98f536d..accdc2c3d4 100644 --- a/web/src/content/advanced/how-does-it-work.mdx +++ b/web/src/content/advanced/how-does-it-work.mdx @@ -26,7 +26,7 @@ The main agent ("Buffy") runs on Claude Opus 4.6. It reads your prompt, gathers - [**Thinker**](/publishers/codebuff/agents/thinker) (GPT-5.1, Gemini 2.5 Pro) - works through hard problems - [**Editor**](/publishers/codebuff/agents/editor) (GPT-5.1, Claude Opus 4.6) - writes and modifies code - [**Reviewer**](/publishers/codebuff/agents/reviewer) (Claude Opus 4.6, MiniMax M2.5 in Free mode) - catches bugs and style issues -- [**Commander**](/publishers/codebuff/agents/commander) (Grok 4 Fast or Claude Sonnet 4.5) - runs terminal commands +- [**Basher**](/publishers/codebuff/agents/basher) (Gemini 3.1 Flash Lite) - runs terminal commands ## Best-of-N Selection (Max Mode) @@ -38,6 +38,6 @@ In Max mode, Codebuff spawns multiple editors with different strategies. A selec 2. File pickers and searchers find relevant code 3. Thinkers analyze the problem if needed 4. Editors generate changes -5. Reviewers check for issues; commanders run tests +5. Reviewers check for issues; bashers run tests The server is stateless. It streams requests to model providers (Anthropic, OpenAI, Google, xAI) over websockets. Your code stays local; only relevant context is sent. diff --git a/web/src/content/agents/overview.mdx b/web/src/content/agents/overview.mdx index d189a62f9d..e008e7e2b4 100644 --- a/web/src/content/agents/overview.mdx +++ b/web/src/content/agents/overview.mdx @@ -29,7 +29,7 @@ Control agents with TypeScript generator functions. 
Orchestrate workflows, branc - [`codebuff/thinker`](/publishers/codebuff/agents/thinker) - Deep thinking and problem analysis
- [`codebuff/researcher`](/publishers/codebuff/agents/researcher) - Web search and documentation lookup
- [`codebuff/file-picker`](/publishers/codebuff/agents/file-picker) - File discovery in your codebase
-- [`codebuff/commander`](/publishers/codebuff/agents/commander) - Terminal command execution
+- [`codebuff/basher`](/publishers/codebuff/agents/basher) - Terminal command execution in a bash shell
- [`codebuff/code-searcher`](/publishers/codebuff/agents/code-searcher) - Search patterns in code files

Browse all available agents at the [Agent Store](https://codebuff.com/store).