diff --git a/src/core/ai/gateway.ts b/src/core/ai/gateway.ts index 0db796f66..c8428e3ae 100644 --- a/src/core/ai/gateway.ts +++ b/src/core/ai/gateway.ts @@ -105,24 +105,32 @@ export function configureGateway(config: AIGatewayConfig): void { */ const _warnedRecipes = new Set(); +function getUsableMaxBatchTokens(maxBatchTokens: number | undefined): number | undefined { + return typeof maxBatchTokens === 'number' && maxBatchTokens > 0 ? maxBatchTokens : undefined; +} + /** * Walk every registered recipe with an `embedding` touchpoint. Each one * missing `max_batch_tokens` gets exactly one stderr line per process for - * its first appearance. Recipes WITH the field stay quiet. The - * recursive-halving safety net only fires when `max_batch_tokens` is set, - * so a recipe that forgets it has no protection if the provider has a - * batch cap. Loud-fail over silent-skip per CLAUDE.md; a future - * Cohere/Mistral/Jina recipe that inherits the embedding-touchpoint - * pattern but forgets the cap re-creates the v0.27 Voyage backfill loop. - * The warning calls that out before production traffic hits it. + * its first appearance. Recipes WITH the field stay quiet. A `0` value is + * an explicit sentinel for model-specific or proxy-defined limits where the + * shipped recipe cannot truthfully advertise one static cap. + * + * The recursive-halving safety net only fires after a token-limit miss, so a + * recipe that accidentally omits a known cap loses the proactive pre-split. + * Loud-fail over silent-skip per CLAUDE.md; a future Cohere/Mistral/Jina + * recipe that inherits the embedding-touchpoint pattern but forgets the cap + * re-creates the v0.27 Voyage backfill loop. The warning calls that out + * before production traffic hits it. */ function warnRecipesMissingBatchTokens(): void { for (const recipe of listRecipes()) { const embedding = recipe.touchpoints?.embedding; - if (!embedding || embedding.max_batch_tokens !== undefined) continue; + const maxBatchTokens = embedding?.max_batch_tokens; + if (!embedding || getUsableMaxBatchTokens(maxBatchTokens) !== undefined || maxBatchTokens === 0) continue; // OpenAI is the canonical "no cap declared, fast path is intentional" - // recipe; suppress the warning for it. Every other recipe missing the - // field is suspicious. + // recipe; suppress the warning for it. Every other accidental omission + // is suspicious. if (recipe.id === 'openai') continue; if (_warnedRecipes.has(recipe.id)) continue; _warnedRecipes.add(recipe.id); @@ -357,7 +365,7 @@ export async function embed(texts: string[]): Promise { const expected = cfg.embedding_dimensions ?? DEFAULT_EMBEDDING_DIMENSIONS; const embedding = recipe.touchpoints?.embedding; - const maxBatchTokens = embedding?.max_batch_tokens; + const maxBatchTokens = getUsableMaxBatchTokens(embedding?.max_batch_tokens); const charsPerToken = embedding?.chars_per_token ?? DEFAULT_CHARS_PER_TOKEN; // Pre-split is gated on max_batch_tokens. Recipes without it (e.g. OpenAI) diff --git a/src/core/ai/recipes/google.ts b/src/core/ai/recipes/google.ts index 58e47cab3..70c24aeb2 100644 --- a/src/core/ai/recipes/google.ts +++ b/src/core/ai/recipes/google.ts @@ -16,6 +16,10 @@ export const google: Recipe = { dims_options: [768, 1536, 3072], cost_per_1m_tokens_usd: 0.15, price_last_verified: '2026-04-20', + // Google documents gemini-embedding-001 at 250 input texts / 20K + // tokens per request. Use the raw cap here; gateway.ts applies the + // recipe-level safety factor before pre-splitting. + max_batch_tokens: 20_000, }, expansion: { models: ['gemini-2.0-flash', 'gemini-2.0-flash-lite'], diff --git a/src/core/ai/recipes/litellm-proxy.ts b/src/core/ai/recipes/litellm-proxy.ts index 8f7da2dea..64c87177c 100644 --- a/src/core/ai/recipes/litellm-proxy.ts +++ b/src/core/ai/recipes/litellm-proxy.ts @@ -26,6 +26,10 @@ export const litellmProxy: Recipe = { default_dims: 0, // user must declare --embedding-dimensions explicitly cost_per_1m_tokens_usd: undefined, price_last_verified: '2026-04-20', + // Sentinel: LiteLLM proxies arbitrary upstream embedding models, so the + // real batch cap lives in the proxy/model config (`max_input_tokens`), + // not in a single truthful static recipe constant. + max_batch_tokens: 0, }, }, setup_hint: 'Run LiteLLM (https://docs.litellm.ai) in front of any provider; set LITELLM_BASE_URL + pass --embedding-model litellm: and --embedding-dimensions .', diff --git a/src/core/ai/recipes/ollama.ts b/src/core/ai/recipes/ollama.ts index 31fac0ae6..8df75a606 100644 --- a/src/core/ai/recipes/ollama.ts +++ b/src/core/ai/recipes/ollama.ts @@ -17,6 +17,10 @@ export const ollama: Recipe = { default_dims: 768, // nomic-embed-text native dim cost_per_1m_tokens_usd: 0, price_last_verified: '2026-04-20', + // Sentinel: Ollama's embed cap is model-/runtime-specific (`num_ctx`, + // context window, truncate=true by default), so there is no truthful + // static provider-wide max_batch_tokens value for this recipe. + max_batch_tokens: 0, }, }, setup_hint: 'Install Ollama from https://ollama.ai, then `ollama pull nomic-embed-text` and `ollama serve`.', diff --git a/test/ai/adaptive-embed-batch.test.ts b/test/ai/adaptive-embed-batch.test.ts index 6cd8d06ab..6316bd8d5 100644 --- a/test/ai/adaptive-embed-batch.test.ts +++ b/test/ai/adaptive-embed-batch.test.ts @@ -39,6 +39,7 @@ import { __getShrinkStateForTests, } from '../../src/core/ai/gateway.ts'; import { AIConfigError, AITransientError } from '../../src/core/ai/errors.ts'; +import { getRecipe } from '../../src/core/ai/recipes/index.ts'; // --------- Test helpers --------- @@ -357,34 +358,28 @@ describe('shrink-on-miss adaptive cache', () => { // --------- 7. Startup warning (D9-B) --------- -describe('startup warning for recipes missing max_batch_tokens', () => { +describe('startup warning hygiene for shipped recipes', () => { beforeEach(() => resetGateway()); - test('first configureGateway call warns about each missing-cap recipe; subsequent calls suppressed', () => { + test('shipped embedding recipes are either capped or explicitly opt out', () => { + expect(getRecipe('google')!.touchpoints.embedding!.max_batch_tokens).toBe(20_000); + expect(getRecipe('ollama')!.touchpoints.embedding!.max_batch_tokens).toBe(0); + expect(getRecipe('litellm')!.touchpoints.embedding!.max_batch_tokens).toBe(0); + }); + + test('configureGateway stays quiet for the shipped registry; subsequent calls remain quiet', () => { const warnings: string[] = []; const original = console.warn; console.warn = (msg: string) => warnings.push(String(msg)); try { configureOpenAI(); const firstCallCount = warnings.length; - // Reconfigure: the warning should NOT re-fire for the same recipes - // within one process (we already told the operator). configureOpenAI(); expect(warnings.length).toBe(firstCallCount); } finally { console.warn = original; } - // The warning text should match the documented contract. - const contractMatch = warnings.filter(w => - w.includes('[ai.gateway]') && w.includes('declares an embedding touchpoint'), - ); - expect(contractMatch.length).toBeGreaterThan(0); - - // Voyage declares max_batch_tokens → suppressed. OpenAI is the - // canonical fast-path recipe → also suppressed by id. Both must be - // absent from the warnings. - expect(warnings.find(w => w.includes('"voyage"'))).toBeUndefined(); - expect(warnings.find(w => w.includes('"openai"'))).toBeUndefined(); + expect(warnings).toEqual([]); }); });