Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 19 additions & 11 deletions src/core/ai/gateway.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,24 +105,32 @@ export function configureGateway(config: AIGatewayConfig): void {
*/
const _warnedRecipes = new Set<string>();

/**
 * Reduce a declared `max_batch_tokens` value to one that can be used as a cap.
 *
 * @param maxBatchTokens - The raw value taken from a recipe's embedding
 *   touchpoint; may be absent.
 * @returns The input unchanged when it is a number greater than zero;
 *   `undefined` for a missing value, `0`, a negative number, or `NaN`.
 */
function getUsableMaxBatchTokens(maxBatchTokens: number | undefined): number | undefined {
  // Anything that is not a number carries no cap.
  if (typeof maxBatchTokens !== 'number') return undefined;
  // Only strictly positive values count; `NaN > 0` is false, so NaN falls through.
  if (maxBatchTokens > 0) return maxBatchTokens;
  return undefined;
}

/**
* Walk every registered recipe with an `embedding` touchpoint. Each one
* missing `max_batch_tokens` gets exactly one stderr line per process for
* its first appearance. Recipes WITH the field stay quiet. The
* recursive-halving safety net only fires when `max_batch_tokens` is set,
* so a recipe that forgets it has no protection if the provider has a
* batch cap. Loud-fail over silent-skip per CLAUDE.md; a future
* Cohere/Mistral/Jina recipe that inherits the embedding-touchpoint
* pattern but forgets the cap re-creates the v0.27 Voyage backfill loop.
* The warning calls that out before production traffic hits it.
* its first appearance. Recipes WITH the field stay quiet. A `0` value is
* an explicit sentinel for model-specific or proxy-defined limits where the
* shipped recipe cannot truthfully advertise one static cap.
*
* The recursive-halving safety net only fires after a token-limit miss, so a
* recipe that accidentally omits a known cap loses the proactive pre-split.
* Loud-fail over silent-skip per CLAUDE.md; a future Cohere/Mistral/Jina
* recipe that inherits the embedding-touchpoint pattern but forgets the cap
* re-creates the v0.27 Voyage backfill loop. The warning calls that out
* before production traffic hits it.
*/
function warnRecipesMissingBatchTokens(): void {
for (const recipe of listRecipes()) {
const embedding = recipe.touchpoints?.embedding;
if (!embedding || embedding.max_batch_tokens !== undefined) continue;
const maxBatchTokens = embedding?.max_batch_tokens;
if (!embedding || getUsableMaxBatchTokens(maxBatchTokens) !== undefined || maxBatchTokens === 0) continue;
// OpenAI is the canonical "no cap declared, fast path is intentional"
// recipe; suppress the warning for it. Every other recipe missing the
// field is suspicious.
// recipe; suppress the warning for it. Every other accidental omission
// is suspicious.
if (recipe.id === 'openai') continue;
if (_warnedRecipes.has(recipe.id)) continue;
_warnedRecipes.add(recipe.id);
Expand Down Expand Up @@ -357,7 +365,7 @@ export async function embed(texts: string[]): Promise<Float32Array[]> {
const expected = cfg.embedding_dimensions ?? DEFAULT_EMBEDDING_DIMENSIONS;

const embedding = recipe.touchpoints?.embedding;
const maxBatchTokens = embedding?.max_batch_tokens;
const maxBatchTokens = getUsableMaxBatchTokens(embedding?.max_batch_tokens);
const charsPerToken = embedding?.chars_per_token ?? DEFAULT_CHARS_PER_TOKEN;

// Pre-split is gated on max_batch_tokens. Recipes without it (e.g. OpenAI)
Expand Down
4 changes: 4 additions & 0 deletions src/core/ai/recipes/google.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ export const google: Recipe = {
dims_options: [768, 1536, 3072],
cost_per_1m_tokens_usd: 0.15,
price_last_verified: '2026-04-20',
// Google documents gemini-embedding-001 at 250 input texts / 20K
// tokens per request. Use the raw cap here; gateway.ts applies the
// recipe-level safety factor before pre-splitting.
max_batch_tokens: 20_000,
},
expansion: {
models: ['gemini-2.0-flash', 'gemini-2.0-flash-lite'],
Expand Down
4 changes: 4 additions & 0 deletions src/core/ai/recipes/litellm-proxy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ export const litellmProxy: Recipe = {
default_dims: 0, // user must declare --embedding-dimensions explicitly
cost_per_1m_tokens_usd: undefined,
price_last_verified: '2026-04-20',
// Sentinel: LiteLLM proxies arbitrary upstream embedding models, so the
// real batch cap lives in the proxy/model config (`max_input_tokens`),
// not in a single truthful static recipe constant.
max_batch_tokens: 0,
},
},
setup_hint: 'Run LiteLLM (https://docs.litellm.ai) in front of any provider; set LITELLM_BASE_URL + pass --embedding-model litellm:<model> and --embedding-dimensions <N>.',
Expand Down
4 changes: 4 additions & 0 deletions src/core/ai/recipes/ollama.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ export const ollama: Recipe = {
default_dims: 768, // nomic-embed-text native dim
cost_per_1m_tokens_usd: 0,
price_last_verified: '2026-04-20',
// Sentinel: Ollama's embed cap is model-/runtime-specific (`num_ctx`,
// context window, truncate=true by default), so there is no truthful
// static provider-wide max_batch_tokens value for this recipe.
max_batch_tokens: 0,
},
},
setup_hint: 'Install Ollama from https://ollama.ai, then `ollama pull nomic-embed-text` and `ollama serve`.',
Expand Down
25 changes: 10 additions & 15 deletions test/ai/adaptive-embed-batch.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import {
__getShrinkStateForTests,
} from '../../src/core/ai/gateway.ts';
import { AIConfigError, AITransientError } from '../../src/core/ai/errors.ts';
import { getRecipe } from '../../src/core/ai/recipes/index.ts';

// --------- Test helpers ---------

Expand Down Expand Up @@ -357,34 +358,28 @@ describe('shrink-on-miss adaptive cache', () => {

// --------- 7. Startup warning (D9-B) ---------

describe('startup warning for recipes missing max_batch_tokens', () => {
describe('startup warning hygiene for shipped recipes', () => {
beforeEach(() => resetGateway());

test('first configureGateway call warns about each missing-cap recipe; subsequent calls suppressed', () => {
test('shipped embedding recipes are either capped or explicitly opt out', () => {
expect(getRecipe('google')!.touchpoints.embedding!.max_batch_tokens).toBe(20_000);
expect(getRecipe('ollama')!.touchpoints.embedding!.max_batch_tokens).toBe(0);
expect(getRecipe('litellm')!.touchpoints.embedding!.max_batch_tokens).toBe(0);
});

test('configureGateway stays quiet for the shipped registry; subsequent calls remain quiet', () => {
const warnings: string[] = [];
const original = console.warn;
console.warn = (msg: string) => warnings.push(String(msg));
try {
configureOpenAI();
const firstCallCount = warnings.length;
// Reconfigure: the warning should NOT re-fire for the same recipes
// within one process (we already told the operator).
configureOpenAI();
expect(warnings.length).toBe(firstCallCount);
} finally {
console.warn = original;
}

// The warning text should match the documented contract.
const contractMatch = warnings.filter(w =>
w.includes('[ai.gateway]') && w.includes('declares an embedding touchpoint'),
);
expect(contractMatch.length).toBeGreaterThan(0);

// Voyage declares max_batch_tokens → suppressed. OpenAI is the
// canonical fast-path recipe → also suppressed by id. Both must be
// absent from the warnings.
expect(warnings.find(w => w.includes('"voyage"'))).toBeUndefined();
expect(warnings.find(w => w.includes('"openai"'))).toBeUndefined();
expect(warnings).toEqual([]);
});
});