Skip to content
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,30 @@ Supported model families:
> since vectors are not cross-compatible between models. The prompt format is
> automatically adjusted for each model family.

### OpenAI Embeddings (Optional)

As an alternative to local embedding models, you can use OpenAI's API for faster, more reliable embeddings:

```yaml
# ~/.config/qmd/index.yml
embedding:
provider: openai
openai:
api_key: sk-... # Optional, falls back to QMD_OPENAI_API_KEY or OPENAI_API_KEY env var
model: text-embedding-3-small # Optional, this is the default
expansion_model: gpt-4o-mini # Optional, model for query expansion/reranking
base_url: https://api.openai.com/v1 # Optional, for OpenAI-compatible APIs (Ollama, vLLM, etc.)
```

Benefits:
- **~10x faster** than local CPU inference
- **No GPU required** - works on any machine
- **More reliable** - no local model loading issues
- **Cost:** ~$0.02 per 1M tokens with the default `text-embedding-3-small` model
- **OpenAI-compatible** - works with Ollama, vLLM, Azure, and other compatible APIs via `base_url`

When using OpenAI embeddings, query expansion and reranking use the OpenAI API instead of local models.

## Installation

```sh
Expand Down
228 changes: 129 additions & 99 deletions bun.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,10 @@
"better-sqlite3": "12.8.0",
"fast-glob": "3.3.3",
"node-llama-cpp": "3.18.1",
"openai": "^4.77.0",
"picomatch": "4.0.4",
"sqlite-vec": "0.1.9",
"tiktoken": "^1.0.22",
"web-tree-sitter": "0.26.7",
"yaml": "2.8.3",
"zod": "4.2.1"
Expand Down
180 changes: 114 additions & 66 deletions src/cli/qmd.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ import {
type ReindexResult,
type ChunkStrategy,
} from "../store.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, getDefaultEmbeddingLLM, getEmbeddingConfig, withLLMSession, pullModels, setEmbeddingConfig, isUsingOpenAI, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import {
formatSearchResults,
formatDocuments,
Expand All @@ -98,6 +98,7 @@ import {
listAllContexts,
setConfigIndexName,
loadConfig,
getEmbeddingConfig as getEmbeddingConfigFromYaml,
} from "../collections.js";
import { getEmbeddedQmdSkillContent, getEmbeddedQmdSkillFiles } from "../embedded-skills.js";

Expand Down Expand Up @@ -454,8 +455,14 @@ async function showStatus(): Promise<void> {
console.log(`\n${c.dim}No collections. Run 'qmd collection add .' to index markdown files.${c.reset}`);
}

// Models
{
// Models / Provider info
if (isUsingOpenAI()) {
const embCfg = getEmbeddingConfig();
console.log(`\n${c.bold}Provider${c.reset}`);
console.log(` Mode: ${c.green}OpenAI-compatible${c.reset}`);
console.log(` Base URL: ${embCfg.openai?.baseURL || process.env.QMD_OPENAI_BASE_URL || '(default)'}`);
console.log(` Embed model: ${embCfg.openai?.embedModel || 'text-embedding-3-small'}`);
} else {
// hf:org/repo/file.gguf → https://huggingface.co/org/repo
const hfLink = (uri: string) => {
const match = uri.match(/^hf:([^/]+\/[^/]+)\//);
Expand All @@ -465,38 +472,37 @@ async function showStatus(): Promise<void> {
console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`);
console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`);
console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`);
}

// Device / GPU info
console.log(`\n${c.bold}Device${c.reset}`);
try {
const llm = getDefaultLlamaCpp();
const device = await llm.getDeviceInfo({ allowBuild: false });
if (device.gpu) {
console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
if (device.gpuDevices.length > 0) {
// Deduplicate and count GPUs
const counts = new Map<string, number>();
for (const name of device.gpuDevices) {
counts.set(name, (counts.get(name) || 0) + 1);
// Device / GPU info (local mode only — skip in OpenAI mode to avoid triggering compilation)
console.log(`\n${c.bold}Device${c.reset}`);
try {
const llm = getDefaultLlamaCpp();
const device = await llm.getDeviceInfo({ allowBuild: false });
if (device.gpu) {
console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
if (device.gpuDevices.length > 0) {
const counts = new Map<string, number>();
for (const name of device.gpuDevices) {
counts.set(name, (counts.get(name) || 0) + 1);
}
const deviceStr = Array.from(counts.entries())
.map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
.join(', ');
console.log(` Devices: ${deviceStr}`);
}
const deviceStr = Array.from(counts.entries())
.map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
.join(', ');
console.log(` Devices: ${deviceStr}`);
if (device.vram) {
console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
}
} else {
console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
}
if (device.vram) {
console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
console.log(` CPU: ${device.cpuCores} math cores`);
} catch (error) {
console.log(` Status: ${c.dim}skipped${c.reset} (status probe does not build llama.cpp backends)`);
if (error instanceof Error && error.message) {
console.log(` ${c.dim}${error.message}${c.reset}`);
}
} else {
console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
}
console.log(` CPU: ${device.cpuCores} math cores`);
} catch (error) {
console.log(` Status: ${c.dim}skipped${c.reset} (status probe does not build llama.cpp backends)`);
if (error instanceof Error && error.message) {
console.log(` ${c.dim}${error.message}${c.reset}`);
}
}

Expand Down Expand Up @@ -1704,34 +1710,37 @@ async function vectorIndex(

const startTime = Date.now();

const result = await generateEmbeddings(storeInstance, {
force,
model,
maxDocsPerBatch: batchOptions?.maxDocsPerBatch,
maxBatchBytes: batchOptions?.maxBatchBytes,
chunkStrategy: batchOptions?.chunkStrategy,
onProgress: (info) => {
if (info.totalBytes === 0) return;
const percent = (info.bytesProcessed / info.totalBytes) * 100;
progress.set(percent);

const elapsed = (Date.now() - startTime) / 1000;
const bytesPerSec = info.bytesProcessed / elapsed;
const remainingBytes = info.totalBytes - info.bytesProcessed;
const etaSec = remainingBytes / bytesPerSec;

const bar = renderProgressBar(percent);
const percentStr = percent.toFixed(0).padStart(3);
const throughput = `${formatBytes(bytesPerSec)}/s`;
const eta = elapsed > 2 ? formatETA(etaSec) : "...";
const errStr = info.errors > 0 ? ` ${c.yellow}${info.errors} err${c.reset}` : "";

if (isTTY) process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${info.chunksEmbedded}/${info.totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset} `);
},
});
let result: Awaited<ReturnType<typeof generateEmbeddings>>;
try {
result = await generateEmbeddings(storeInstance, {
force,
model,
maxDocsPerBatch: batchOptions?.maxDocsPerBatch,
maxBatchBytes: batchOptions?.maxBatchBytes,
chunkStrategy: batchOptions?.chunkStrategy,
onProgress: (info) => {
if (info.totalBytes === 0) return;
const percent = (info.bytesProcessed / info.totalBytes) * 100;
progress.set(percent);

progress.clear();
cursor.show();
const elapsed = (Date.now() - startTime) / 1000;
const bytesPerSec = info.bytesProcessed / elapsed;
const remainingBytes = info.totalBytes - info.bytesProcessed;
const etaSec = remainingBytes / bytesPerSec;

const bar = renderProgressBar(percent);
const percentStr = percent.toFixed(0).padStart(3);
const throughput = `${formatBytes(bytesPerSec)}/s`;
const eta = elapsed > 2 ? formatETA(etaSec) : "...";
const errStr = info.errors > 0 ? ` ${c.yellow}${info.errors} err${c.reset}` : "";

if (isTTY) process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${info.chunksEmbedded}/${info.totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset} `);
},
});
} finally {
progress.clear();
cursor.show();
}

const totalTimeSec = result.durationMs / 1000;

Expand Down Expand Up @@ -2235,10 +2244,8 @@ function search(query: string, opts: OutputOptions): void {

// Use large limit for --all, otherwise fetch more than needed and let outputResults filter
const fetchLimit = opts.all ? 100000 : Math.max(50, opts.limit * 2);
const results = filterByCollections(
searchFTS(db, query, fetchLimit, singleCollection),
collectionNames
);
// Pass collections directly to searchFTS (it now supports arrays)
const results = searchFTS(db, query, fetchLimit, collectionNames.length > 0 ? collectionNames : undefined);

// Add context to results
const resultsWithContext = results.map(r => ({
Expand Down Expand Up @@ -2286,7 +2293,7 @@ async function vectorSearch(query: string, opts: OutputOptions, _model: string =

checkIndexHealth(store.db);

await withLLMSession(async () => {
const llmSession = async () => {
let results = await vectorSearchQuery(store, query, {
collection: singleCollection,
limit: opts.all ? 500 : (opts.limit || 10),
Expand Down Expand Up @@ -2324,7 +2331,15 @@ async function vectorSearch(query: string, opts: OutputOptions, _model: string =
context: r.context,
docid: r.docid,
})), query, { ...opts, limit: results.length });
}, { maxDuration: 10 * 60 * 1000, name: 'vectorSearch' });
};

if (isUsingOpenAI()) {
await llmSession();
} else {
await withLLMSession(async () => llmSession(),
{ maxDuration: 10 * 60 * 1000, name: 'vectorSearch' }
);
}
}

async function querySearch(query: string, opts: OutputOptions, _embedModel: string = DEFAULT_EMBED_MODEL, _rerankModel: string = DEFAULT_RERANK_MODEL): Promise<void> {
Expand All @@ -2342,7 +2357,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
// Intent can come from --intent flag or from intent: line in query document
const intent = opts.intent || parsed?.intent;

await withLLMSession(async () => {
const querySession = async () => {
let results;

if (parsed) {
Expand Down Expand Up @@ -2462,7 +2477,15 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
docid: r.docid,
explain: r.explain,
})), displayQuery, { ...opts, limit: results.length });
}, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
};

if (isUsingOpenAI()) {
await querySession();
} else {
await withLLMSession(async () => querySession(),
{ maxDuration: 10 * 60 * 1000, name: 'querySearch' }
);
}
}

// Parse CLI arguments using util.parseArgs
Expand Down Expand Up @@ -2847,6 +2870,31 @@ if (isMain) {
process.exit(cli.values.help ? 0 : 1);
}

// Load embedding configuration.
// Priority: YAML config > env vars > default (local).
// Setting QMD_OPENAI_BASE_URL alone is enough to activate OpenAI mode.
const embeddingYamlConfig = getEmbeddingConfigFromYaml();
const useOpenAI = embeddingYamlConfig.provider === 'openai'
|| !!process.env.QMD_OPENAI_BASE_URL
|| process.env.QMD_OPENAI === '1';

if (useOpenAI) {
setEmbeddingConfig({
provider: 'openai',
openai: {
apiKey: embeddingYamlConfig.openai?.api_key || process.env.QMD_OPENAI_API_KEY,
embedModel: embeddingYamlConfig.openai?.model || process.env.QMD_OPENAI_EMBED_MODEL,
expansionModel: embeddingYamlConfig.openai?.expansion_model,
rerankModel: embeddingYamlConfig.openai?.rerank_model,
baseURL: embeddingYamlConfig.openai?.base_url || process.env.QMD_OPENAI_BASE_URL,
chatBaseURL: embeddingYamlConfig.openai?.chat_base_url,
chatApiKey: embeddingYamlConfig.openai?.chat_api_key,
rerankBaseURL: embeddingYamlConfig.openai?.rerank_base_url,
rerankApiKey: embeddingYamlConfig.openai?.rerank_api_key,
},
});
}

switch (cli.command) {
case "context": {
const subcommand = cli.args[0];
Expand Down
28 changes: 28 additions & 0 deletions src/collections.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,24 @@ export interface ModelsConfig {
generate?: string;
}

/**
 * Embedding provider configuration (optional in config file).
 *
 * Selects whether embeddings are produced locally or via an
 * OpenAI-compatible HTTP API. All fields are optional; omitting the
 * whole section behaves the same as `{ provider: 'local' }`.
 * The separate `chat_*` / `rerank_*` URL and key fields allow routing
 * expansion and reranking traffic to different endpoints than the
 * embedding endpoint.
 */
export interface EmbeddingProviderConfig {
  provider?: 'local' | 'openai'; // Default: 'local'
  openai?: {
    api_key?: string; // Falls back to QMD_OPENAI_API_KEY / OPENAI_API_KEY env var
    model?: string; // Default: 'text-embedding-3-small'
    expansion_model?: string; // Default: 'gpt-4o-mini'
    rerank_model?: string; // Default: falls back to expansion_model
    base_url?: string; // Base URL for embeddings (OpenAI-compatible)
    chat_base_url?: string; // Separate base URL for expansion (falls back to base_url)
    chat_api_key?: string; // Separate API key for chat endpoint (falls back to api_key)
    rerank_base_url?: string; // Separate base URL for reranking (falls back to chat_base_url)
    rerank_api_key?: string; // Separate API key for rerank endpoint (falls back to chat_api_key)
  };
}

/**
* The complete configuration file structure
*/
Expand All @@ -51,6 +69,7 @@ export interface CollectionConfig {
editor_uri_template?: string; // Alias for editor_uri
collections: Record<string, Collection>; // Collection name -> config
models?: ModelsConfig;
embedding?: EmbeddingProviderConfig; // Optional embedding provider settings
}

/**
Expand Down Expand Up @@ -510,3 +529,12 @@ export function isValidCollectionName(name: string): boolean {
// Allow alphanumeric, hyphens, underscores
return /^[a-zA-Z0-9_-]+$/.test(name);
}

/**
 * Read the embedding provider settings from the loaded config file.
 *
 * @returns The `embedding` section of the configuration, or a
 *          local-provider default when that section is absent.
 */
export function getEmbeddingConfig(): EmbeddingProviderConfig {
  const { embedding } = loadConfig();
  return embedding ?? { provider: 'local' };
}
1 change: 1 addition & 0 deletions src/db.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ export interface Database {
exec(sql: string): void;
prepare(sql: string): Statement;
loadExtension(path: string): void;
transaction<T>(fn: () => T): () => T;
close(): void;
}

Expand Down
Loading