Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
274 changes: 160 additions & 114 deletions bun.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,10 @@
"better-sqlite3": "^12.4.5",
"fast-glob": "^3.3.0",
"node-llama-cpp": "^3.17.1",
"openai": "^6.33.0",
"picomatch": "^4.0.0",
"sqlite-vec": "^0.1.7-alpha.2",
"tiktoken": "^1.0.22",
"yaml": "^2.8.2",
"zod": "4.2.1"
},
Expand Down
85 changes: 56 additions & 29 deletions src/cli/qmd.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ import {
syncConfigToDb,
type ReindexResult,
} from "../store.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, getDefaultEmbeddingLLM, withLLMSession, pullModels, setEmbeddingConfig, isUsingOpenAI, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import {
formatSearchResults,
formatDocuments,
Expand All @@ -96,6 +96,7 @@ import {
listAllContexts,
setConfigIndexName,
loadConfig,
getEmbeddingConfig as getEmbeddingConfigFromYaml,
} from "../collections.js";
import { getEmbeddedQmdSkillContent, getEmbeddedQmdSkillFiles } from "../embedded-skills.js";

Expand Down Expand Up @@ -276,7 +277,6 @@ function computeDisplayPath(
return filepath;
}


function formatTimeAgo(date: Date): string {
const seconds = Math.floor((Date.now() - date.getTime()) / 1000);
if (seconds < 60) return `${seconds}s ago`;
Expand Down Expand Up @@ -426,34 +426,40 @@ async function showStatus(): Promise<void> {
console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`);
}

// Device / GPU info
try {
const llm = getDefaultLlamaCpp();
const device = await llm.getDeviceInfo();
console.log(`\n${c.bold}Device${c.reset}`);
if (device.gpu) {
console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
if (device.gpuDevices.length > 0) {
// Deduplicate and count GPUs
const counts = new Map<string, number>();
for (const name of device.gpuDevices) {
counts.set(name, (counts.get(name) || 0) + 1);
// Device / GPU info (local llama-cpp mode only)
if (!isUsingOpenAI()) {
try {
const llm = getDefaultLlamaCpp();
const device = await llm.getDeviceInfo();
console.log(`
${c.bold}Device${c.reset}`);
if (device.gpu) {
console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
if (device.gpuDevices.length > 0) {
const counts = new Map<string, number>();
for (const name of device.gpuDevices) {
counts.set(name, (counts.get(name) || 0) + 1);
}
const deviceStr = Array.from(counts.entries())
.map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
.join(', ');
console.log(` Devices: ${deviceStr}`);
}
const deviceStr = Array.from(counts.entries())
.map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
.join(', ');
console.log(` Devices: ${deviceStr}`);
}
if (device.vram) {
console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
if (device.vram) {
console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
}
} else {
console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
}
} else {
console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
console.log(` CPU: ${device.cpuCores} math cores`);
} catch {
// Don't fail status if LLM init fails
}
console.log(` CPU: ${device.cpuCores} math cores`);
} catch {
// Don't fail status if LLM init fails
} else {
console.log(`
${c.bold}Provider${c.reset}`);
console.log(` Embeddings/Rerank/Expansion: OpenAI-compatible endpoint`);
}

// Tips section
Expand Down Expand Up @@ -2147,7 +2153,7 @@ async function vectorSearch(query: string, opts: OutputOptions, _model: string =

checkIndexHealth(store.db);

await withLLMSession(async () => {
const run = async () => {
let results = await vectorSearchQuery(store, query, {
collection: singleCollection,
limit: opts.all ? 500 : (opts.limit || 10),
Expand Down Expand Up @@ -2185,7 +2191,13 @@ async function vectorSearch(query: string, opts: OutputOptions, _model: string =
context: r.context,
docid: r.docid,
})), query, { ...opts, limit: results.length });
}, { maxDuration: 10 * 60 * 1000, name: 'vectorSearch' });
};

if (isUsingOpenAI()) {
await run();
} else {
await withLLMSession(async () => run(), { maxDuration: 10 * 60 * 1000, name: 'vectorSearch' });
}
}

async function querySearch(query: string, opts: OutputOptions, _embedModel: string = DEFAULT_EMBED_MODEL, _rerankModel: string = DEFAULT_RERANK_MODEL): Promise<void> {
Expand Down Expand Up @@ -2698,6 +2710,21 @@ if (isMain) {
process.exit(cli.values.help ? 0 : 1);
}

// Load embedding configuration from config file or env var
const embeddingYamlConfig = getEmbeddingConfigFromYaml();
const useOpenAI = process.env.QMD_OPENAI === '1' || embeddingYamlConfig.provider === 'openai';

if (useOpenAI) {
setEmbeddingConfig({
provider: 'openai',
openai: {
apiKey: process.env.OPENAI_API_KEY || embeddingYamlConfig.openai?.api_key,
baseURL: process.env.OPENAI_BASE_URL || embeddingYamlConfig.openai?.base_url,
embedModel: embeddingYamlConfig.openai?.model,
},
});
}

switch (cli.command) {
case "context": {
const subcommand = cli.args[0];
Expand Down
24 changes: 23 additions & 1 deletion src/collections.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,25 @@ export interface Collection {
includeByDefault?: boolean; // Include in queries by default (default: true)
}

/**
 * Embedding provider configuration (optional in config file)
 *
 * Mirrors the optional `embedding:` section of the YAML config file.
 * Every field is optional; an absent section means "use local llama-cpp".
 */
export interface EmbeddingProviderConfig {
  provider?: 'local' | 'openai'; // Default: 'local'
  openai?: {
    api_key?: string; // Falls back to OPENAI_API_KEY env var
    base_url?: string; // Falls back to OPENAI_BASE_URL env var
    model?: string; // Default: 'text-embedding-3-small'
  };
}

/**
 * The complete configuration file structure
 *
 * Top-level shape of the YAML config file: named collections plus an
 * optional global context string and optional embedding-provider settings.
 */
export interface CollectionConfig {
  global_context?: string; // Context applied to all collections
  collections: Record<string, Collection>; // Collection name -> config
  embedding?: EmbeddingProviderConfig; // Optional embedding provider settings
}

/**
Expand Down Expand Up @@ -498,3 +511,12 @@ export function isValidCollectionName(name: string): boolean {
// Allow alphanumeric, hyphens, underscores
return /^[a-zA-Z0-9_-]+$/.test(name);
}

/**
 * Get embedding configuration from config file.
 *
 * @returns The `embedding` section of the loaded config if present,
 *          otherwise a default selecting the local llama-cpp provider.
 */
export function getEmbeddingConfig(): EmbeddingProviderConfig {
  const config = loadConfig();
  // `??` (not `||`): substitute the default only when the section is absent.
  return config.embedding ?? { provider: 'local' };
}
74 changes: 74 additions & 0 deletions src/llm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,16 @@ export interface LLM {
*/
embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;

/**
* Get embeddings for multiple texts in a batch
*/
embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>;

/**
* Get the model name used for embeddings
*/
getModelName(): string;

/**
* Generate text completion
*/
Expand Down Expand Up @@ -443,6 +453,13 @@ export class LlamaCpp implements LLM {
this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
}

/**
 * Get the model name used for embeddings.
 *
 * For the local llama-cpp provider this is the embed-model URI the
 * instance was configured with (stored in `this.embedModelUri`).
 */
getModelName(): string {
return this.embedModelUri;
}

/**
* Reset the inactivity timer. Called after each model operation.
* When timer fires, models are unloaded to free memory (if no active sessions).
Expand Down Expand Up @@ -1544,3 +1561,60 @@ export async function disposeDefaultLlamaCpp(): Promise<void> {
defaultLlamaCpp = null;
}
}

// =============================================================================
// OpenAI Embedding Support
// =============================================================================

// NOTE(review): mid-file import — consider hoisting to the top of llm.ts
// with the other imports for consistency.
import { OpenAIEmbedding, type OpenAIConfig } from "./openai-llm.js";

/**
 * Embedding provider configuration
 */
export type EmbeddingProvider = 'local' | 'openai';

export type EmbeddingConfig = {
provider: EmbeddingProvider;
openai?: OpenAIConfig; // only consulted when provider === 'openai'
};

// Default embedding config: use local llama-cpp
let embeddingConfig: EmbeddingConfig = { provider: 'local' };
// Lazily-created OpenAI client; reset whenever the config changes.
let openAIEmbedding: OpenAIEmbedding | null = null;

/**
 * Set the embedding configuration. Call before using embeddings.
 *
 * Drops any cached OpenAI client so the next embedding request rebuilds
 * it from the new settings.
 */
export function setEmbeddingConfig(config: EmbeddingConfig): void {
  // Invalidate the cached client first — it was built from the old config.
  openAIEmbedding = null;
  embeddingConfig = config;
}

/**
 * Get the current embedding configuration
 *
 * Returns the module-level config last installed via setEmbeddingConfig()
 * (defaults to the local llama-cpp provider).
 */
export function getEmbeddingConfig(): EmbeddingConfig {
return embeddingConfig;
}

/**
 * Check if using OpenAI for embeddings.
 *
 * @returns true when the active embedding config selects the
 *          OpenAI-compatible provider.
 */
export function isUsingOpenAI(): boolean {
  const { provider } = embeddingConfig;
  return provider === 'openai';
}

/**
 * Get the appropriate LLM for embeddings based on config.
 *
 * When the provider is 'openai', lazily constructs and caches an
 * OpenAIEmbedding client; otherwise returns the shared local LlamaCpp
 * instance.
 */
export function getDefaultEmbeddingLLM(): LLM {
  // Guard clause: anything other than 'openai' means local llama-cpp.
  if (embeddingConfig.provider !== 'openai') {
    return getDefaultLlamaCpp();
  }
  // Lazy singleton: build the client once per configuration
  // (setEmbeddingConfig resets the cache on config changes).
  openAIEmbedding ??= new OpenAIEmbedding(embeddingConfig.openai);
  return openAIEmbedding;
}
Loading