Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@

## [Unreleased]

### Changes

- Remote embedding, reranking, and query expansion via OpenAI-compatible API
(vLLM, Ollama, OpenAI, etc.). Set `QMD_EMBED_API_URL` / `QMD_EMBED_API_MODEL`
(and optionally `QMD_RERANK_API_*` / `QMD_EXPAND_API_*`) env vars or add
the equivalent keys to `models:` in `index.yml`. Local generation and
tokenization are preserved via a hybrid routing layer. Includes circuit
breakers, dimension validation, and batch splitting.

### Fixes

- GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529
Expand Down
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -939,6 +939,41 @@ Uses node-llama-cpp's `createRankingContext()` and `rankAndSort()` API for cross

Used for generating query variations via `LlamaChatSession`.

### Remote Embedding & Reranking

QMD can offload embedding and reranking to a remote OpenAI-compatible server (vLLM, Ollama, LM Studio, OpenAI, etc.) while keeping generation and tokenization local; query expansion stays local by default but can also be offloaded via the `QMD_EXPAND_API_*` variables.

**Environment variables** (presence of `QMD_EMBED_API_URL` activates remote mode):

| Variable | Required | Description |
|----------|----------|-------------|
| `QMD_EMBED_API_URL` | Yes | Base URL, e.g. `http://gpu-host:8000/v1` |
| `QMD_EMBED_API_MODEL` | Yes | Model name, e.g. `BAAI/bge-m3` |
| `QMD_EMBED_API_KEY` | No | Bearer token for auth |
| `QMD_RERANK_API_URL` | No | Rerank endpoint (defaults to embed URL) |
| `QMD_RERANK_API_MODEL` | No | Rerank model name |
| `QMD_RERANK_API_KEY` | No | Rerank auth (defaults to embed key) |

**YAML config** (`~/.config/qmd/index.yml`):
```yaml
models:
embed_api_url: "http://gpu-host:8000/v1"
embed_api_model: "BAAI/bge-m3"
rerank_api_model: "BAAI/bge-reranker-v2-m3"
```

**Example with vLLM:**
```sh
# Start vLLM with an embedding model
vllm serve BAAI/bge-m3 --task embed

# Point QMD at it
export QMD_EMBED_API_URL=http://localhost:8000/v1
export QMD_EMBED_API_MODEL=BAAI/bge-m3
qmd embed
qmd query "your search query"
```

## License

MIT
28 changes: 25 additions & 3 deletions src/cli/qmd.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,9 @@ import {
type ReindexResult,
type ChunkStrategy,
} from "../store.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import { disposeDefaultLlamaCpp, getDefaultLLM, setDefaultLLM, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import { RemoteLLM, remoteConfigFromEnv } from "../remote-llm.js";
import { HybridLLM } from "../hybrid-llm.js";
import {
formatSearchResults,
formatDocuments,
Expand Down Expand Up @@ -121,11 +123,28 @@ function getStore(): ReturnType<typeof createStore> {
const config = loadConfig();
syncConfigToDb(store.db, config);
if (config.models) {
setDefaultLlamaCpp(new LlamaCpp({
const localLlm = new LlamaCpp({
embedModel: config.models.embed,
generateModel: config.models.generate,
rerankModel: config.models.rerank,
}));
});

// Check if remote embedding is configured (env vars take precedence over YAML)
const remoteConfig = remoteConfigFromEnv(config.models);
if (remoteConfig) {
const remoteLlm = new RemoteLLM(remoteConfig);
setDefaultLLM(new HybridLLM(remoteLlm, localLlm));
} else {
setDefaultLLM(localLlm);
}
} else {
// No YAML models config — still check env vars for remote embedding
const remoteConfig = remoteConfigFromEnv();
if (remoteConfig) {
const remoteLlm = new RemoteLLM(remoteConfig);
const localLlm = new LlamaCpp();
setDefaultLLM(new HybridLLM(remoteLlm, localLlm));
}
}
} catch {
// Config may not exist yet — that's fine, DB works without it
Expand Down Expand Up @@ -1681,6 +1700,9 @@ async function vectorIndex(
const storeInstance = getStore();
const db = storeInstance.db;

// Use the actual model name from the configured LLM (may be remote, not the default GGUF URI)
model = getDefaultLLM().embedModelName;

if (force) {
console.log(`${c.yellow}Force re-indexing: clearing all vectors...${c.reset}`);
}
Expand Down
12 changes: 12 additions & 0 deletions src/collections.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,18 @@ export interface ModelsConfig {
embed?: string;
rerank?: string;
generate?: string;
/** Remote embedding API base URL (e.g. http://gpu-host:8000/v1) */
embed_api_url?: string;
/** Remote embedding model name (e.g. BAAI/bge-m3) */
embed_api_model?: string;
/** Bearer token for remote embedding API */
embed_api_key?: string;
/** Remote rerank API base URL (defaults to embed_api_url) */
rerank_api_url?: string;
/** Remote rerank model name */
rerank_api_model?: string;
/** Bearer token for remote rerank API */
rerank_api_key?: string;
}

/**
Expand Down
70 changes: 70 additions & 0 deletions src/hybrid-llm.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/**
* hybrid-llm.ts - Compositor that routes LLM operations between remote and local backends
*
* Embed/rerank → remote (GPU-heavy, benefits from offloading)
* Generate → local LlamaCpp
* ExpandQuery → remote when expandApiModel is configured, otherwise local LlamaCpp
* tokenize/countTokens → local LlamaCpp (CPU-cheap, needed for chunking)
*/

import type {
LLM,
EmbedOptions,
EmbeddingResult,
GenerateOptions,
GenerateResult,
ModelInfo,
Queryable,
RerankDocument,
RerankOptions,
RerankResult,
} from "./llm.js";
import { RemoteLLM } from "./remote-llm.js";

export class HybridLLM implements LLM {
constructor(
private readonly remote: LLM,
private readonly local: LLM,
) {}

get embedModelName(): string {
return this.remote.embedModelName;
}

// Route to remote
embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null> {
return this.remote.embed(text, options);
}

embedBatch(texts: string[], options?: EmbedOptions): Promise<(EmbeddingResult | null)[]> {
return this.remote.embedBatch(texts, options);
}

rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult> {
return this.remote.rerank(query, documents, options);
}

// Route to local
generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null> {
return this.local.generate(prompt, options);
}

/**
* Route expandQuery to remote when the remote backend supports it
* (i.e., RemoteLLM with expandApiModel configured), otherwise fall back to local.
*/
expandQuery(query: string, options?: { context?: string; includeLexical?: boolean; intent?: string }): Promise<Queryable[]> {
if (this.remote instanceof RemoteLLM && this.remote.supportsExpand) {
return this.remote.expandQuery(query, options);
}
return this.local.expandQuery(query, options);
}

modelExists(model: string): Promise<ModelInfo> {
return this.local.modelExists(model);
}

async dispose(): Promise<void> {
await Promise.all([this.remote.dispose(), this.local.dispose()]);
}
}
Loading