22 changes: 18 additions & 4 deletions src/cli/qmd.ts
@@ -2183,7 +2183,7 @@ async function vectorSearch(query: string, opts: OutputOptions, _model: string =

checkIndexHealth(store.db);

await withLLMSession(async () => {
const runSearch = async () => {
let results = await vectorSearchQuery(store, query, {
collection: singleCollection,
limit: opts.all ? 500 : (opts.limit || 10),
@@ -2221,7 +2221,14 @@ async function vectorSearch(query: string, opts: OutputOptions, _model: string =
context: r.context,
docid: r.docid,
})), query, { ...opts, limit: results.length });
}, { maxDuration: 10 * 60 * 1000, name: 'vectorSearch' });
};

// Skip local LLM session when using remote Ollama for embeddings
if (process.env.OLLAMA_EMBED_URL) {
await runSearch();
} else {
await withLLMSession(runSearch, { maxDuration: 10 * 60 * 1000, name: 'vectorSearch' });
}
}

async function querySearch(query: string, opts: OutputOptions, _embedModel: string = DEFAULT_EMBED_MODEL, _rerankModel: string = DEFAULT_RERANK_MODEL): Promise<void> {
@@ -2239,7 +2246,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
// Intent can come from --intent flag or from intent: line in query document
const intent = opts.intent || parsed?.intent;

await withLLMSession(async () => {
const runQuery = async () => {
let results;

if (parsed) {
@@ -2359,7 +2366,14 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
docid: r.docid,
explain: r.explain,
})), displayQuery, { ...opts, limit: results.length });
}, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
};

// Skip local LLM session when using remote Ollama for embeddings
if (process.env.OLLAMA_EMBED_URL) {
await runQuery();
} else {
await withLLMSession(runQuery, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
}
}

// Parse CLI arguments using util.parseArgs
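Both call sites now apply the same gate: when OLLAMA_EMBED_URL is set, the search callback runs directly instead of inside a local llama.cpp session. A minimal sketch of that shared pattern pulled into one helper (not part of this diff; it assumes withLLMSession keeps the (fn, { maxDuration, name }) signature used above):

// Hypothetical helper, shown only to illustrate the dispatch both functions repeat.
async function maybeWithLLMSession(
  fn: () => Promise<void>,
  opts: { maxDuration: number; name: string },
): Promise<void> {
  if (process.env.OLLAMA_EMBED_URL) {
    // Remote embeddings need no local model session.
    await fn();
    return;
  }
  await withLLMSession(fn, opts);
}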
118 changes: 115 additions & 3 deletions src/store.ts
@@ -39,6 +39,42 @@ import type {
// =============================================================================

const HOME = process.env.HOME || "/tmp";

// Remote Ollama embedding support — when OLLAMA_EMBED_URL is set, all embedding
// and tokenization operations use the remote Ollama HTTP API instead of
// node-llama-cpp. This enables QMD on platforms without local GPU/Vulkan
// (ARM64 VPS, Docker, CI) and with remote Ollama instances (Tailscale, LAN).
const OLLAMA_EMBED_URL = process.env.OLLAMA_EMBED_URL;
const OLLAMA_EMBED_MODEL = process.env.OLLAMA_EMBED_MODEL || "nomic-embed-text";

interface OllamaEmbedResult {
embedding: number[];
model: string;
}

async function ollamaEmbed(text: string): Promise<OllamaEmbedResult> {
const res = await fetch(`${OLLAMA_EMBED_URL}/api/embed`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ model: OLLAMA_EMBED_MODEL, input: text }),
});
if (!res.ok) throw new Error(`Ollama embed failed: ${res.status} ${await res.text()}`);
const data = await res.json() as { embeddings: number[][] };
const embedding = data.embeddings[0];
if (!embedding) throw new Error('Ollama returned empty embeddings array');
return { embedding, model: OLLAMA_EMBED_MODEL };
}

async function ollamaEmbedBatch(texts: string[]): Promise<OllamaEmbedResult[]> {
const res = await fetch(`${OLLAMA_EMBED_URL}/api/embed`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ model: OLLAMA_EMBED_MODEL, input: texts }),
});
if (!res.ok) throw new Error(`Ollama embed batch failed: ${res.status} ${await res.text()}`);
const data = await res.json() as { embeddings: number[][] };
return data.embeddings.map(e => ({ embedding: e, model: OLLAMA_EMBED_MODEL }));
}
export const DEFAULT_EMBED_MODEL = "embeddinggemma";
export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
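Both helpers wrap Ollama's /api/embed endpoint, which takes a model name plus a string or array input and answers with an embeddings array of float vectors, as the parsing above assumes. A usage sketch with an illustrative exchange (values made up; dimensionality depends on the configured model):

// Illustrative request/response for POST {OLLAMA_EMBED_URL}/api/embed:
//   body:     { "model": "nomic-embed-text", "input": ["alpha", "beta"] }
//   response: { "model": "nomic-embed-text", "embeddings": [[0.012, -0.087, ...], [0.044, 0.003, ...]] }
const results = await ollamaEmbedBatch(["alpha", "beta"]);
console.log(results.length);               // 2, one result per input
console.log(results[0]!.embedding.length); // vector dimensionality for the configured model

Note that OLLAMA_EMBED_URL and OLLAMA_EMBED_MODEL are read once at module scope, so they must be present in the environment before this module is loaded; assigning to process.env afterwards does not change these constants.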
@@ -1407,6 +1443,67 @@ export async function generateEmbeddings(
const totalDocs = docsToEmbed.length;
const startTime = Date.now();

// Remote Ollama mode: bypass local LLM entirely
if (OLLAMA_EMBED_URL) {
let chunksEmbedded = 0;
let errors = 0;
let bytesProcessed = 0;
let totalChunks = 0;
let vectorTableInitialized = false;
const BATCH_SIZE = 32;
const batches = buildEmbeddingBatches(docsToEmbed, maxDocsPerBatch, maxBatchBytes);

for (const batchMeta of batches) {
const batchDocs = getEmbeddingDocsForBatch(db, batchMeta);
const batchChunks: ChunkItem[] = [];
const batchBytes = batchMeta.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);

for (const doc of batchDocs) {
if (!doc.body.trim()) continue;
const title = extractTitle(doc.body, doc.path);
const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, options?.chunkStrategy);
for (let seq = 0; seq < chunks.length; seq++) {
batchChunks.push({ hash: doc.hash, title, text: chunks[seq]!.text, seq, pos: chunks[seq]!.pos, tokens: chunks[seq]!.tokens, bytes: encoder.encode(chunks[seq]!.text).length });
}
}

totalChunks += batchChunks.length;
if (batchChunks.length === 0) { bytesProcessed += batchBytes; options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors }); continue; }

if (!vectorTableInitialized) {
const firstResult = await ollamaEmbed(batchChunks[0]!.text);
store.ensureVecTable(firstResult.embedding.length);
vectorTableInitialized = true;
}

for (let batchStart = 0; batchStart < batchChunks.length; batchStart += BATCH_SIZE) {
const batchEnd = Math.min(batchStart + BATCH_SIZE, batchChunks.length);
const chunkBatch = batchChunks.slice(batchStart, batchEnd);
const texts = chunkBatch.map(chunk => chunk.text);
try {
const embeddings = await ollamaEmbedBatch(texts);
for (let i = 0; i < chunkBatch.length; i++) {
const chunk = chunkBatch[i]!;
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embeddings[i]!.embedding), model, now);
chunksEmbedded++;
}
} catch {
for (const chunk of chunkBatch) {
try {
const result = await ollamaEmbed(chunk.text);
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
chunksEmbedded++;
} catch { errors++; }
}
}
options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed: bytesProcessed + batchBytes, totalBytes, errors });
}
bytesProcessed += batchBytes;
}

return { docsProcessed: totalDocs, chunksEmbedded, errors, durationMs: Date.now() - startTime };
}

// Use store's LlamaCpp or global singleton, wrapped in a session
const llm = getLlm(store);
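
In the remote branch above, a failed batch call falls back to embedding chunks one at a time, so a single oversized or malformed chunk costs only itself rather than its whole batch of 32. The same pattern in isolation, as a sketch independent of the store internals (helper and parameter names are illustrative):

async function embedWithFallback(
  texts: string[],
  embedBatch: (xs: string[]) => Promise<number[][]>,
  embedOne: (x: string) => Promise<number[]>,
): Promise<(number[] | null)[]> {
  try {
    return await embedBatch(texts);
  } catch {
    // Batch failed: retry each text alone; null marks the ones that still fail.
    return Promise.all(texts.map(t => embedOne(t).catch(() => null)));
  }
}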

@@ -2201,15 +2298,20 @@ export async function chunkDocumentByTokens(
chunkStrategy: ChunkStrategy = "regex",
signal?: AbortSignal
): Promise<{ text: string; pos: number; tokens: number }[]> {
const llm = getDefaultLlamaCpp();

// Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
// If chunks exceed limit, they'll be re-split with actual ratio
const avgCharsPerToken = 3;
const maxChars = maxTokens * avgCharsPerToken;
const overlapChars = overlapTokens * avgCharsPerToken;
const windowChars = windowTokens * avgCharsPerToken;

// Remote Ollama mode: skip local tokenizer, use char-based chunking
if (OLLAMA_EMBED_URL) {
const charChunks = await chunkDocumentAsync(content, maxChars, overlapChars, windowChars, filepath, chunkStrategy);
return charChunks.map(c => ({ text: c.text, pos: c.pos, tokens: Math.ceil(c.text.length / avgCharsPerToken) }));
}

const llm = getDefaultLlamaCpp();

// Chunk in character space with conservative estimate
// Use AST-aware chunking for the first pass when filepath/strategy provided
let charChunks = await chunkDocumentAsync(content, maxChars, overlapChars, windowChars, filepath, chunkStrategy);
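
In remote mode the token counts attached to chunks are estimates rather than tokenizer output: with avgCharsPerToken = 3, token budgets become character budgets and reported counts are derived from length. A worked example (the 512-token limit is an illustrative value, not necessarily the project default):

// maxTokens = 512  ->  maxChars = 512 * 3 = 1536 characters per chunk
// a 1500-character chunk is then reported as Math.ceil(1500 / 3) = 500 tokens
const estimateTokens = (text: string, avgCharsPerToken = 3) =>
  Math.ceil(text.length / avgCharsPerToken);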
@@ -3078,6 +3180,11 @@ export async function searchVec(db: Database, query: string, model: string, limi
// =============================================================================

async function getEmbedding(text: string, model: string, isQuery: boolean, session?: ILLMSession, llmOverride?: LlamaCpp): Promise<number[] | null> {
// Remote Ollama mode: bypass local LLM entirely
if (OLLAMA_EMBED_URL && !session && !llmOverride) {
const result = await ollamaEmbed(text);
return result.embedding;
}
// Format text using the appropriate prompt template
const formattedText = isQuery ? formatQueryForEmbedding(text, model) : formatDocForEmbedding(text, undefined, model);
const result = session
Expand Down Expand Up @@ -3147,6 +3254,11 @@ export function insertEmbedding(
// =============================================================================

export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise<ExpandedQuery[]> {
// Remote Ollama mode: skip LLM-based HYDE query expansion (no local model)
if (OLLAMA_EMBED_URL && !llmOverride) {
return [{ type: 'vec' as const, query }];
}

// Check cache first — stored as JSON preserving types
const cacheKey = getCacheKey("expandQuery", { query, model, ...(intent && { intent }) });
const cached = getCachedResult(db, cacheKey);
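With OLLAMA_EMBED_URL set and no llmOverride, query expansion degrades gracefully: instead of asking a local model for HYDE variants, the raw query is returned as the only vector query. A sketch of the resulting behaviour (the query string is illustrative):

// Remote mode, no llmOverride: no HYDE expansion, just the original query.
const expanded = await expandQuery("how do I rotate logs?", DEFAULT_QUERY_MODEL, db);
// expanded deep-equals [{ type: 'vec', query: 'how do I rotate logs?' }]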