9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,15 @@

### Added

- **Remote model server** (`qmd serve`): HTTP server for embedding,
  reranking, and query expansion. Supports `local` (node-llama-cpp)
  and `ollama` (Ollama-compatible, e.g. rkllama on Rockchip NPU) backends.
- **Index endpoints** on `qmd serve`: `/status`, `/collections`,
`/search?q=X`, `/browse` for remote memory browsing. Enables
TinyAgentOS and other tools to access agent memory over HTTP
without direct SQLite access.
- **Batch embedding**: `POST /embed-batch` sends all chunks in one
rkllama API call, reducing HTTP overhead.
- AST-aware chunking for code files via `web-tree-sitter`. Supported
languages: TypeScript/JavaScript, Python, Go, and Rust. Code files
are chunked at function, class, and import boundaries instead of
75 changes: 73 additions & 2 deletions README.md
@@ -1,8 +1,15 @@
# QMD - Query Markup Documents
# QMD - Query Markup Documents (with Remote Model Server Support)

> **Fork note:** This fork adds `qmd serve` — a shared model server so multiple QMD clients
> (e.g. OpenClaw agents in separate LXC containers) can share a single set of embedding,
> reranking, and query expansion models over HTTP instead of each loading their own into RAM.
> See [Remote Model Server](#remote-model-server) below. Tracks upstream [tobi/qmd](https://github.com/tobi/qmd).
>
> Related upstream issues: [#489](https://github.com/tobi/qmd/issues/489), [#490](https://github.com/tobi/qmd/issues/490), [#502](https://github.com/tobi/qmd/issues/502), [#480](https://github.com/tobi/qmd/issues/480)

An on-device search engine for everything you need to remember. Index your markdown notes, meeting transcripts, documentation, and knowledge bases. Search with keywords or natural language. Ideal for your agentic flows.

QMD combines BM25 full-text search, vector semantic search, and LLM re-ranking—all running locally via node-llama-cpp with GGUF models.
QMD combines BM25 full-text search, vector semantic search, and LLM re-ranking—all running locally via node-llama-cpp with GGUF models. **This fork also supports remote model serving** for shared/multi-agent deployments.

![QMD Architecture](assets/qmd-architecture.png)

@@ -912,6 +919,70 @@ Uses node-llama-cpp's `createRankingContext()` and `rankAndSort()` API for cross

Used for generating query variations via `LlamaChatSession`.

## Remote Model Server

Share embedding, reranking, and query expansion models across multiple QMD clients over HTTP. Load models once, serve many clients.

### Problem

When running multiple QMD instances (e.g. agents in LXC containers, Docker, or separate machines), each loads its own copy of the embedding, reranker, and query expansion models into RAM. On memory-constrained devices, this is wasteful. On ARM64 or headless servers without GPU drivers, `node-llama-cpp` can't compile at all.

### Solution

Run `qmd serve` once on a host with GPU/NPU access, then point clients at it:

```sh
# On the host (loads models once)
qmd serve --port 7832
qmd serve --port 7832 --bind 0.0.0.0 # expose to network

# With an Ollama-compatible NPU backend (e.g. rkllama on RK3588)
qmd serve --backend ollama --backend-url http://localhost:8080

# On each client (no local models needed, no compilation)
QMD_SERVER=http://your-host:7832 qmd query "how does auth work"

# Or per-command
qmd query --server http://your-host:7832 "search terms"
```

### Server Endpoints

| Endpoint | Method | Description |
|----------|--------|-------------|
| `/embed` | POST | Embed a single text |
| `/embed-batch` | POST | Batch embed multiple texts |
| `/rerank` | POST | Rerank documents by relevance |
| `/expand` | POST | Expand a query (lex/vec/hyde) |
| `/tokenize` | POST | Count tokens in text |
| `/health` | GET | Server status + loaded models |
| `/status` | GET | Index health (doc counts, embedding status) |
| `/collections` | GET | List collections with doc counts |
| `/search?q=X` | GET | FTS5 keyword search (optional `&collection=`, `&limit=`) |
| `/browse` | GET | Paginated chunk listing (optional `&collection=`, `&limit=`, `&offset=`) |

The index endpoints (`/status`, `/collections`, `/search`, `/browse`) require a QMD database to be present. They return 503 if no database is loaded.
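As a quick illustration of how the GET query parameters compose (this helper is not code from the fork; the host and collection names are placeholders), a client can build `/search` URLs like so:

```typescript
// Sketch: composing an index-endpoint URL for a `qmd serve` host.
// Base URL, collection, and limit are illustrative placeholders.
function searchUrl(base: string, q: string, opts?: { collection?: string; limit?: number }): string {
  const u = new URL("/search", base);
  u.searchParams.set("q", q);
  if (opts?.collection) u.searchParams.set("collection", opts.collection);
  if (opts?.limit) u.searchParams.set("limit", String(opts.limit));
  return u.toString();
}

console.log(searchUrl("http://your-host:7832", "auth flow", { collection: "notes", limit: 5 }));
// → http://your-host:7832/search?q=auth+flow&collection=notes&limit=5
```

Note that `URLSearchParams` serialises spaces as `+`, which FTS5 keyword search accepts as a separator.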

### Agent/Container Integration

Set the `QMD_SERVER` environment variable so clients use the remote server automatically:

```bash
export QMD_SERVER=http://your-host:7832

# Or in a systemd service
Environment=QMD_SERVER=http://your-host:7832
```
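For containerised agents, the same variable can go in a compose file. This is a hedged sketch; the service name and image are placeholders, not something this fork ships:

```yaml
# Hypothetical compose fragment: point a containerised agent at a qmd serve
# instance running on the container host. Image name is a placeholder.
services:
  agent:
    image: your-agent-image
    environment:
      - QMD_SERVER=http://host.docker.internal:7832
    extra_hosts:
      - "host.docker.internal:host-gateway"   # Linux: map the host gateway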

### Use Cases

- **Multi-agent setups**: Multiple agents sharing one embedding server
- **LXC/Docker containers**: Agents in isolated containers accessing host-level GPU/NPU models
- **ARM64/headless servers**: No local GPU drivers needed — bypass `node-llama-cpp` compilation entirely
- **Low-memory devices**: ARM SBCs (Orange Pi, Raspberry Pi) where RAM is scarce
- **Full pipeline**: Unlike Ollama (embeddings only), `qmd serve` handles embed + rerank + query expansion

## License

MIT
50 changes: 48 additions & 2 deletions src/cli/qmd.ts
@@ -77,7 +77,9 @@ import {
type ReindexResult,
type ChunkStrategy,
} from "../store.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLLM, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import { RemoteLLM } from "../llm-remote.js";
import { startServer } from "../serve.js";
import {
formatSearchResults,
formatDocuments,
@@ -453,8 +455,12 @@ async function showStatus(): Promise<void> {
console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`);
}

// Device / GPU info
// Device / GPU info (skip if using remote server - no local GPU to report)
try {
if (process.env.QMD_SERVER) {
console.log(`\n${c.bold}Device${c.reset}`);
console.log(` Remote: ${c.green}${process.env.QMD_SERVER}${c.reset} (QMD_SERVER)`);
} else {
const llm = getDefaultLlamaCpp();
const device = await llm.getDeviceInfo();
console.log(`\n${c.bold}Device${c.reset}`);
@@ -479,6 +485,7 @@
console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
}
console.log(` CPU: ${device.cpuCores} math cores`);
} // close else block for non-remote
} catch {
// Don't fail status if LLM init fails
}
@@ -2416,6 +2423,12 @@ function parseCLI() {
http: { type: "boolean" },
daemon: { type: "boolean" },
port: { type: "string" },
// Remote model server options
server: { type: "string" }, // URL of qmd serve instance (e.g. http://host:7832)
bind: { type: "string" }, // Bind address for qmd serve (default: 0.0.0.0)
backend: { type: "string" }, // Backend for qmd serve: "local" or "ollama"
"backend-url": { type: "string" }, // URL of Ollama-compatible server
"rkllama-url": { type: "string" }, // Deprecated alias for --backend-url
},
allowPositionals: true,
strict: false, // Allow unknown options to pass through
@@ -2620,6 +2633,13 @@ function showHelp(): void {
console.log(" --max-batch-mb <n> - Cap UTF-8 MB loaded into memory per embedding batch");
console.log(" qmd cleanup - Clear caches, vacuum DB");
console.log("");
console.log("Model server (shared models over network):");
console.log(" qmd serve [--port 7832] [--bind 0.0.0.0] - Start model server (local backend)");
console.log(" qmd serve --backend ollama - Use Ollama-compatible server");
console.log(" qmd serve --backend ollama --backend-url http://host:11434");
console.log(" qmd query --server http://host:7832 <q> - Use remote models instead of local");
console.log(" QMD_SERVER=http://host:7832 qmd query <q> - Same via env var");
console.log("");
console.log("Query syntax (qmd query):");
console.log(" QMD queries are either a single expand query (no prefix) or a multi-line");
console.log(" document where every line is typed with lex:, vec:, or hyde:. This grammar");
@@ -2742,6 +2762,12 @@ if (isMain) {
process.exit(cli.values.help ? 0 : 1);
}

// Configure remote model server if --server is set or QMD_SERVER env var
const serverUrl = (cli.values.server as string) || process.env.QMD_SERVER;
if (serverUrl && cli.command !== "serve") {
setDefaultLLM(new RemoteLLM({ serverUrl }));
}

switch (cli.command) {
case "context": {
const subcommand = cli.args[0];
@@ -3063,6 +3089,26 @@
await querySearch(cli.query, cli.opts);
break;

case "serve": {
// Remove top-level cursor handlers so shutdown handlers work
process.removeAllListeners("SIGTERM");
process.removeAllListeners("SIGINT");
const servePort = Number(cli.values.port) || 7832;
const serveBind = (cli.values.bind as string) || "0.0.0.0";
const serveBackend = ((cli.values.backend as string) || process.env.QMD_SERVE_BACKEND || "local") as "local" | "ollama";
const backendUrl = (cli.values["backend-url"] as string) || (cli.values["rkllama-url"] as string) || process.env.RKLLAMA_URL || "http://localhost:11434";
await startServer({
port: servePort,
bind: serveBind,
backend: serveBackend,
backendUrl: serveBackend === "ollama" ? backendUrl : undefined,
config: {
embedModel: process.env.QMD_EMBED_MODEL || undefined,
},
});
break;
}

case "mcp": {
const sub = cli.args[0]; // stop | status | undefined

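The flag/env/default precedence in the `serve` case above can be sketched standalone; the helper names below are illustrative, not exports of this repo:

```typescript
// Sketch of the resolution order used by `qmd serve`:
// CLI flag wins, then the environment variable, then a built-in default.
type ServeBackend = "local" | "ollama";

function resolveBackend(flag?: string, env?: string): ServeBackend {
  return (flag || env || "local") as ServeBackend;
}

function resolveBackendUrl(flag?: string, deprecatedFlag?: string, env?: string): string {
  // --backend-url wins; --rkllama-url is kept only as a deprecated alias.
  return flag || deprecatedFlag || env || "http://localhost:11434";
}

console.log(resolveBackend(undefined, "ollama"));             // → ollama
console.log(resolveBackendUrl(undefined, "http://npu:8080")); // → http://npu:8080
```

Because `||` treats the empty string as falsy, an empty `QMD_SERVE_BACKEND` falls through to the default rather than producing an invalid backend name.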
152 changes: 152 additions & 0 deletions src/llm-remote.ts
@@ -0,0 +1,152 @@
/**
* llm-remote.ts - Remote LLM implementation for QMD
*
* Connects to a `qmd serve` instance over HTTP, implementing the same LLM
* interface as LlamaCpp but without loading any models locally.
*
* Usage:
* qmd query "search terms" --server http://192.168.6.123:7832
*/

import type {
LLM,
EmbedOptions,
EmbeddingResult,
GenerateOptions,
GenerateResult,
ModelInfo,
Queryable,
RerankDocument,
RerankOptions,
RerankResult,
} from "./llm.js";

// ---------------------------------------------------------------------------
// Config
// ---------------------------------------------------------------------------

export interface RemoteLLMConfig {
/** Base URL of the qmd serve instance, e.g. "http://192.168.6.123:7832" */
serverUrl: string;
/** Request timeout in ms (default: 300_000, i.e. 5 minutes; generous for CPU-only ARM SBCs) */
timeoutMs?: number;
}

// ---------------------------------------------------------------------------
// Implementation
// ---------------------------------------------------------------------------

export class RemoteLLM implements LLM {
private readonly baseUrl: string;
private readonly timeoutMs: number;

constructor(config: RemoteLLMConfig) {
// Normalise: strip trailing slash
this.baseUrl = config.serverUrl.replace(/\/+$/, "");
this.timeoutMs = config.timeoutMs ?? 300_000;
}

// ---- helpers ----------------------------------------------------------

private async post<T>(path: string, body: unknown): Promise<T> {
const url = `${this.baseUrl}${path}`;
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), this.timeoutMs);

try {
const res = await fetch(url, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify(body),
signal: controller.signal,
});

if (!res.ok) {
const text = await res.text().catch(() => "");
throw new Error(`qmd-server ${path} returned ${res.status}: ${text}`);
}

return (await res.json()) as T;
} finally {
clearTimeout(timer);
}
}

private async get<T>(path: string): Promise<T> {
const url = `${this.baseUrl}${path}`;
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), this.timeoutMs);

try {
const res = await fetch(url, { signal: controller.signal });
if (!res.ok) {
throw new Error(`qmd-server ${path} returned ${res.status}`);
}
return (await res.json()) as T;
} finally {
clearTimeout(timer);
}
}

// ---- LLM interface ----------------------------------------------------

async embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null> {
return this.post<EmbeddingResult | null>("/embed", { text, options });
}

async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
return this.post<(EmbeddingResult | null)[]>("/embed-batch", { texts });
}

async generate(_prompt: string, _options?: GenerateOptions): Promise<GenerateResult | null> {
// Generation is not exposed via serve (only used internally for query expansion)
// expandQuery handles this end-to-end
return null;
}

async modelExists(model: string): Promise<ModelInfo> {
try {
const health = await this.get<{ ok: boolean; models: Record<string, string> }>("/health");
const loaded = Object.values(health.models);
return {
name: model,
exists: loaded.some((m) => m.includes(model) || model.includes(m)),
};
} catch {
return { name: model, exists: false };
}
}

async expandQuery(
query: string,
options?: { context?: string; includeLexical?: boolean; intent?: string },
): Promise<Queryable[]> {
return this.post<Queryable[]>("/expand", { query, options });
}

async rerank(
query: string,
documents: RerankDocument[],
_options?: RerankOptions,
): Promise<RerankResult> {
return this.post<RerankResult>("/rerank", { query, documents });
}

/**
* Tokenize remotely - falls back to char-based estimate on failure.
*/
async tokenize(text: string): Promise<number[]> {
try {
const result = await this.post<{ tokens: number }>("/tokenize", { text });
// Return a dummy token array of the right length (actual IDs don't matter for chunking)
return new Array(result.tokens).fill(0);
} catch {
// Fallback: ~4 chars per token
return new Array(Math.ceil(text.length / 4)).fill(0);
}
}

async dispose(): Promise<void> {
// Nothing to dispose - we don't own the models
}
}
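Two details of `RemoteLLM` are easy to check in isolation: the trailing-slash normalisation in the constructor and the roughly-4-characters-per-token fallback in `tokenize()`. A standalone sketch (re-implemented here for illustration, not imported from the module):

```typescript
// Constructor behaviour: strip trailing slashes so `${base}${path}` composes
// into a clean URL regardless of how the user wrote --server.
const normaliseBase = (url: string): string => url.replace(/\/+$/, "");

// tokenize() fallback: estimate ~4 characters per token when the server
// is unreachable, so chunking can still make progress offline.
const estimateTokenCount = (text: string): number => Math.ceil(text.length / 4);

console.log(normaliseBase("http://192.168.6.123:7832///")); // → http://192.168.6.123:7832
console.log(estimateTokenCount("hello world"));             // 11 chars → 3
```

The 4-chars-per-token heuristic is coarse (it overcounts for CJK text and undercounts for code), but it only affects chunk sizing, not search correctness.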