diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e56c278..a6b9b91f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,15 @@ ### Added +- **Remote model server** (`qmd serve`): HTTP server for embedding, + reranking, and query expansion. Supports `local` (node-llama-cpp) + and `rkllama` (Rockchip NPU) backends. +- **Index endpoints** on `qmd serve`: `/status`, `/collections`, + `/search?q=X`, `/browse` for remote memory browsing. Enables + TinyAgentOS and other tools to access agent memory over HTTP + without direct SQLite access. +- **Batch embedding**: `POST /embed-batch` sends all chunks in one + rkllama API call, reducing HTTP overhead. - AST-aware chunking for code files via `web-tree-sitter`. Supported languages: TypeScript/JavaScript, Python, Go, and Rust. Code files are chunked at function, class, and import boundaries instead of diff --git a/README.md b/README.md index 6206e32b..a308b7d0 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,15 @@ -# QMD - Query Markup Documents +# QMD - Query Markup Documents (with Remote Model Server Support) + +> **Fork note:** This fork adds `qmd serve` — a shared model server so multiple QMD clients +> (e.g. OpenClaw agents in separate LXC containers) can share a single set of embedding, +> reranking, and query expansion models over HTTP instead of each loading their own into RAM. +> See [Remote Model Server](#remote-model-server) below. Tracks upstream [tobi/qmd](https://github.com/tobi/qmd). +> +> Related upstream issues: [#489](https://github.com/tobi/qmd/issues/489), [#490](https://github.com/tobi/qmd/issues/490), [#502](https://github.com/tobi/qmd/issues/502), [#480](https://github.com/tobi/qmd/issues/480) An on-device search engine for everything you need to remember. Index your markdown notes, meeting transcripts, documentation, and knowledge bases. Search with keywords or natural language. Ideal for your agentic flows. 
-QMD combines BM25 full-text search, vector semantic search, and LLM re-ranking—all running locally via node-llama-cpp with GGUF models. +QMD combines BM25 full-text search, vector semantic search, and LLM re-ranking—all running locally via node-llama-cpp with GGUF models. **This fork also supports remote model serving** for shared/multi-agent deployments. ![QMD Architecture](assets/qmd-architecture.png) @@ -912,6 +919,70 @@ Uses node-llama-cpp's `createRankingContext()` and `rankAndSort()` API for cross Used for generating query variations via `LlamaChatSession`. +## Remote Model Server + +Share embedding, reranking, and query expansion models across multiple QMD clients over HTTP. Load models once, serve many clients. + +### Problem + +When running multiple QMD instances (e.g. agents in LXC containers, Docker, or separate machines), each loads its own copy of the embedding, reranker, and query expansion models into RAM. On memory-constrained devices, this is wasteful. On ARM64 or headless servers without GPU drivers, `node-llama-cpp` can't compile at all. 
+ +### Solution + +Run `qmd serve` once on a host with GPU/NPU access, then point clients at it: + +```sh +# On the host (loads models once) +qmd serve --port 7832 +qmd serve --port 7832 --bind 0.0.0.0 # expose to network + +# With rkllama NPU backend (RK3588) +qmd serve --backend ollama --backend-url http://localhost:8080 + +# On each client (no local models needed, no compilation) +QMD_SERVER=http://your-host:7832 qmd query "how does auth work" + +# Or per-command +qmd query --server http://your-host:7832 "search terms" +``` + +### Server Endpoints + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/embed` | POST | Embed a single text | +| `/embed-batch` | POST | Batch embed multiple texts | +| `/rerank` | POST | Rerank documents by relevance | +| `/expand` | POST | Expand a query (lex/vec/hyde) | +| `/tokenize` | POST | Count tokens in text | +| `/health` | GET | Server status + loaded models | +| `/status` | GET | Index health (doc counts, embedding status) | +| `/collections` | GET | List collections with doc counts | +| `/search?q=X` | GET | FTS5 keyword search (optional `&collection=`, `&limit=`) | +| `/browse` | GET | Paginated chunk listing (optional `&collection=`, `&limit=`, `&offset=`) | + +The index endpoints (`/status`, `/collections`, `/search`, `/browse`) require a QMD database to be present. They return 503 if no database is loaded. 
+ +### Agent/Container Integration + +Set the `QMD_SERVER` environment variable so clients use the remote server automatically: + +```bash +export QMD_SERVER=http://your-host:7832 + +# Or in a systemd service +Environment=QMD_SERVER=http://your-host:7832 +``` + +### Use Cases + +- **Multi-agent setups**: Multiple agents sharing one embedding server +- **LXC/Docker containers**: Agents in isolated containers accessing host-level GPU/NPU models +- **ARM64/headless servers**: No local GPU drivers needed — bypass `node-llama-cpp` compilation entirely +- **Low-memory devices**: ARM SBCs (Orange Pi, Raspberry Pi) where RAM is scarce +- **Full pipeline**: Unlike Ollama (embeddings only), `qmd serve` handles embed + rerank + query expansion + + ## License MIT diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts index 7216965a..b7fe1a26 100755 --- a/src/cli/qmd.ts +++ b/src/cli/qmd.ts @@ -77,7 +77,9 @@ import { type ReindexResult, type ChunkStrategy, } from "../store.js"; -import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js"; +import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLLM, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js"; +import { RemoteLLM } from "../llm-remote.js"; +import { startServer } from "../serve.js"; import { formatSearchResults, formatDocuments, @@ -453,8 +455,12 @@ async function showStatus(): Promise { console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`); } - // Device / GPU info + // Device / GPU info (skip if using remote server - no local GPU to report) try { + if (process.env.QMD_SERVER) { + console.log(`\n${c.bold}Device${c.reset}`); + console.log(` Remote: ${c.green}${process.env.QMD_SERVER}${c.reset} (QMD_SERVER)`); + } else { const llm = getDefaultLlamaCpp(); const device = 
await llm.getDeviceInfo(); console.log(`\n${c.bold}Device${c.reset}`); @@ -479,6 +485,7 @@ async function showStatus(): Promise { console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`); } console.log(` CPU: ${device.cpuCores} math cores`); + } // close else block for non-remote } catch { // Don't fail status if LLM init fails } @@ -2416,6 +2423,12 @@ function parseCLI() { http: { type: "boolean" }, daemon: { type: "boolean" }, port: { type: "string" }, + // Remote model server options + server: { type: "string" }, // URL of qmd serve instance (e.g. http://host:7832) + bind: { type: "string" }, // Bind address for qmd serve (default: 0.0.0.0) + backend: { type: "string" }, // Backend for qmd serve: "local" or "ollama" + "backend-url": { type: "string" }, // URL of Ollama-compatible server + "rkllama-url": { type: "string" }, // Deprecated alias for --backend-url }, allowPositionals: true, strict: false, // Allow unknown options to pass through @@ -2620,6 +2633,13 @@ function showHelp(): void { console.log(" --max-batch-mb - Cap UTF-8 MB loaded into memory per embedding batch"); console.log(" qmd cleanup - Clear caches, vacuum DB"); console.log(""); + console.log("Model server (shared models over network):"); + console.log(" qmd serve [--port 7832] [--bind 0.0.0.0] - Start model server (local backend)"); + console.log(" qmd serve --backend ollama - Use Ollama-compatible server"); + console.log(" qmd serve --backend ollama --backend-url http://host:11434"); + console.log(" qmd query --server http://host:7832 - Use remote models instead of local"); + console.log(" QMD_SERVER=http://host:7832 qmd query - Same via env var"); + console.log(""); console.log("Query syntax (qmd query):"); console.log(" QMD queries are either a single expand query (no prefix) or a multi-line"); console.log(" document where every line is typed with lex:, vec:, or hyde:. 
This grammar"); @@ -2742,6 +2762,12 @@ if (isMain) { process.exit(cli.values.help ? 0 : 1); } + // Configure remote model server if --server is set or QMD_SERVER env var + const serverUrl = (cli.values.server as string) || process.env.QMD_SERVER; + if (serverUrl && cli.command !== "serve") { + setDefaultLLM(new RemoteLLM({ serverUrl })); + } + switch (cli.command) { case "context": { const subcommand = cli.args[0]; @@ -3063,6 +3089,26 @@ if (isMain) { await querySearch(cli.query, cli.opts); break; + case "serve": { + // Remove top-level cursor handlers so shutdown handlers work + process.removeAllListeners("SIGTERM"); + process.removeAllListeners("SIGINT"); + const servePort = Number(cli.values.port) || 7832; + const serveBind = (cli.values.bind as string) || "0.0.0.0"; + const serveBackend = ((cli.values.backend as string) || process.env.QMD_SERVE_BACKEND || "local") as "local" | "ollama"; + const backendUrl = (cli.values["backend-url"] as string) || (cli.values["rkllama-url"] as string) || process.env.RKLLAMA_URL || "http://localhost:11434"; + await startServer({ + port: servePort, + bind: serveBind, + backend: serveBackend, + backendUrl: serveBackend === "ollama" ? backendUrl : undefined, + config: { + embedModel: process.env.QMD_EMBED_MODEL || undefined, + }, + }); + break; + } + case "mcp": { const sub = cli.args[0]; // stop | status | undefined diff --git a/src/llm-remote.ts b/src/llm-remote.ts new file mode 100644 index 00000000..c06e3b3b --- /dev/null +++ b/src/llm-remote.ts @@ -0,0 +1,152 @@ +/** + * llm-remote.ts - Remote LLM implementation for QMD + * + * Connects to a `qmd serve` instance over HTTP, implementing the same LLM + * interface as LlamaCpp but without loading any models locally. 
+ * + * Usage: + * qmd query "search terms" --server http://192.168.6.123:7832 + */ + +import type { + LLM, + EmbedOptions, + EmbeddingResult, + GenerateOptions, + GenerateResult, + ModelInfo, + Queryable, + RerankDocument, + RerankOptions, + RerankResult, +} from "./llm.js"; + +// --------------------------------------------------------------------------- +// Config +// --------------------------------------------------------------------------- + +export interface RemoteLLMConfig { + /** Base URL of the qmd serve instance, e.g. "http://192.168.6.123:7832" */ + serverUrl: string; + /** Request timeout in ms (default: 300 000 — 5 minutes, generous for CPU-only ARM SBCs) */ + timeoutMs?: number; +} + +// --------------------------------------------------------------------------- +// Implementation +// --------------------------------------------------------------------------- + +export class RemoteLLM implements LLM { + private readonly baseUrl: string; + private readonly timeoutMs: number; + + constructor(config: RemoteLLMConfig) { + // Normalise: strip trailing slash + this.baseUrl = config.serverUrl.replace(/\/+$/, ""); + this.timeoutMs = config.timeoutMs ?? 
300_000; + } + + // ---- helpers ---------------------------------------------------------- + + private async post(path: string, body: unknown): Promise { + const url = `${this.baseUrl}${path}`; + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), this.timeoutMs); + + try { + const res = await fetch(url, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(body), + signal: controller.signal, + }); + + if (!res.ok) { + const text = await res.text().catch(() => ""); + throw new Error(`qmd-server ${path} returned ${res.status}: ${text}`); + } + + return (await res.json()) as T; + } finally { + clearTimeout(timer); + } + } + + private async get(path: string): Promise { + const url = `${this.baseUrl}${path}`; + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), this.timeoutMs); + + try { + const res = await fetch(url, { signal: controller.signal }); + if (!res.ok) { + throw new Error(`qmd-server ${path} returned ${res.status}`); + } + return (await res.json()) as T; + } finally { + clearTimeout(timer); + } + } + + // ---- LLM interface ---------------------------------------------------- + + async embed(text: string, options?: EmbedOptions): Promise { + return this.post("/embed", { text, options }); + } + + async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> { + return this.post<(EmbeddingResult | null)[]>("/embed-batch", { texts }); + } + + async generate(_prompt: string, _options?: GenerateOptions): Promise { + // Generation is not exposed via serve (only used internally for query expansion) + // expandQuery handles this end-to-end + return null; + } + + async modelExists(model: string): Promise { + try { + const health = await this.get<{ ok: boolean; models: Record }>("/health"); + const loaded = Object.values(health.models); + return { + name: model, + exists: loaded.some((m) => m.includes(model) || model.includes(m)), + 
}; + } catch { + return { name: model, exists: false }; + } + } + + async expandQuery( + query: string, + options?: { context?: string; includeLexical?: boolean; intent?: string }, + ): Promise { + return this.post("/expand", { query, options }); + } + + async rerank( + query: string, + documents: RerankDocument[], + _options?: RerankOptions, + ): Promise { + return this.post("/rerank", { query, documents }); + } + + /** + * Tokenize remotely - falls back to char-based estimate on failure. + */ + async tokenize(text: string): Promise { + try { + const result = await this.post<{ tokens: number }>("/tokenize", { text }); + // Return a dummy token array of the right length (actual IDs don't matter for chunking) + return new Array(result.tokens).fill(0); + } catch { + // Fallback: ~4 chars per token + return new Array(Math.ceil(text.length / 4)).fill(0); + } + } + + async dispose(): Promise { + // Nothing to dispose - we don't own the models + } +} diff --git a/src/llm.ts b/src/llm.ts index bd276bbf..85c1e68e 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -156,7 +156,7 @@ export type LLMSessionOptions = { export interface ILLMSession { embed(text: string, options?: EmbedOptions): Promise; embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>; - expandQuery(query: string, options?: { context?: string; includeLexical?: boolean }): Promise; + expandQuery(query: string, options?: { context?: string; includeLexical?: boolean; intent?: string }): Promise; rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise; /** Whether this session is still valid (not released or aborted) */ readonly isValid: boolean; @@ -319,6 +319,11 @@ export interface LLM { */ embed(text: string, options?: EmbedOptions): Promise; + /** + * Batch embed multiple texts efficiently + */ + embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>; + /** * Generate text completion */ @@ -333,7 +338,7 @@ export interface LLM { * Expand a search query into multiple 
variations for different backends. * Returns a list of Queryable objects. */ - expandQuery(query: string, options?: { context?: string, includeLexical?: boolean }): Promise; + expandQuery(query: string, options?: { context?: string, includeLexical?: boolean, intent?: string }): Promise; /** * Rerank documents by relevance to a query @@ -341,6 +346,12 @@ export interface LLM { */ rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise; + /** + * Tokenize text using the embedding model's tokenizer. + * Returns an array of token IDs (or dummy array of correct length for remote). + */ + tokenize(text: string): Promise; + /** * Dispose of resources */ @@ -1285,11 +1296,11 @@ export class LlamaCpp implements LLM { * Coordinates with LlamaCpp idle timeout to prevent disposal during active sessions. */ class LLMSessionManager { - private llm: LlamaCpp; + private llm: LLM; private _activeSessionCount = 0; private _inFlightOperations = 0; - constructor(llm: LlamaCpp) { + constructor(llm: LLM) { this.llm = llm; } @@ -1325,7 +1336,7 @@ class LLMSessionManager { this._inFlightOperations = Math.max(0, this._inFlightOperations - 1); } - getLlamaCpp(): LlamaCpp { + getLLM(): LLM { return this.llm; } } @@ -1368,7 +1379,7 @@ class LLMSession implements ILLMSession { } // Set up max duration timer - const maxDuration = options.maxDuration ?? 10 * 60 * 1000; // Default 10 minutes + const maxDuration = options.maxDuration ?? 
60 * 60 * 1000; // Default 60 minutes (generous for SBC batch operations) if (maxDuration > 0) { this.maxDurationTimer = setTimeout(() => { this.abortController.abort(new Error(`Session "${this.name}" exceeded max duration of ${maxDuration}ms`)); @@ -1428,18 +1439,18 @@ class LLMSession implements ILLMSession { } async embed(text: string, options?: EmbedOptions): Promise { - return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options)); + return this.withOperation(() => this.manager.getLLM().embed(text, options)); } async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> { - return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts)); + return this.withOperation(() => this.manager.getLLM().embedBatch(texts)); } async expandQuery( query: string, - options?: { context?: string; includeLexical?: boolean } + options?: { context?: string; includeLexical?: boolean; intent?: string } ): Promise { - return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options)); + return this.withOperation(() => this.manager.getLLM().expandQuery(query, options)); } async rerank( @@ -1447,7 +1458,7 @@ class LLMSession implements ILLMSession { documents: RerankDocument[], options?: RerankOptions ): Promise { - return this.withOperation(() => this.manager.getLlamaCpp().rerank(query, documents, options)); + return this.withOperation(() => this.manager.getLLM().rerank(query, documents, options)); } } @@ -1459,7 +1470,7 @@ let defaultSessionManager: LLMSessionManager | null = null; */ function getSessionManager(): LLMSessionManager { const llm = getDefaultLlamaCpp(); - if (!defaultSessionManager || defaultSessionManager.getLlamaCpp() !== llm) { + if (!defaultSessionManager || defaultSessionManager.getLLM() !== llm) { defaultSessionManager = new LLMSessionManager(llm); } return defaultSessionManager; @@ -1498,7 +1509,7 @@ export async function withLLMSession( * Unlike withLLMSession, this does not use the global 
singleton. */ export async function withLLMSessionForLlm( - llm: LlamaCpp, + llm: LLM, fn: (session: ILLMSession) => Promise, options?: LLMSessionOptions ): Promise { @@ -1526,6 +1537,37 @@ export function canUnloadLLM(): boolean { // ============================================================================= let defaultLlamaCpp: LlamaCpp | null = null; +let defaultLLM: LLM | null = null; + +/** + * Get the default LLM instance. + * If a remote server URL is configured (via QMD_SERVER or setDefaultLLM), + * returns a RemoteLLM; otherwise returns the local LlamaCpp instance. + */ +export function getDefaultLLM(): LLM { + if (defaultLLM) return defaultLLM; + // Auto-detect QMD_SERVER env var as a safety net — + // the CLI sets defaultLLM explicitly, but SDK consumers and + // internal callers (e.g. chunkDocumentByTokens) may reach here + // without the CLI having run setDefaultLLM first. + const serverUrl = process.env.QMD_SERVER; + if (serverUrl) { + // Lazy import to avoid circular dependency at module load time + const { RemoteLLM } = require("./llm-remote.js"); + const remote: LLM = new RemoteLLM({ serverUrl }); + defaultLLM = remote; + return remote; + } + return getDefaultLlamaCpp(); +} + +/** + * Set a custom default LLM instance (remote or local). + * When set, getDefaultLLM() will return this instead of the local LlamaCpp. + */ +export function setDefaultLLM(llm: LLM | null): void { + defaultLLM = llm as LLM; +} /** * Get the default LlamaCpp instance (creates one if needed) @@ -1550,6 +1592,10 @@ export function setDefaultLlamaCpp(llm: LlamaCpp | null): void { * Call this before process exit to prevent NAPI crashes. 
*/ export async function disposeDefaultLlamaCpp(): Promise { + if (defaultLLM) { + await defaultLLM.dispose(); + defaultLLM = null; + } if (defaultLlamaCpp) { await defaultLlamaCpp.dispose(); defaultLlamaCpp = null; diff --git a/src/serve.ts b/src/serve.ts new file mode 100644 index 00000000..c2e45885 --- /dev/null +++ b/src/serve.ts @@ -0,0 +1,611 @@ +/** + * serve.ts - QMD model server + * + * Runs a lightweight HTTP server that exposes embedding, reranking, and query + * expansion via a JSON API. Designed to be started once on a host that has + * enough RAM/GPU for the GGUF models, so that multiple QMD clients (e.g. in + * LXC containers) can share the same loaded models over the network. + * + * Supports two backends: + * - "local" (default) — loads GGUF models via node-llama-cpp (CPU/Vulkan) + * - "ollama" — proxies to an Ollama-compatible server (ollama, rkllama, etc.) + * + * Usage: + * qmd serve [--port 7832] [--bind 0.0.0.0] + * qmd serve --backend ollama [--backend-url http://localhost:11434] + * + * Endpoints: + * POST /embed { text: string, options?: EmbedOptions } -> EmbeddingResult + * POST /embed-batch { texts: string[] } -> EmbeddingResult[] + * POST /rerank { query: string, documents: RerankDocument[] } -> RerankResult + * POST /expand { query: string, options?: ExpandOptions } -> Queryable[] + * POST /tokenize { text: string } -> { tokens: number } + * GET /health -> { ok: true, models: { embed, rerank, generate } } + * GET /status -> IndexStatus (collection counts, embedding status) + * GET /collections -> CollectionInfo[] (names, doc counts, last modified) + * GET /search?q=... 
-> { results: SearchResult[], total: number } + * GET /browse?limit=N -> { chunks: [...], total, limit, offset } + * POST /vsearch { query: string, limit?: number } -> { results: SearchResult[] } + */ + +import { createServer, type IncomingMessage, type ServerResponse } from "http"; +import { + LlamaCpp, + type LlamaCppConfig, + type EmbedOptions, + type RerankDocument, + type EmbeddingResult, + type RerankResult, + type RerankDocumentResult, + type Queryable, + DEFAULT_EMBED_MODEL_URI, + DEFAULT_RERANK_MODEL_URI, + DEFAULT_GENERATE_MODEL_URI, +} from "./llm.js"; + +import { + createStore, + enableProductionMode, + searchFTS, + searchVec, + listCollections, + type Store, +} from "./store.js"; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +const MAX_BODY_BYTES = 50 * 1024 * 1024; // 50MB + +function readBody(req: IncomingMessage): Promise { + return new Promise((resolve, reject) => { + const chunks: Buffer[] = []; + let totalBytes = 0; + req.on("data", (chunk: Buffer) => { + totalBytes += chunk.length; + if (totalBytes > MAX_BODY_BYTES) { + req.destroy(); + reject(new Error(`Request body exceeds ${MAX_BODY_BYTES / 1024 / 1024}MB limit`)); + return; + } + chunks.push(chunk); + }); + req.on("end", () => resolve(Buffer.concat(chunks).toString("utf-8"))); + req.on("error", reject); + }); +} + +function json(res: ServerResponse, status: number, body: unknown): void { + const payload = JSON.stringify(body); + res.writeHead(status, { + "Content-Type": "application/json", + "Content-Length": Buffer.byteLength(payload), + }); + res.end(payload); +} + +// --------------------------------------------------------------------------- +// Backend interface +// --------------------------------------------------------------------------- + +interface ModelBackend { + embed(text: string, options?: EmbedOptions): Promise; + embedBatch(texts: string[]): 
Promise<(EmbeddingResult | null)[]>; + rerank(query: string, documents: RerankDocument[]): Promise; + expandQuery(query: string, options?: { context?: string; includeLexical?: boolean; intent?: string }): Promise; + tokenize(text: string): Promise; + health(): Promise<{ models: Record }>; + dispose(): Promise; +} + +// --------------------------------------------------------------------------- +// Local backend (node-llama-cpp) +// --------------------------------------------------------------------------- + +class LocalBackend implements ModelBackend { + private llm: LlamaCpp; + private config: LlamaCppConfig; + + constructor(config: LlamaCppConfig = {}) { + this.config = config; + this.llm = new LlamaCpp(config); + } + + async embed(text: string, options?: EmbedOptions) { + return this.llm.embed(text, options); + } + + async embedBatch(texts: string[]) { + return this.llm.embedBatch(texts); + } + + async rerank(query: string, documents: RerankDocument[]) { + return this.llm.rerank(query, documents); + } + + async expandQuery(query: string, options?: { context?: string; includeLexical?: boolean; intent?: string }) { + return this.llm.expandQuery(query, options); + } + + async tokenize(text: string) { + const tokens = await this.llm.tokenize(text); + return tokens?.length ?? Math.ceil(text.length / 4); + } + + async health() { + return { + models: { + embed: this.config.embedModel ?? DEFAULT_EMBED_MODEL_URI, + rerank: this.config.rerankModel ?? DEFAULT_RERANK_MODEL_URI, + generate: this.config.generateModel ?? 
DEFAULT_GENERATE_MODEL_URI, + }, + }; + } + + async dispose() { + await this.llm.dispose(); + } +} + +// --------------------------------------------------------------------------- +// RKLLama NPU backend +// --------------------------------------------------------------------------- + +class OllamaCompatBackend implements ModelBackend { + private baseUrl: string; + private embedModel: string; + private rerankModel: string; + private expandModel: string; + + constructor(options: { + url?: string; + embedModel?: string; + rerankModel?: string; + expandModel?: string; + } = {}) { + this.baseUrl = (options.url ?? "http://localhost:8080").replace(/\/+$/, ""); + this.embedModel = options.embedModel ?? "qwen3-embedding-0.6b"; + this.rerankModel = options.rerankModel ?? "qwen3-reranker-0.6b"; + this.expandModel = options.expandModel ?? "qmd-query-expansion"; + } + + private async post(path: string, body: unknown): Promise { + const res = await fetch(`${this.baseUrl}${path}`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(body), + }); + if (!res.ok) { + const text = await res.text().catch(() => ""); + throw new Error(`ollama-compat ${path} returned ${res.status}: ${text}`); + } + return (await res.json()) as T; + } + + async embed(text: string, options?: EmbedOptions): Promise { + // Format text for Qwen3-Embedding (instruction-based format) + const formattedText = options?.isQuery + ? 
`Instruct: Retrieve relevant documents for the given query\nQuery: ${text}` + : text; + + const result = await this.post<{ embeddings: number[][] }>("/api/embed", { + model: this.embedModel, + input: formattedText, + }); + + if (!result.embeddings || result.embeddings.length === 0) return null; + + return { + embedding: result.embeddings[0]!, + model: this.embedModel, + }; + } + + async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> { + if (texts.length === 0) return []; + + // Send all texts in one /api/embed call — Ollama API accepts arrays + // and returns one embedding per input, saving HTTP round-trips. + try { + const result = await this.post<{ embeddings: number[][] }>("/api/embed", { + model: this.embedModel, + input: texts, + }); + + if (!result.embeddings) return texts.map(() => null); + + return result.embeddings.map((emb) => + emb ? { embedding: emb, model: this.embedModel } : null, + ); + } catch { + // Fallback to individual embeds if batch fails + const results: (EmbeddingResult | null)[] = []; + for (const text of texts) { + results.push(await this.embed(text)); + } + return results; + } + } + + async rerank(query: string, documents: RerankDocument[]): Promise { + // Use the /api/rerank endpoint which uses logit-based + // cross-encoder scoring (softmax over yes/no token probabilities). + // This produces accurate relevance scores directly from the NPU. + const docTexts = documents.map((d) => d.text); + + const result = await this.post<{ + model: string; + results: { index: number; relevance_score: number }[]; + }>("/api/rerank", { + model: this.rerankModel, + query, + documents: docTexts, + }); + + // Map the response format to QMD's RerankResult format + const results: RerankDocumentResult[] = result.results.map((r) => ({ + file: documents[r.index]?.file ?? 
`doc-${r.index}`, + score: r.relevance_score, + index: r.index, + })); + + // Sort by score descending + results.sort((a, b) => b.score - a.score); + + return { results, model: this.rerankModel }; + } + + async expandQuery( + query: string, + options?: { context?: string; includeLexical?: boolean; intent?: string }, + ): Promise { + // The QMD query expansion model is fine-tuned to output structured lines: + // lex: keyword terms for BM25 + // vec: semantic rephrasing for vector search + // hyde: hypothetical document for HyDE search + // Without grammar constraints (Ollama doesn't support GBNF), we use + // the model's training + a clear prompt to get structured output. + const intent = options?.intent; + const prompt = intent + ? `/no_think Expand this search query: ${query}\nQuery intent: ${intent}` + : `/no_think Expand this search query: ${query}`; + + const result = await this.post<{ response: string }>("/api/generate", { + model: this.expandModel, + prompt, + stream: false, + options: { temperature: 0.7, top_k: 20, top_p: 0.8, num_predict: 600 }, + }); + + // Parse the response - the fine-tuned model should output lex:/vec:/hyde: lines + const response = result.response.trim(); + const queryables: Queryable[] = []; + const queryLower = query.toLowerCase(); + const queryTerms = queryLower.replace(/[^a-z0-9\s]/g, " ").split(/\s+/).filter(Boolean); + + // Check if a generated line relates to the original query + const hasQueryTerm = (text: string): boolean => { + const lower = text.toLowerCase(); + if (queryTerms.length === 0) return true; + return queryTerms.some(term => lower.includes(term)); + }; + + for (const line of response.split("\n")) { + const trimmed = line.trim(); + const colonIdx = trimmed.indexOf(":"); + if (colonIdx === -1) continue; + const type = trimmed.slice(0, colonIdx).trim().toLowerCase(); + if (type !== "lex" && type !== "vec" && type !== "hyde") continue; + const text = trimmed.slice(colonIdx + 1).trim(); + if (!text || 
!hasQueryTerm(text)) continue; + queryables.push({ type: type as "lex" | "vec" | "hyde", text }); + } + + // If the model produced valid structured output, return it + const includeLexical = options?.includeLexical ?? true; + const filtered = includeLexical ? queryables : queryables.filter(q => q.type !== "lex"); + if (filtered.length > 0) return filtered; + + // Fallback: if model produced free text instead of structured lines, + // wrap it as a vec query and add the original as lex + const fallback: Queryable[] = [ + { type: "hyde", text: `Information about ${query}` }, + { type: "lex", text: query }, + { type: "vec", text: query }, + ]; + return includeLexical ? fallback : fallback.filter(q => q.type !== "lex"); + } + + async tokenize(text: string): Promise { + // No tokenizer endpoint in Ollama API - estimate + return Math.ceil(text.length / 4); + } + + async health() { + const res = await fetch(`${this.baseUrl}/api/tags`); + const data = (await res.json()) as { models: { name: string }[] }; + const modelNames = data.models.map((m) => m.name); + return { + models: { + embed: modelNames.find((n) => n.includes("embed")) ?? this.embedModel, + rerank: modelNames.find((n) => n.includes("rerank")) ?? this.rerankModel, + generate: modelNames.find((n) => n.includes("expansion") || n.includes("query")) ?? this.expandModel, + }, + }; + } + + async dispose() { + // Nothing to dispose - we don't own the external process + } +} + +// --------------------------------------------------------------------------- +// Server +// --------------------------------------------------------------------------- + +export interface ServeOptions { + port?: number; + bind?: string; + backend?: "local" | "ollama"; + backendUrl?: string; + /** @deprecated Use backendUrl instead */ + rkllamaUrl?: string; + config?: LlamaCppConfig; + dbPath?: string; +} + +export async function startServer(options: ServeOptions = {}): Promise { + const port = options.port ?? 7832; + const bind = options.bind ?? 
"127.0.0.1"; + const backendType = options.backend ?? "local"; + + let backend: ModelBackend; + + if (backendType === "ollama") { + const url = options.backendUrl ?? options.rkllamaUrl ?? "http://localhost:11434"; + backend = new OllamaCompatBackend({ url }); + console.log(`[qmd serve] Backend: ollama-compatible (${url})`); + } else { + backend = new LocalBackend(options.config ?? {}); + console.log(`[qmd serve] Backend: local (node-llama-cpp)`); + console.log(` embed: ${options.config?.embedModel ?? DEFAULT_EMBED_MODEL_URI}`); + console.log(` rerank: ${options.config?.rerankModel ?? DEFAULT_RERANK_MODEL_URI}`); + console.log(` generate: ${options.config?.generateModel ?? DEFAULT_GENERATE_MODEL_URI}`); + } + + const healthInfo = await backend.health().catch(() => ({ + models: { embed: "unknown", rerank: "unknown", generate: "unknown" }, + })); + console.log(` Models: ${Object.values(healthInfo.models).join(", ")}`); + + // Open the QMD index database for search/browse/collections endpoints. + // If no dbPath is specified, uses the default (~/.cache/qmd/index.sqlite). + enableProductionMode(); + let store: Store | null = null; + try { + store = createStore(options.dbPath); + console.log(`[qmd serve] Index: ${store.dbPath}`); + } catch (err) { + console.warn(`[qmd serve] No index database found — search/browse endpoints disabled`); + } + + const server = createServer(async (req, res) => { + // CORS for local network + res.setHeader("Access-Control-Allow-Origin", "*"); + res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS"); + res.setHeader("Access-Control-Allow-Headers", "Content-Type"); + + if (req.method === "OPTIONS") { + res.writeHead(204); + res.end(); + return; + } + + const url = new URL(req.url ?? 
"/", `http://${req.headers.host}`); + const path = url.pathname; + + try { + // ----- Health ----------------------------------------------------------- + if (path === "/health" && req.method === "GET") { + const info = await backend.health(); + json(res, 200, { ok: true, version: "2", backend: backendType, ...info }); + return; + } + + // ----- Index endpoints (require store) ----------------------------------- + + if (path === "/status" && req.method === "GET") { + if (!store) { json(res, 503, { error: "No index database loaded" }); return; } + const status = store.getStatus(); + json(res, 200, status); + return; + } + + if (path === "/collections" && req.method === "GET") { + if (!store) { json(res, 503, { error: "No index database loaded" }); return; } + const collections = listCollections(store.db); + json(res, 200, collections); + return; + } + + if (path === "/search" && req.method === "GET") { + if (!store) { json(res, 503, { error: "No index database loaded" }); return; } + const query = url.searchParams.get("q") || url.searchParams.get("query"); + if (!query) { json(res, 400, { error: "q or query parameter is required" }); return; } + const limit = parseInt(url.searchParams.get("limit") || "20", 10); + const collection = url.searchParams.get("collection") || undefined; + const results = searchFTS(store.db, query, limit, collection); + json(res, 200, { results, total: results.length }); + return; + } + + if (path === "/browse" && req.method === "GET") { + if (!store) { json(res, 503, { error: "No index database loaded" }); return; } + const limit = parseInt(url.searchParams.get("limit") || "20", 10); + const offset = parseInt(url.searchParams.get("offset") || "0", 10); + const collection = url.searchParams.get("collection") || undefined; + + let sql = ` + SELECT c.hash, substr(c.doc, 1, 500) as snippet, c.created_at, + d.collection, d.path, d.title + FROM content c + JOIN documents d ON d.hash = c.hash AND d.active = 1 + `; + const params: (string | 
number)[] = []; + if (collection) { + sql += ` WHERE d.collection = ?`; + params.push(collection); + } + sql += ` ORDER BY c.created_at DESC LIMIT ? OFFSET ?`; + params.push(limit, offset); + + const rows = store.db.prepare(sql).all(...params); + + // Get total count + let countSql = `SELECT COUNT(*) as total FROM content c JOIN documents d ON d.hash = c.hash AND d.active = 1`; + const countParams: string[] = []; + if (collection) { + countSql += ` WHERE d.collection = ?`; + countParams.push(collection); + } + const countRow = store.db.prepare(countSql).get(...countParams) as { total: number }; + + json(res, 200, { chunks: rows, total: countRow.total, limit, offset }); + return; + } + + // Only POST below + if (req.method !== "POST") { + json(res, 405, { error: "Method not allowed" }); + return; + } + + const body = JSON.parse(await readBody(req)); + + // ----- Vector Search (semantic) ------------------------------------------- + if (path === "/vsearch") { + if (!store) { json(res, 503, { error: "No index database loaded" }); return; } + const { query: vsQuery, limit: vsLimit, collection: vsCollection } = body as { + query: string; + limit?: number; + collection?: string; + }; + if (typeof vsQuery !== "string" || vsQuery.length === 0) { + json(res, 400, { error: "query must be a non-empty string" }); + return; + } + // Step 1: embed the query via our backend + const embedResult = await backend.embed(vsQuery, { isQuery: true }); + if (!embedResult) { + json(res, 500, { error: "Failed to embed query" }); + return; + } + // Step 2: vector search with precomputed embedding + const results = await searchVec( + store.db, vsQuery, embedResult.model, + vsLimit ?? 20, vsCollection ?? 
undefined, + undefined, embedResult.embedding, + ); + json(res, 200, { results, total: results.length }); + return; + } + + // ----- Embed ------------------------------------------------------------ + if (path === "/embed") { + const { text, options: embedOpts } = body as { + text: string; + options?: EmbedOptions; + }; + if (typeof text !== "string" || text.length === 0) { + json(res, 400, { error: "text must be a non-empty string" }); + return; + } + const result = await backend.embed(text, embedOpts); + json(res, 200, result); + return; + } + + // ----- Embed Batch ------------------------------------------------------ + if (path === "/embed-batch") { + const { texts } = body as { texts: string[] }; + if (!Array.isArray(texts) || texts.length === 0 || !texts.every(t => typeof t === "string")) { + json(res, 400, { error: "texts must be a non-empty array of strings" }); + return; + } + const results = await backend.embedBatch(texts); + json(res, 200, results); + return; + } + + // ----- Rerank ----------------------------------------------------------- + if (path === "/rerank") { + const { query, documents } = body as { + query: string; + documents: RerankDocument[]; + }; + if (typeof query !== "string" || query.length === 0 || !Array.isArray(documents) || documents.length === 0) { + json(res, 400, { error: "query must be a non-empty string and documents must be a non-empty array" }); + return; + } + const result = await backend.rerank(query, documents); + json(res, 200, result); + return; + } + + // ----- Expand Query ----------------------------------------------------- + if (path === "/expand") { + const { query, options: expandOpts } = body as { + query: string; + options?: { context?: string; includeLexical?: boolean; intent?: string }; + }; + if (typeof query !== "string" || query.length === 0) { + json(res, 400, { error: "query must be a non-empty string" }); + return; + } + const result = await backend.expandQuery(query, expandOpts); + json(res, 200, 
result); + return; + } + + // ----- Tokenize --------------------------------------------------------- + if (path === "/tokenize") { + const { text } = body as { text: string }; + if (typeof text !== "string" || text.length === 0) { + json(res, 400, { error: "text must be a non-empty string" }); + return; + } + const count = await backend.tokenize(text); + json(res, 200, { tokens: count }); + return; + } + + json(res, 404, { error: "Not found" }); + } catch (err: unknown) { + const message = err instanceof Error ? err.message : String(err); + console.error(`[qmd serve] Error on ${path}: ${message}`); + json(res, 500, { error: message }); + } + }); + + // Return a Promise that stays pending until the server shuts down. + return new Promise((resolve) => { + const shutdown = async () => { + console.log("\n[qmd serve] Shutting down..."); + server.close(); + if (store) store.close(); + await backend.dispose(); + resolve(); + }; + process.on("SIGINT", shutdown); + process.on("SIGTERM", shutdown); + + server.listen(port, bind, () => { + console.log(`[qmd serve] Listening on http://${bind}:${port}`); + console.log(`[qmd serve] Endpoints: /embed, /embed-batch, /rerank, /expand, /tokenize, /health`); + if (store) { + console.log(`[qmd serve] Index endpoints: /status, /collections, /search, /vsearch, /browse`); + } + }); + }); +} diff --git a/src/store.ts b/src/store.ts index d1b24eb3..612a0244 100644 --- a/src/store.ts +++ b/src/store.ts @@ -21,9 +21,11 @@ import fastGlob from "fast-glob"; import { LlamaCpp, getDefaultLlamaCpp, + getDefaultLLM, formatQueryForEmbedding, formatDocForEmbedding, withLLMSessionForLlm, + type LLM, type RerankDocument, type ILLMSession, } from "./llm.js"; @@ -62,8 +64,8 @@ export const CHUNK_WINDOW_CHARS = CHUNK_WINDOW_TOKENS * 4; // 800 chars * Get the LlamaCpp instance for a store — prefers the store's own instance, * falls back to the global singleton. */ -function getLlm(store: Store): LlamaCpp { - return store.llm ?? 
getDefaultLlamaCpp(); +function getLlm(store: Store): LLM { + return store.llm ?? getDefaultLLM(); } // ============================================================================= @@ -1066,8 +1068,8 @@ function ensureVecTableInternal(db: Database, dimensions: number): void { export type Store = { db: Database; dbPath: string; - /** Optional LlamaCpp instance for this store (overrides the global singleton) */ - llm?: LlamaCpp; + /** Optional LLM instance for this store (overrides the global singleton). Can be LlamaCpp or RemoteLLM. */ + llm?: LLM; close: () => void; ensureVecTable: (dimensions: number) => void; @@ -1487,9 +1489,11 @@ export async function generateEmbeddings( break; } - // Abort early if error rate is too high (>80% of processed chunks failed) + // Abort early if error rate is too high (>99% of processed chunks failed) + // Very generous for SBC/NPU: individual chunks already retry 3x with backoff. + // Only abort if nearly everything is failing (server truly down). const processed = chunksEmbedded + errors; - if (processed >= BATCH_SIZE && errors > processed * 0.8) { + if (processed >= BATCH_SIZE * 4 && errors > processed * 0.99) { const remaining = batchChunks.length - batchStart; errors += remaining; console.warn(`⚠ Error rate too high (${errors}/${processed}) — aborting embedding`); @@ -1521,18 +1525,23 @@ export async function generateEmbeddings( batchChunkBytesProcessed += chunkBatch.reduce((sum, c) => sum + c.bytes, 0); } else { for (const chunk of chunkBatch) { - try { - const text = formatDocForEmbedding(chunk.text, chunk.title); - const result = await session.embed(text); - if (result) { - insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now); - chunksEmbedded++; - } else { - errors++; + let embedded = false; + const text = formatDocForEmbedding(chunk.text, chunk.title); + // Retry up to 3 times with backoff for SBC/NPU intermittent failures + for (let attempt = 0; attempt < 3 && !embedded; 
attempt++) { + try { + if (attempt > 0) await new Promise(r => setTimeout(r, 1000 * attempt)); + const result = await session.embed(text); + if (result) { + insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now); + chunksEmbedded++; + embedded = true; + } + } catch { + // Retry on next iteration } - } catch { - errors++; } + if (!embedded) errors++; batchChunkBytesProcessed += chunk.bytes; } } @@ -2201,7 +2210,7 @@ export async function chunkDocumentByTokens( chunkStrategy: ChunkStrategy = "regex", signal?: AbortSignal ): Promise<{ text: string; pos: number; tokens: number }[]> { - const llm = getDefaultLlamaCpp(); + const llm = getDefaultLLM(); // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3) // If chunks exceed limit, they'll be re-split with actual ratio @@ -3077,12 +3086,12 @@ export async function searchVec(db: Database, query: string, model: string, limi // Embeddings // ============================================================================= -async function getEmbedding(text: string, model: string, isQuery: boolean, session?: ILLMSession, llmOverride?: LlamaCpp): Promise { +async function getEmbedding(text: string, model: string, isQuery: boolean, session?: ILLMSession, llmOverride?: LLM): Promise { // Format text using the appropriate prompt template const formattedText = isQuery ? formatQueryForEmbedding(text, model) : formatDocForEmbedding(text, undefined, model); const result = session ? await session.embed(formattedText, { model, isQuery }) - : await (llmOverride ?? getDefaultLlamaCpp()).embed(formattedText, { model, isQuery }); + : await (llmOverride ?? 
getDefaultLLM()).embed(formattedText, { model, isQuery }); return result?.embedding || null; } @@ -3146,7 +3155,7 @@ export function insertEmbedding( // Query expansion // ============================================================================= -export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise { +export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database, intent?: string, llmOverride?: LLM): Promise { // Check cache first — stored as JSON preserving types const cacheKey = getCacheKey("expandQuery", { query, model, ...(intent && { intent }) }); const cached = getCachedResult(db, cacheKey); @@ -3164,7 +3173,7 @@ export async function expandQuery(query: string, model: string = DEFAULT_QUERY_M } } - const llm = llmOverride ?? getDefaultLlamaCpp(); + const llm = llmOverride ?? getDefaultLLM(); // Note: LlamaCpp uses hardcoded model, model parameter is ignored const results = await llm.expandQuery(query, { intent }); @@ -3185,7 +3194,7 @@ export async function expandQuery(query: string, model: string = DEFAULT_QUERY_M // Reranking // ============================================================================= -export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise<{ file: string; score: number }[]> { +export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database, intent?: string, llmOverride?: LLM): Promise<{ file: string; score: number }[]> { // Prepend intent to rerank query so the reranker scores with domain context const rerankQuery = intent ? 
`${intent}\n\n${query}` : query; @@ -3210,7 +3219,7 @@ export async function rerank(query: string, documents: { file: string; text: str // Rerank uncached documents using LlamaCpp if (uncachedDocsByChunk.size > 0) { - const llm = llmOverride ?? getDefaultLlamaCpp(); + const llm = llmOverride ?? getDefaultLLM(); const uncachedDocs = [...uncachedDocsByChunk.values()]; const rerankResult = await llm.rerank(rerankQuery, uncachedDocs, { model });