Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@

## [Unreleased]

### Changes

- Remote embedding, reranking, and query expansion via OpenAI-compatible API
(vLLM, Ollama, OpenAI, etc.). Set `QMD_EMBED_API_URL` / `QMD_EMBED_API_MODEL`
(and optionally `QMD_RERANK_API_*` / `QMD_EXPAND_API_*`) env vars or add
the equivalent keys to `models:` in `index.yml`. Local generation and
tokenization are preserved via a hybrid routing layer. Includes circuit
breakers, dimension validation, and batch splitting.

### Fixes

- GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529
Expand Down
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -939,6 +939,41 @@ Uses node-llama-cpp's `createRankingContext()` and `rankAndSort()` API for cross

Used for generating query variations via `LlamaChatSession`.

### Remote Embedding & Reranking

QMD can offload embedding and reranking to a remote OpenAI-compatible server (vLLM, Ollama, LM Studio, OpenAI, etc.) while keeping generation and tokenization local; query expansion stays local by default but can also be offloaded via the `QMD_EXPAND_API_*` variables.

**Environment variables** (presence of `QMD_EMBED_API_URL` activates remote mode):

| Variable | Required | Description |
|----------|----------|-------------|
| `QMD_EMBED_API_URL` | Yes | Base URL, e.g. `http://gpu-host:8000/v1` |
| `QMD_EMBED_API_MODEL` | Yes | Model name, e.g. `BAAI/bge-m3` |
| `QMD_EMBED_API_KEY` | No | Bearer token for auth |
| `QMD_RERANK_API_URL` | No | Rerank endpoint (defaults to embed URL) |
| `QMD_RERANK_API_MODEL` | No | Rerank model name |
| `QMD_RERANK_API_KEY` | No | Rerank auth (defaults to embed key) |

**YAML config** (`~/.config/qmd/index.yml`):
```yaml
models:
embed_api_url: "http://gpu-host:8000/v1"
embed_api_model: "BAAI/bge-m3"
rerank_api_model: "BAAI/bge-reranker-v2-m3"
```

**Example with vLLM:**
```sh
# Start vLLM with an embedding model
vllm serve BAAI/bge-m3 --task embed

# Point QMD at it
export QMD_EMBED_API_URL=http://localhost:8000/v1
export QMD_EMBED_API_MODEL=BAAI/bge-m3
qmd embed
qmd query "your search query"
```

## License

MIT
28 changes: 25 additions & 3 deletions src/cli/qmd.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,9 @@ import {
type ReindexResult,
type ChunkStrategy,
} from "../store.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import { disposeDefaultLlamaCpp, getDefaultLLM, setDefaultLLM, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import { RemoteLLM, remoteConfigFromEnv } from "../remote-llm.js";
import { HybridLLM } from "../hybrid-llm.js";
import {
formatSearchResults,
formatDocuments,
Expand Down Expand Up @@ -121,11 +123,28 @@ function getStore(): ReturnType<typeof createStore> {
const config = loadConfig();
syncConfigToDb(store.db, config);
if (config.models) {
setDefaultLlamaCpp(new LlamaCpp({
const localLlm = new LlamaCpp({
embedModel: config.models.embed,
generateModel: config.models.generate,
rerankModel: config.models.rerank,
}));
});

// Check if remote embedding is configured (env vars take precedence over YAML)
const remoteConfig = remoteConfigFromEnv(config.models);
if (remoteConfig) {
const remoteLlm = new RemoteLLM(remoteConfig);
setDefaultLLM(new HybridLLM(remoteLlm, localLlm));
} else {
setDefaultLLM(localLlm);
}
} else {
// No YAML models config — still check env vars for remote embedding
const remoteConfig = remoteConfigFromEnv();
if (remoteConfig) {
const remoteLlm = new RemoteLLM(remoteConfig);
const localLlm = new LlamaCpp();
setDefaultLLM(new HybridLLM(remoteLlm, localLlm));
}
}
} catch {
// Config may not exist yet — that's fine, DB works without it
Expand Down Expand Up @@ -1681,6 +1700,9 @@ async function vectorIndex(
const storeInstance = getStore();
const db = storeInstance.db;

// Use the actual model name from the configured LLM (may be remote, not the default GGUF URI)
model = getDefaultLLM().embedModelName;

if (force) {
console.log(`${c.yellow}Force re-indexing: clearing all vectors...${c.reset}`);
}
Expand Down
12 changes: 12 additions & 0 deletions src/collections.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,18 @@ export interface ModelsConfig {
embed?: string;
rerank?: string;
generate?: string;
/** Remote embedding API base URL (e.g. http://gpu-host:8000/v1) */
embed_api_url?: string;
/** Remote embedding model name (e.g. BAAI/bge-m3) */
embed_api_model?: string;
/** Bearer token for remote embedding API */
embed_api_key?: string;
/** Remote rerank API base URL (defaults to embed_api_url) */
rerank_api_url?: string;
/** Remote rerank model name */
rerank_api_model?: string;
/** Bearer token for remote rerank API */
rerank_api_key?: string;
}

/**
Expand Down
70 changes: 70 additions & 0 deletions src/hybrid-llm.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/**
* hybrid-llm.ts - Compositor that routes LLM operations between remote and local backends
*
* Embed/rerank → remote (GPU-heavy, benefits from offloading)
* Generate → local LlamaCpp
* ExpandQuery → remote when expandApiModel is configured, otherwise local LlamaCpp
* tokenize/countTokens → local LlamaCpp (CPU-cheap, needed for chunking)
*/

import type {
LLM,
EmbedOptions,
EmbeddingResult,
GenerateOptions,
GenerateResult,
ModelInfo,
Queryable,
RerankDocument,
RerankOptions,
RerankResult,
} from "./llm.js";
import { RemoteLLM } from "./remote-llm.js";

export class HybridLLM implements LLM {
constructor(
private readonly remote: LLM,
private readonly local: LLM,
) {}

get embedModelName(): string {
return this.remote.embedModelName;
}

// Route to remote
embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null> {
return this.remote.embed(text, options);
}

embedBatch(texts: string[], options?: EmbedOptions): Promise<(EmbeddingResult | null)[]> {
return this.remote.embedBatch(texts, options);
}

rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult> {
return this.remote.rerank(query, documents, options);
}

// Route to local
generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null> {
return this.local.generate(prompt, options);
}

/**
* Route expandQuery to remote when the remote backend supports it
* (i.e., RemoteLLM with expandApiModel configured), otherwise fall back to local.
*/
expandQuery(query: string, options?: { context?: string; includeLexical?: boolean; intent?: string }): Promise<Queryable[]> {
if (this.remote instanceof RemoteLLM && this.remote.supportsExpand) {
return this.remote.expandQuery(query, options);
}
return this.local.expandQuery(query, options);
}

modelExists(model: string): Promise<ModelInfo> {
return this.local.modelExists(model);
}

async dispose(): Promise<void> {
await Promise.all([this.remote.dispose(), this.local.dispose()]);
}
}
Loading