Skip to content
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,30 @@ Supported model families:
> since vectors are not cross-compatible between models. The prompt format is
> automatically adjusted for each model family.

### OpenAI Embeddings (Optional)

As an alternative to local embedding models, you can use OpenAI's API for faster, more reliable embeddings:

```yaml
# ~/.config/qmd/index.yml
embedding:
provider: openai
openai:
api_key: sk-... # Optional, falls back to QMD_OPENAI_API_KEY or OPENAI_API_KEY env var
model: text-embedding-3-small # Optional, this is the default
expansion_model: gpt-4o-mini # Optional, model for query expansion/reranking
base_url: https://api.openai.com/v1 # Optional, for OpenAI-compatible APIs (Ollama, vLLM, etc.)
```

Benefits:
- **~10x faster** than local CPU inference
- **No GPU required** - works on any machine
- **More reliable** - no local model loading issues
- **Cost:** ~$0.02 per 1M tokens with the default `text-embedding-3-small` model
- **OpenAI-compatible** - works with Ollama, vLLM, Azure, and other compatible APIs via `base_url`

When using OpenAI embeddings, query expansion and reranking use the OpenAI API instead of local models.

## Installation

```sh
Expand Down
228 changes: 129 additions & 99 deletions bun.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,10 @@
"better-sqlite3": "12.8.0",
"fast-glob": "3.3.3",
"node-llama-cpp": "3.18.1",
"openai": "^4.77.0",
"picomatch": "4.0.4",
"sqlite-vec": "0.1.9",
"tiktoken": "^1.0.22",
"web-tree-sitter": "0.26.7",
"yaml": "2.8.3",
"zod": "4.2.1"
Expand Down
180 changes: 114 additions & 66 deletions src/cli/qmd.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ import {
type ReindexResult,
type ChunkStrategy,
} from "../store.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, getDefaultEmbeddingLLM, getEmbeddingConfig, withLLMSession, pullModels, setEmbeddingConfig, isUsingOpenAI, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import {
formatSearchResults,
formatDocuments,
Expand All @@ -98,6 +98,7 @@ import {
listAllContexts,
setConfigIndexName,
loadConfig,
getEmbeddingConfig as getEmbeddingConfigFromYaml,
} from "../collections.js";
import { getEmbeddedQmdSkillContent, getEmbeddedQmdSkillFiles } from "../embedded-skills.js";

Expand Down Expand Up @@ -454,8 +455,14 @@ async function showStatus(): Promise<void> {
console.log(`\n${c.dim}No collections. Run 'qmd collection add .' to index markdown files.${c.reset}`);
}

// Models
{
// Models / Provider info
if (isUsingOpenAI()) {
const embCfg = getEmbeddingConfig();
console.log(`\n${c.bold}Provider${c.reset}`);
console.log(` Mode: ${c.green}OpenAI-compatible${c.reset}`);
console.log(` Base URL: ${embCfg.openai?.baseURL || process.env.QMD_OPENAI_BASE_URL || '(default)'}`);
console.log(` Embed model: ${embCfg.openai?.embedModel || 'text-embedding-3-small'}`);
} else {
// hf:org/repo/file.gguf → https://huggingface.co/org/repo
const hfLink = (uri: string) => {
const match = uri.match(/^hf:([^/]+\/[^/]+)\//);
Expand All @@ -465,38 +472,37 @@ async function showStatus(): Promise<void> {
console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`);
console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`);
console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`);
}

// Device / GPU info
console.log(`\n${c.bold}Device${c.reset}`);
try {
const llm = getDefaultLlamaCpp();
const device = await llm.getDeviceInfo({ allowBuild: false });
if (device.gpu) {
console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
if (device.gpuDevices.length > 0) {
// Deduplicate and count GPUs
const counts = new Map<string, number>();
for (const name of device.gpuDevices) {
counts.set(name, (counts.get(name) || 0) + 1);
// Device / GPU info (local mode only — skip in OpenAI mode to avoid triggering compilation)
console.log(`\n${c.bold}Device${c.reset}`);
try {
const llm = getDefaultLlamaCpp();
const device = await llm.getDeviceInfo({ allowBuild: false });
if (device.gpu) {
console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
if (device.gpuDevices.length > 0) {
const counts = new Map<string, number>();
for (const name of device.gpuDevices) {
counts.set(name, (counts.get(name) || 0) + 1);
}
const deviceStr = Array.from(counts.entries())
.map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
.join(', ');
console.log(` Devices: ${deviceStr}`);
}
const deviceStr = Array.from(counts.entries())
.map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
.join(', ');
console.log(` Devices: ${deviceStr}`);
if (device.vram) {
console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
}
} else {
console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
}
if (device.vram) {
console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
console.log(` CPU: ${device.cpuCores} math cores`);
} catch (error) {
console.log(` Status: ${c.dim}skipped${c.reset} (status probe does not build llama.cpp backends)`);
if (error instanceof Error && error.message) {
console.log(` ${c.dim}${error.message}${c.reset}`);
}
} else {
console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
}
console.log(` CPU: ${device.cpuCores} math cores`);
} catch (error) {
console.log(` Status: ${c.dim}skipped${c.reset} (status probe does not build llama.cpp backends)`);
if (error instanceof Error && error.message) {
console.log(` ${c.dim}${error.message}${c.reset}`);
}
}

Expand Down Expand Up @@ -1704,34 +1710,37 @@ async function vectorIndex(

const startTime = Date.now();

const result = await generateEmbeddings(storeInstance, {
force,
model,
maxDocsPerBatch: batchOptions?.maxDocsPerBatch,
maxBatchBytes: batchOptions?.maxBatchBytes,
chunkStrategy: batchOptions?.chunkStrategy,
onProgress: (info) => {
if (info.totalBytes === 0) return;
const percent = (info.bytesProcessed / info.totalBytes) * 100;
progress.set(percent);

const elapsed = (Date.now() - startTime) / 1000;
const bytesPerSec = info.bytesProcessed / elapsed;
const remainingBytes = info.totalBytes - info.bytesProcessed;
const etaSec = remainingBytes / bytesPerSec;

const bar = renderProgressBar(percent);
const percentStr = percent.toFixed(0).padStart(3);
const throughput = `${formatBytes(bytesPerSec)}/s`;
const eta = elapsed > 2 ? formatETA(etaSec) : "...";
const errStr = info.errors > 0 ? ` ${c.yellow}${info.errors} err${c.reset}` : "";

if (isTTY) process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${info.chunksEmbedded}/${info.totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset} `);
},
});
let result: Awaited<ReturnType<typeof generateEmbeddings>>;
try {
result = await generateEmbeddings(storeInstance, {
force,
model,
maxDocsPerBatch: batchOptions?.maxDocsPerBatch,
maxBatchBytes: batchOptions?.maxBatchBytes,
chunkStrategy: batchOptions?.chunkStrategy,
onProgress: (info) => {
if (info.totalBytes === 0) return;
const percent = (info.bytesProcessed / info.totalBytes) * 100;
progress.set(percent);

progress.clear();
cursor.show();
const elapsed = (Date.now() - startTime) / 1000;
const bytesPerSec = info.bytesProcessed / elapsed;
const remainingBytes = info.totalBytes - info.bytesProcessed;
const etaSec = remainingBytes / bytesPerSec;

const bar = renderProgressBar(percent);
const percentStr = percent.toFixed(0).padStart(3);
const throughput = `${formatBytes(bytesPerSec)}/s`;
const eta = elapsed > 2 ? formatETA(etaSec) : "...";
const errStr = info.errors > 0 ? ` ${c.yellow}${info.errors} err${c.reset}` : "";

if (isTTY) process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${info.chunksEmbedded}/${info.totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset} `);
},
});
} finally {
progress.clear();
cursor.show();
}

const totalTimeSec = result.durationMs / 1000;

Expand Down Expand Up @@ -2235,10 +2244,8 @@ function search(query: string, opts: OutputOptions): void {

// Use large limit for --all, otherwise fetch more than needed and let outputResults filter
const fetchLimit = opts.all ? 100000 : Math.max(50, opts.limit * 2);
const results = filterByCollections(
searchFTS(db, query, fetchLimit, singleCollection),
collectionNames
);
// Pass collections directly to searchFTS (it now supports arrays)
const results = searchFTS(db, query, fetchLimit, collectionNames.length > 0 ? collectionNames : undefined);

// Add context to results
const resultsWithContext = results.map(r => ({
Expand Down Expand Up @@ -2286,7 +2293,7 @@ async function vectorSearch(query: string, opts: OutputOptions, _model: string =

checkIndexHealth(store.db);

await withLLMSession(async () => {
const llmSession = async () => {
let results = await vectorSearchQuery(store, query, {
collection: singleCollection,
limit: opts.all ? 500 : (opts.limit || 10),
Expand Down Expand Up @@ -2324,7 +2331,15 @@ async function vectorSearch(query: string, opts: OutputOptions, _model: string =
context: r.context,
docid: r.docid,
})), query, { ...opts, limit: results.length });
}, { maxDuration: 10 * 60 * 1000, name: 'vectorSearch' });
};

if (isUsingOpenAI()) {
await llmSession();
} else {
await withLLMSession(async () => llmSession(),
{ maxDuration: 10 * 60 * 1000, name: 'vectorSearch' }
);
}
}

async function querySearch(query: string, opts: OutputOptions, _embedModel: string = DEFAULT_EMBED_MODEL, _rerankModel: string = DEFAULT_RERANK_MODEL): Promise<void> {
Expand All @@ -2342,7 +2357,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
// Intent can come from --intent flag or from intent: line in query document
const intent = opts.intent || parsed?.intent;

await withLLMSession(async () => {
const querySession = async () => {
let results;

if (parsed) {
Expand Down Expand Up @@ -2462,7 +2477,15 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
docid: r.docid,
explain: r.explain,
})), displayQuery, { ...opts, limit: results.length });
}, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
};

if (isUsingOpenAI()) {
await querySession();
} else {
await withLLMSession(async () => querySession(),
{ maxDuration: 10 * 60 * 1000, name: 'querySearch' }
);
}
}

// Parse CLI arguments using util.parseArgs
Expand Down Expand Up @@ -2847,6 +2870,31 @@ if (isMain) {
process.exit(cli.values.help ? 0 : 1);
}

// Load embedding configuration.
// Priority: YAML config > env vars > default (local).
// Setting QMD_OPENAI_BASE_URL alone is enough to activate OpenAI mode.
const embeddingYamlConfig = getEmbeddingConfigFromYaml();
const useOpenAI = embeddingYamlConfig.provider === 'openai'
|| !!process.env.QMD_OPENAI_BASE_URL
|| process.env.QMD_OPENAI === '1';

if (useOpenAI) {
setEmbeddingConfig({
provider: 'openai',
openai: {
apiKey: embeddingYamlConfig.openai?.api_key || process.env.QMD_OPENAI_API_KEY,
embedModel: embeddingYamlConfig.openai?.model || process.env.QMD_OPENAI_EMBED_MODEL,
expansionModel: embeddingYamlConfig.openai?.expansion_model,
rerankModel: embeddingYamlConfig.openai?.rerank_model,
baseURL: embeddingYamlConfig.openai?.base_url || process.env.QMD_OPENAI_BASE_URL,
chatBaseURL: embeddingYamlConfig.openai?.chat_base_url,
chatApiKey: embeddingYamlConfig.openai?.chat_api_key,
rerankBaseURL: embeddingYamlConfig.openai?.rerank_base_url,
rerankApiKey: embeddingYamlConfig.openai?.rerank_api_key,
},
});
}

switch (cli.command) {
case "context": {
const subcommand = cli.args[0];
Expand Down
28 changes: 28 additions & 0 deletions src/collections.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,24 @@ export interface ModelsConfig {
generate?: string;
}

/**
 * Embedding provider configuration (optional in config file).
 *
 * Selects whether embeddings are produced locally or via an
 * OpenAI-compatible HTTP API. All fields are optional; omitting the
 * whole section behaves the same as `{ provider: 'local' }`.
 * The separate `chat_*` / `rerank_*` URL and key fields allow routing
 * expansion and reranking traffic to different endpoints than the
 * embedding endpoint.
 */
export interface EmbeddingProviderConfig {
  provider?: 'local' | 'openai'; // Default: 'local'
  openai?: {
    api_key?: string; // Falls back to QMD_OPENAI_API_KEY / OPENAI_API_KEY env var
    model?: string; // Default: 'text-embedding-3-small'
    expansion_model?: string; // Default: 'gpt-4o-mini'
    rerank_model?: string; // Default: falls back to expansion_model
    base_url?: string; // Base URL for embeddings (OpenAI-compatible)
    chat_base_url?: string; // Separate base URL for expansion (falls back to base_url)
    chat_api_key?: string; // Separate API key for chat endpoint (falls back to api_key)
    rerank_base_url?: string; // Separate base URL for reranking (falls back to chat_base_url)
    rerank_api_key?: string; // Separate API key for rerank endpoint (falls back to chat_api_key)
  };
}

/**
* The complete configuration file structure
*/
Expand All @@ -51,6 +69,7 @@ export interface CollectionConfig {
editor_uri_template?: string; // Alias for editor_uri
collections: Record<string, Collection>; // Collection name -> config
models?: ModelsConfig;
embedding?: EmbeddingProviderConfig; // Optional embedding provider settings
}

/**
Expand Down Expand Up @@ -510,3 +529,12 @@ export function isValidCollectionName(name: string): boolean {
// Allow alphanumeric, hyphens, underscores
return /^[a-zA-Z0-9_-]+$/.test(name);
}

/**
 * Read the embedding provider settings from the loaded config file.
 *
 * @returns The `embedding` section of the configuration, or a
 *          local-provider default when that section is absent.
 */
export function getEmbeddingConfig(): EmbeddingProviderConfig {
  const { embedding } = loadConfig();
  return embedding ?? { provider: 'local' };
}
1 change: 1 addition & 0 deletions src/db.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ export interface Database {
exec(sql: string): void;
prepare(sql: string): Statement;
loadExtension(path: string): void;
transaction<T>(fn: () => T): () => T;
close(): void;
}

Expand Down
Loading