Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 23 additions & 5 deletions src/llm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import {
LlamaChatSession,
LlamaLogLevel,
type Llama,
type LlamaGpuType,
type LlamaModel,
type LlamaEmbeddingContext,
type Token as LlamaToken,
Expand Down Expand Up @@ -384,6 +385,8 @@ export type LlamaCppConfig = {
// Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
// Fallback context size (in tokens) when no explicit expand-context size is configured.
const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
// Lowercased QMD_LLAMA_GPU values that force CPU-only inference (see resolveLlamaGpuMode).
const CPU_ONLY_GPU_OVERRIDES = new Set(["false", "off", "none", "disable", "disabled", "0"]);
// QMD_LLAMA_GPU values that pin a specific GPU backend instead of auto-detection.
const EXPLICIT_GPU_OVERRIDES = new Set<LlamaGpuType>(["metal", "cuda", "vulkan"]);

function resolveExpandContextSize(configValue?: number): number {
if (configValue !== undefined) {
Expand All @@ -406,6 +409,14 @@ function resolveExpandContextSize(configValue?: number): number {
return parsed;
}

/**
 * Resolve the GPU mode requested via the QMD_LLAMA_GPU environment variable
 * (or an explicitly supplied override string).
 *
 * @param rawValue - Raw override value; defaults to `process.env.QMD_LLAMA_GPU`.
 * @returns `"auto"` for empty, `"auto"`, or unrecognized values; `false` for
 *          CPU-only overrides; otherwise the specific recognized GPU backend.
 */
export function resolveLlamaGpuMode(rawValue = process.env.QMD_LLAMA_GPU): "auto" | false | LlamaGpuType {
  const normalized = rawValue?.trim().toLowerCase();
  // Unset/empty or an explicit "auto" both mean: let the runtime pick.
  if (normalized === undefined || normalized === "" || normalized === "auto") {
    return "auto";
  }
  // Any CPU-only spelling ("false", "off", "none", …) disables GPU entirely.
  if (CPU_ONLY_GPU_OVERRIDES.has(normalized)) {
    return false;
  }
  // Only pass through backends we know about; anything else falls back to auto.
  const candidate = normalized as LlamaGpuType;
  return EXPLICIT_GPU_OVERRIDES.has(candidate) ? candidate : "auto";
}

export class LlamaCpp implements LLM {
// NOTE(review): set when any truthy CI env var is present — presumably used to alter behavior under CI; confirm usage below.
private readonly _ciMode = !!process.env.CI;
// Lazily-initialized llama.cpp runtime handle; null until first initialization.
private llama: Llama | null = null;
Expand Down Expand Up @@ -552,20 +563,27 @@ export class LlamaCpp implements LLM {
*/
private async ensureLlama(): Promise<Llama> {
if (!this.llama) {
// Allow override via QMD_LLAMA_GPU: "false" | "off" | "none" forces CPU
const gpuOverride = (process.env.QMD_LLAMA_GPU ?? "").toLowerCase();
const forceCpu = ["false", "off", "none", "disable", "disabled", "0"].includes(gpuOverride);
const gpuMode = resolveLlamaGpuMode();

const loadLlama = async (gpu: "auto" | false) =>
const loadLlama = async (gpu: "auto" | false | LlamaGpuType) =>
await getLlama({
build: "autoAttempt",
logLevel: LlamaLogLevel.error,
gpu,
});

let llama: Llama;
if (forceCpu) {
if (gpuMode === false) {
llama = await loadLlama(false);
} else if (gpuMode !== "auto") {
try {
llama = await loadLlama(gpuMode);
} catch (err) {
process.stderr.write(
`QMD Warning: GPU init failed for ${gpuMode} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`
);
llama = await loadLlama(false);
}
} else {
try {
llama = await loadLlama("auto");
Expand Down
20 changes: 20 additions & 0 deletions test/llm.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import {
disposeDefaultLlamaCpp,
withLLMSession,
canUnloadLLM,
resolveLlamaGpuMode,
SessionReleasedError,
type RerankDocument,
type ILLMSession,
Expand Down Expand Up @@ -161,6 +162,25 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
});
});

describe("resolveLlamaGpuMode", () => {
  test("preserves explicit GPU backend overrides", () => {
    // Each recognized backend string maps straight through unchanged.
    for (const backend of ["metal", "cuda", "vulkan"]) {
      expect(resolveLlamaGpuMode(backend)).toBe(backend);
    }
  });

  test("keeps CPU-only overrides as false", () => {
    for (const cpuSpelling of ["false", "off"]) {
      expect(resolveLlamaGpuMode(cpuSpelling)).toBe(false);
    }
  });

  test("falls back to auto for empty or unknown overrides", () => {
    for (const raw of [undefined, "auto", "mystery-backend"]) {
      expect(resolveLlamaGpuMode(raw)).toBe("auto");
    }
  });
});

describe("LlamaCpp rerank deduping", () => {
test("deduplicates identical document texts before scoring", async () => {
const llm = new LlamaCpp({}) as any;
Expand Down