Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 23 additions & 5 deletions src/llm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import {
LlamaChatSession,
LlamaLogLevel,
type Llama,
type LlamaGpuType,
type LlamaModel,
type LlamaEmbeddingContext,
type Token as LlamaToken,
Expand Down Expand Up @@ -384,6 +385,8 @@ export type LlamaCppConfig = {
// Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
// Fallback context size (in tokens) when no explicit expand-context size is configured.
const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
// Lowercased QMD_LLAMA_GPU values that force CPU-only inference (see resolveLlamaGpuMode).
const CPU_ONLY_GPU_OVERRIDES = new Set(["false", "off", "none", "disable", "disabled", "0"]);
// QMD_LLAMA_GPU values that pin a specific GPU backend instead of auto-detection.
const EXPLICIT_GPU_OVERRIDES = new Set<LlamaGpuType>(["metal", "cuda", "vulkan"]);

function resolveExpandContextSize(configValue?: number): number {
if (configValue !== undefined) {
Expand All @@ -406,6 +409,14 @@ function resolveExpandContextSize(configValue?: number): number {
return parsed;
}

/**
 * Resolve the GPU mode requested via the QMD_LLAMA_GPU environment variable
 * (or an explicitly supplied override string).
 *
 * @param rawValue - Raw override value; defaults to `process.env.QMD_LLAMA_GPU`.
 * @returns `"auto"` for empty, `"auto"`, or unrecognized values; `false` for
 *          CPU-only overrides; otherwise the specific recognized GPU backend.
 */
export function resolveLlamaGpuMode(rawValue = process.env.QMD_LLAMA_GPU): "auto" | false | LlamaGpuType {
  const normalized = rawValue?.trim().toLowerCase();
  // Unset/empty or an explicit "auto" both mean: let the runtime pick.
  if (normalized === undefined || normalized === "" || normalized === "auto") {
    return "auto";
  }
  // Any CPU-only spelling ("false", "off", "none", …) disables GPU entirely.
  if (CPU_ONLY_GPU_OVERRIDES.has(normalized)) {
    return false;
  }
  // Only pass through backends we know about; anything else falls back to auto.
  const candidate = normalized as LlamaGpuType;
  return EXPLICIT_GPU_OVERRIDES.has(candidate) ? candidate : "auto";
}

export class LlamaCpp implements LLM {
// NOTE(review): set when any truthy CI env var is present — presumably used to alter behavior under CI; confirm usage below.
private readonly _ciMode = !!process.env.CI;
// Lazily-initialized llama.cpp runtime handle; null until first initialization.
private llama: Llama | null = null;
Expand Down Expand Up @@ -552,20 +563,27 @@ export class LlamaCpp implements LLM {
*/
private async ensureLlama(): Promise<Llama> {
if (!this.llama) {
// Allow override via QMD_LLAMA_GPU: "false" | "off" | "none" forces CPU
const gpuOverride = (process.env.QMD_LLAMA_GPU ?? "").toLowerCase();
const forceCpu = ["false", "off", "none", "disable", "disabled", "0"].includes(gpuOverride);
const gpuMode = resolveLlamaGpuMode();

const loadLlama = async (gpu: "auto" | false) =>
const loadLlama = async (gpu: "auto" | false | LlamaGpuType) =>
await getLlama({
build: "autoAttempt",
logLevel: LlamaLogLevel.error,
gpu,
});

let llama: Llama;
if (forceCpu) {
if (gpuMode === false) {
llama = await loadLlama(false);
} else if (gpuMode !== "auto") {
try {
llama = await loadLlama(gpuMode);
} catch (err) {
process.stderr.write(
`QMD Warning: GPU init failed for ${gpuMode} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`
);
llama = await loadLlama(false);
}
} else {
try {
llama = await loadLlama("auto");
Expand Down
20 changes: 20 additions & 0 deletions test/llm.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import {
disposeDefaultLlamaCpp,
withLLMSession,
canUnloadLLM,
resolveLlamaGpuMode,
SessionReleasedError,
type RerankDocument,
type ILLMSession,
Expand Down Expand Up @@ -161,6 +162,25 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
});
});

describe("resolveLlamaGpuMode", () => {
  test("preserves explicit GPU backend overrides", () => {
    // Each recognized backend string maps straight through unchanged.
    for (const backend of ["metal", "cuda", "vulkan"]) {
      expect(resolveLlamaGpuMode(backend)).toBe(backend);
    }
  });

  test("keeps CPU-only overrides as false", () => {
    for (const cpuSpelling of ["false", "off"]) {
      expect(resolveLlamaGpuMode(cpuSpelling)).toBe(false);
    }
  });

  test("falls back to auto for empty or unknown overrides", () => {
    for (const raw of [undefined, "auto", "mystery-backend"]) {
      expect(resolveLlamaGpuMode(raw)).toBe("auto");
    }
  });
});

describe("LlamaCpp rerank deduping", () => {
test("deduplicates identical document texts before scoring", async () => {
const llm = new LlamaCpp({}) as any;
Expand Down