From 9c8765f35349c1c779f97238e4442c5b13cbea19 Mon Sep 17 00:00:00 2001 From: Justin Michaels Date: Sun, 31 May 2026 13:50:37 -0400 Subject: [PATCH] feat(tuning): VRAM-tier-adaptive context + embed ubatch for AMD discrete Every AMD-discrete profile previously inherited the Vega-II bench constants (--ctx-size 8192, embed --ubatch-size 1024) regardless of the card, so smaller Intel-Mac AMD GPUs (8 GB RX 5700, 4 GB MacBook Pro dGPUs) could oversubscribe VRAM and forced operators to hand-set QUENCHFORGE_MAX_CONTEXT / QUENCHFORGE_EMBED_UBATCH_SIZE. amdSizing(vramGB) now derives both from the detected headline VRAM, threaded into KernelParams as a new arg: >= 12 GB -> no ctx cap, ubatch 1024 (Vega II/Duo, W6800X, W6900X, Vega 56/64) 7-11 GB -> ctx 4096, ubatch 512 (RX 5700/5700 XT, W5700X) <= 6 GB -> ctx 2048, ubatch 256 (4 GB MBP dGPUs, Polaris 560X) Guarantees: the >= 12 GB tier and any VRAM probe miss (0/unknown) keep the exact Vega-II-validated values (zero regression on the canonical path); buildSlotArgs applies the context cap as min(MaxContext, cap) so it only ever lowers; an explicit QUENCHFORGE_EMBED_UBATCH_SIZE still overrides the tier. Family-agnostic: unlisted/future AMD cards fall through classifyProfile to vega-pro and size by VRAM like any other. Adds SlotTuning.ContextSize, the amdSizing curve, six new tuning/cmd tests, and updates CHANGELOG (v0.8.0 final), README env table, and CLAUDE.md gotcha #2. --- CHANGELOG.md | 46 +++++++++++ CLAUDE.md | 18 +++-- README.md | 4 +- cmd/quenchforge/main.go | 16 +++- cmd/quenchforge/serve_test.go | 29 +++++++ internal/tuning/tuning.go | 81 +++++++++++++++---- internal/tuning/tuning_test.go | 139 +++++++++++++++++++++++++++++---- 7 files changed, 290 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 679709f..bdc8fab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,52 @@ patch bumps fix bugs or polish without behaviour change. 
--- +## v0.8.0 — AMD-discrete GPU mode + VRAM-tier-adaptive sizing (2026-05-31) + +Promotes the `v0.8.0-rc2` AMD-discrete GPU-mode revival (below) to a +final release and adds **VRAM-tier-adaptive slot sizing** so the full +range of Intel-Mac AMD GPUs — not just the 32 GB Vega II — runs +out-of-the-box without operator hand-tuning. + +### VRAM-tier-adaptive sizing + +Prior to this release every AMD-discrete profile inherited the Vega II +bench constants (`--ctx-size 8192`, embed `--ubatch-size 1024`) +regardless of card. On a smaller card (8 GB RX 5700, 4 GB MacBook Pro +dGPU) those defaults could oversubscribe VRAM, forcing the operator to +discover and set `QUENCHFORGE_MAX_CONTEXT` / `QUENCHFORGE_EMBED_UBATCH_SIZE` +by hand. `internal/tuning/tuning.go::amdSizing` now derives both from the +detected headline VRAM (`hardware.Info.GPUVRAMGB`, threaded into +`KernelParams`): + +| VRAM | `--ctx-size` cap | embed `--ubatch-size` | example cards | +|---|---|---|---| +| ≥ 12 GB | none (keeps `MaxContext`) | 1024 | Vega II/Duo, W6800X, W6900X, W5700X, Pro Vega 64 | +| 7–11 GB | 4096 | 512 | RX 5700 / 5700 XT, Pro Vega 56, 5600M | +| ≤ 6 GB | 2048 | 256 | 4 GB MacBook Pro dGPUs (5300M/5500M), Polaris 560X | + +Design guarantees: + +- **Zero regression on the validated path.** The ≥ 12 GB tier (and any + VRAM probe that returns 0/unknown) keeps the exact Vega-II-benched + values, so the canonical Mac Pro config is byte-for-byte unchanged. +- **Caps only ever lower.** `buildSlotArgs` applies the context ceiling + as `min(cfg.MaxContext, cap)`, so an operator who raised + `QUENCHFORGE_MAX_CONTEXT` on a big card is never clamped. +- **Operator overrides still win.** An explicit + `QUENCHFORGE_EMBED_UBATCH_SIZE` beats the tier ubatch; the context cap + is an independent safety knob. +- The fix is family-agnostic: unlisted/future AMD cards fall through + `classifyProfile` to `vega-pro` and are sized by VRAM like any other.
+ +New coverage: `TestAmdSizing_Tiers`, `TestKernelParams_EmbedLowVRAMScalesDown`, +`TestKernelParams_ContextCapAppliesToAllAMDSlots`, +`TestKernelParams_HighVRAMAndNonAMDHaveNoContextCap`, +`TestKernelParams_UbatchOverrideBeatsTierButCapStands`, and +`TestBuildSlotArgs_LowVRAMAMDCapsContextAndUbatch`. + +--- + ## v0.8.0-rc2 — AMD-discrete GPU mode revival (2026-05-25) The Mac Pro 7,1 + Radeon Pro Vega II 32 GB configuration now runs all diff --git a/CLAUDE.md b/CLAUDE.md index c99e1d1..0f270c0 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -145,18 +145,24 @@ that copy require a maintainer review. 2. **Embed/rerank slots have their own AMD safety surface — section 3.** The family-B graph-compute buffer-corruption crash hits embed/rerank under sustained batch load (eval suites, bulk KB ingest, sustained - MCP retrieval). Operators on AMD discrete running those workloads - should set: + MCP retrieval). As of **v0.8.0 the embed ubatch + context ceiling are + VRAM-tier-adaptive** (`internal/tuning/tuning.go::amdSizing`): the + detected `GPUVRAMGB` picks ubatch/ctx-cap 1024/none (≥ 12 GB), 512/4096 (7–11 GB), or + 256/2048 (≤ 6 GB) automatically, so small cards no longer need + hand-tuning. Operators only set the env vars below to *override* the + tier (force a value, or apply the same safety knobs on a profile the + detector classified as non-AMD): ``` - QUENCHFORGE_EMBED_UBATCH_SIZE=1024 # or smaller — caps Metal staging-buffer pressure + QUENCHFORGE_EMBED_UBATCH_SIZE=1024 # override tier ubatch — caps Metal staging-buffer pressure QUENCHFORGE_EMBED_METAL_N_CB=1 # serialise command-buffer submission QUENCHFORGE_AUTO_BACKOFF=true # auto-503 before SIGABRT ``` - Defaults preserve historical behaviour. `quenchforge-bench - sustained-embed` is the empirical tuning tool; the follow-up PR - will land bench-driven Vega-II defaults.
+ The ≥ 12 GB tier keeps the bench-validated Vega II values verbatim; + a VRAM probe miss (0/unknown) is treated as high tier so detection + failures never throttle the validated path. `quenchforge-bench + sustained-embed` remains the empirical tuning tool for new families. 3. **`internal/tuning/` is the sole owner of per-(profile, kind) slot tuning.** `cmd/quenchforge/main.go::buildSlotArgs` and `slotEnv` diff --git a/README.md b/README.md index 0954e9a..5d9397d 100644 --- a/README.md +++ b/README.md @@ -176,9 +176,9 @@ All settings have sensible defaults. Selected env vars: | `QUENCHFORGE_MODELS_DIR` | `~/.quenchforge/models` | Where Quenchforge looks for GGUFs | | `QUENCHFORGE_LOG_DIR` | `~/Library/Logs/quenchforge` | Per-slot log files land here | | `QUENCHFORGE_PID_DIR` | `~/.config/quenchforge/pids` | Orphan-reaper pidfile dir | -| `QUENCHFORGE_MAX_CONTEXT` | `8192` | `--ctx-size` passed to every slot | +| `QUENCHFORGE_MAX_CONTEXT` | `8192` | `--ctx-size` passed to every slot. On AMD-discrete cards ≤ 11 GB this is auto-capped by VRAM tier (4096 on 7–11 GB, 2048 on ≤ 6 GB) so the KV cache fits; the cap only lowers, never raises, your value. ≥ 12 GB cards (and cards whose VRAM could not be detected) use it verbatim. | | `QUENCHFORGE_METAL_N_CB` | `2` | Metal command-buffer count (`GGML_METAL_N_CB`); global default — per-slot overrides below | -| `QUENCHFORGE_EMBED_UBATCH_SIZE` | `0` (inherit MaxContext) | Per-call `--batch-size` / `--ubatch-size` for embed and code-embed slots. On AMD discrete, lowering this (e.g. `1024`) caps Metal staging-buffer pressure and prevents the family-B sustained-load SIGABRT documented in `patches/README.md` section 3. | +| `QUENCHFORGE_EMBED_UBATCH_SIZE` | `0` (auto) | Per-call `--batch-size` / `--ubatch-size` for embed and code-embed slots. Zero auto-sizes by VRAM tier on AMD discrete (1024 on ≥ 12 GB or undetected VRAM, 512 on 7–11 GB, 256 on ≤ 6 GB) to cap Metal staging-buffer pressure and prevent the family-B sustained-load SIGABRT (`patches/README.md` section 3); non-AMD inherits MaxContext.
An explicit value overrides the tier. | | `QUENCHFORGE_EMBED_METAL_N_CB` | `0` (inherit `METAL_N_CB`) | Per-slot `GGML_METAL_N_CB` for embed and code-embed. Set to `1` on AMD discrete to serialise Metal command-buffer submission. | | `QUENCHFORGE_RERANK_BATCH_SIZE` | `0` (llama.cpp's 512-token default) | Rerank slot `--batch-size` and `--ubatch-size`. Raise this when the reranker takes (query, doc) pairs longer than 510 tokens (e.g. `bge-reranker-v2-m3` with ≥ 1k-token chunks). | | `QUENCHFORGE_RERANK_METAL_N_CB` | `0` (inherit `METAL_N_CB`) | Per-slot `GGML_METAL_N_CB` for the rerank slot. | diff --git a/cmd/quenchforge/main.go b/cmd/quenchforge/main.go index 4e55c6a..16753b9 100644 --- a/cmd/quenchforge/main.go +++ b/cmd/quenchforge/main.go @@ -908,15 +908,23 @@ type slotSpec struct { // only for the base arg shape plus the layering of the tuning result. // Move the per-profile decisions there when they change, not here. func buildSlotArgs(cfg config.Config, hwInfo hardware.Info, spec slotSpec, modelPath string) []string { + tn := tuning.KernelParams(hwInfo.Profile, hwInfo.GPUVRAMGB, spec.Kind, cfg) + + // VRAM-tier-adaptive context ceiling: ContextSize only ever lowers + // cfg.MaxContext (small AMD cards), never raises it. + ctxSize := cfg.MaxContext + if tn.ContextSize > 0 && tn.ContextSize < ctxSize { + ctxSize = tn.ContextSize + } + args := []string{ "--model", modelPath, "--host", "127.0.0.1", "--port", fmt.Sprintf("%d", spec.Port), - "--ctx-size", fmt.Sprintf("%d", cfg.MaxContext), + "--ctx-size", fmt.Sprintf("%d", ctxSize), } args = append(args, spec.ExtraArgs...) - tn := tuning.KernelParams(hwInfo.Profile, spec.Kind, cfg) if tn.BatchSize > 0 { args = append(args, "--batch-size", fmt.Sprintf("%d", tn.BatchSize)) } @@ -936,7 +944,7 @@ func buildSlotArgs(cfg config.Config, hwInfo hardware.Info, spec slotSpec, model // on AMD discrete). 
func slotEnv(cfg config.Config, hwInfo hardware.Info, kind gateway.SlotKind) []string { ncb := cfg.MetalNCB - tn := tuning.KernelParams(hwInfo.Profile, kind, cfg) + tn := tuning.KernelParams(hwInfo.Profile, hwInfo.GPUVRAMGB, kind, cfg) if tn.MetalNCB > 0 { ncb = tn.MetalNCB } @@ -1022,7 +1030,7 @@ func startSlot(ctx context.Context, cfg config.Config, hwInfo hardware.Info, spe // graph-compute buffer-corruption crash is non-deterministic and the // slot stays dead after SIGABRT until manual restart. Tuning module // owns the decision; we just translate AutoRespawn → RestartPolicy. - tn := tuning.KernelParams(hwInfo.Profile, spec.Kind, cfg) + tn := tuning.KernelParams(hwInfo.Profile, hwInfo.GPUVRAMGB, spec.Kind, cfg) if tn.AutoRespawn { slot.RestartPolicy = supervisor.PolicyExpBackoff } diff --git a/cmd/quenchforge/serve_test.go b/cmd/quenchforge/serve_test.go index 32cefcd..487cc01 100644 --- a/cmd/quenchforge/serve_test.go +++ b/cmd/quenchforge/serve_test.go @@ -343,6 +343,35 @@ func TestBuildSlotArgs_EmbedKindsBatchOverride(t *testing.T) { } } +func TestBuildSlotArgs_LowVRAMAMDCapsContextAndUbatch(t *testing.T) { + // v0.8.0 adaptive sizing: an 8 GB AMD card (e.g. RX 5700) must get a + // capped --ctx-size 4096 (down from MaxContext 8192) and --ubatch-size + // 512 on embed/chat without any operator env var — the gap that used + // to force manual QUENCHFORGE_MAX_CONTEXT / _EMBED_UBATCH_SIZE tuning. 
+ cfg := config.Config{MaxContext: 8192} + info := hardware.Info{Profile: hardware.ProfileRDNA1, GPUVRAMGB: 8} + + embedArgs := buildSlotArgs(cfg, info, slotSpec{Kind: gateway.KindEmbed, Name: "embed", Port: 11501}, "/tmp/e.gguf") + if !containsArgPair(embedArgs, "--ctx-size", "4096") { + t.Errorf("8 GB AMD embed missing capped --ctx-size 4096: %v", embedArgs) + } + if !containsArgPair(embedArgs, "--ubatch-size", "512") { + t.Errorf("8 GB AMD embed missing scaled --ubatch-size 512: %v", embedArgs) + } + + chatArgs := buildSlotArgs(cfg, info, slotSpec{Kind: gateway.KindChat, Name: "chat", Port: 11500}, "/tmp/c.gguf") + if !containsArgPair(chatArgs, "--ctx-size", "4096") { + t.Errorf("8 GB AMD chat missing capped --ctx-size 4096: %v", chatArgs) + } + + // Regression: a 32 GB card keeps the full configured context. + hi := hardware.Info{Profile: hardware.ProfileVegaPro, GPUVRAMGB: 32} + hiArgs := buildSlotArgs(cfg, hi, slotSpec{Kind: gateway.KindChat, Name: "chat", Port: 11500}, "/tmp/c.gguf") + if !containsArgPair(hiArgs, "--ctx-size", "8192") { + t.Errorf("32 GB AMD chat should keep --ctx-size 8192 (no cap): %v", hiArgs) + } +} + func TestBuildSlotArgs_NonEmbedKindsSkipBatchOverride(t *testing.T) { // Chat / rerank / whisper slots don't need the embed batch override // (they decode autoregressively or operate per-pair). Adding it diff --git a/internal/tuning/tuning.go b/internal/tuning/tuning.go index f7aee56..3c453ea 100644 --- a/internal/tuning/tuning.go +++ b/internal/tuning/tuning.go @@ -16,9 +16,11 @@ // kind on this hardware profile want", with table-driven tests. // // The function is intentionally pure (no I/O, no globals): it consumes -// a profile, a slot kind, and a config snapshot, and returns a -// `SlotTuning` describing the additional llama-server flags and env -// vars the supervisor should layer on top of the base argv. 
+// a profile, the detected GPU VRAM (GB), a slot kind, and a config +// snapshot, and returns a `SlotTuning` describing the additional +// llama-server flags and env vars the supervisor should layer on top of +// the base argv. VRAM drives the adaptive context/ubatch sizing (see +// amdSizing) so smaller AMD cards fit without operator hand-tuning. // // Honors operator overrides: env-driven config fields (cfg.EmbedUbatchSize, // cfg.EmbedMetalNCB, cfg.RerankBatchSize, cfg.RerankMetalNCB) win over @@ -93,6 +95,14 @@ type SlotTuning struct { // models and chat-decode races. See llama.cpp issue #19563 and patch 0002. // Apple Silicon (UMA) does not need this; the concurrent path is correct there. MetalConcurrencyDisable bool + + // ContextSize, when non-zero, is a VRAM-tier-derived ceiling on the + // slot's --ctx-size. buildSlotArgs applies it as min(cfg.MaxContext, + // ContextSize), so it only ever LOWERS the configured context: small + // AMD cards (<= 11 GB) get a KV cache that fits without manual tuning, + // while >= 12 GB cards and non-AMD profiles leave this 0 (no cap) and + // keep cfg.MaxContext verbatim. See amdSizing. + ContextSize int } // KernelParams returns the tuning the supervisor should apply for the @@ -108,14 +118,14 @@ type SlotTuning struct { // `~/Develop/quenchforge/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m:1665-1717` // — the `buf->is_shared` fast path uses plain `memcpy`). Adding flags // on those profiles would regress throughput without any safety win. 
-func KernelParams(profile hardware.Profile, kind gateway.SlotKind, cfg config.Config) SlotTuning { +func KernelParams(profile hardware.Profile, vramGB int, kind gateway.SlotKind, cfg config.Config) SlotTuning { switch kind { case gateway.KindChat: - return chatParams(profile) + return chatParams(profile, vramGB) case gateway.KindEmbed, gateway.KindCodeEmbed: - return embedParams(profile, cfg) + return embedParams(profile, vramGB, cfg) case gateway.KindRerank: - return rerankParams(profile, cfg) + return rerankParams(profile, vramGB, cfg) } // Whisper / imagegen and any future kinds fall through unchanged. return SlotTuning{} @@ -129,10 +139,11 @@ func KernelParams(profile hardware.Profile, kind gateway.SlotKind, cfg config.Co // theory that chat is naturally bursty; cerid eval workloads broke // that assumption (chat.log entry at 2026-05-16T23:14 — task 143 // hit GGML_ASSERT at `set_tensor` after ~30 successful chat calls). -func chatParams(profile hardware.Profile) SlotTuning { +func chatParams(profile hardware.Profile, vramGB int) SlotTuning { if !profileIsAMDDiscrete(profile) { return SlotTuning{} } + ctxCap, _ := amdSizing(vramGB) // AMD-discrete chat slot runs on GPU as of v0.8.0. The MTLDispatchTypeConcurrent // race that produced cross-call non-determinism is disabled via // MetalConcurrencyDisable -> GGML_METAL_CONCURRENCY_DISABLE=1. The family-B @@ -150,6 +161,7 @@ func chatParams(profile hardware.Profile) SlotTuning { "--no-cache-prompt", "--gpu-layers", "999", }, + ContextSize: ctxCap, MetalConcurrencyDisable: true, AutoRespawn: true, } @@ -174,23 +186,28 @@ func chatParams(profile hardware.Profile) SlotTuning { // - AMD discrete profiles additionally enable AutoRespawn — the // supervisor brings the slot back on a Metal SIGABRT instead of // leaving it dead until manual restart. 
-func embedParams(profile hardware.Profile, cfg config.Config) SlotTuning { +func embedParams(profile hardware.Profile, vramGB int, cfg config.Config) SlotTuning { ubatch := cfg.MaxContext metalNCB := cfg.MetalNCB + ctxCap := 0 if profileIsAMDDiscrete(profile) { - // AMD-discrete on GPU (v0.8.0) needs the 1024 ubatch cap re-enabled — + // AMD-discrete on GPU (v0.8.0) needs the ubatch cap re-enabled — // it bounds per-call Metal staging-buffer pressure even with patch 0002's - // pool in place. CLAUDE.md operational gotcha #2 documents this knob. - ubatch = amdEmbedUbatchDefault + // pool in place. As of v0.8.0 the cap is VRAM-tier-adaptive (1024 on + // >= 12 GB cards, 512 on 8 GB, 256 on 4 GB) so smaller cards don't OOM + // without an operator setting QUENCHFORGE_EMBED_UBATCH_SIZE by hand. + // CLAUDE.md operational gotcha #2 documents this knob. + ctxCap, ubatch = amdSizing(vramGB) metalNCB = amdEmbedMetalNCBDefault } if cfg.EmbedUbatchSize > 0 { ubatch = cfg.EmbedUbatchSize } t := SlotTuning{ - UbatchSize: ubatch, - BatchSize: ubatch, - MetalNCB: metalNCB, + UbatchSize: ubatch, + BatchSize: ubatch, + MetalNCB: metalNCB, + ContextSize: ctxCap, } if cfg.EmbedMetalNCB > 0 { t.MetalNCB = cfg.EmbedMetalNCB @@ -224,13 +241,15 @@ func embedParams(profile hardware.Profile, cfg config.Config) SlotTuning { // // AutoRespawn fires on AMD discrete same as embed. AMD profiles also // get the conservative MetalNCB=1 default same as embed. 
-func rerankParams(profile hardware.Profile, cfg config.Config) SlotTuning { +func rerankParams(profile hardware.Profile, vramGB int, cfg config.Config) SlotTuning { t := SlotTuning{} if cfg.RerankBatchSize > 0 { t.BatchSize = cfg.RerankBatchSize t.UbatchSize = cfg.RerankBatchSize } if profileIsAMDDiscrete(profile) { + ctxCap, _ := amdSizing(vramGB) + t.ContextSize = ctxCap t.MetalNCB = amdEmbedMetalNCBDefault } if cfg.RerankMetalNCB > 0 { @@ -251,6 +270,36 @@ func rerankParams(profile hardware.Profile, cfg config.Config) SlotTuning { return t } +// amdSizing returns the VRAM-tier-adaptive context ceiling and embed +// ubatch for an AMD-discrete profile. A contextCap of 0 means "no cap — +// honour cfg.MaxContext as-is". +// +// The high tier (>= 12 GB: Vega II/Duo, W6800X, W6900X, Vega 56/64, +// 5600M) keeps the Vega-II-validated values: no context cap, ubatch +// 1024. Smaller cards scale both down so the KV cache + Metal staging +// buffers fit without an operator hand-tuning QUENCHFORGE_MAX_CONTEXT / +// QUENCHFORGE_EMBED_UBATCH_SIZE: +// +// VRAM context cap embed ubatch example cards +// >= 12 GB none (0) 1024 Vega II/Duo, W6800X, W6900X, Vega 56/64, 5600M +// 7-11 GB 4096 512 RX 5700 / 5700 XT, W5700X +// <= 6 GB 2048 256 4 GB MacBook Pro dGPUs (5300M/5500M), Polaris 560X +// +// vramGB <= 0 means detection could not read VRAM; treat it as the high +// tier so a probe miss never throttles the validated Vega II path. The +// caps only ever LOWER cfg.MaxContext (buildSlotArgs takes the min), so a +// high-VRAM operator who raised QUENCHFORGE_MAX_CONTEXT is unaffected. +func amdSizing(vramGB int) (contextCap, ubatch int) { + switch { + case vramGB <= 0 || vramGB >= 12: + return 0, amdEmbedUbatchDefault + case vramGB >= 7: + return 4096, 512 + default: + return 2048, 256 + } +} + // profileIsAMDDiscrete inlines hardware.Info.IsAMDDiscrete logic // against a raw Profile (we don't have a full Info here). 
Kept in sync // with internal/hardware/hardware.go::IsAMDDiscrete by the test in diff --git a/internal/tuning/tuning_test.go b/internal/tuning/tuning_test.go index 20a4521..cd12475 100644 --- a/internal/tuning/tuning_test.go +++ b/internal/tuning/tuning_test.go @@ -36,6 +36,12 @@ var amdProfiles = []hardware.Profile{ hardware.ProfileRDNA2, } +// vramHigh is a >= 12 GB headline VRAM (e.g. Vega II's 32 GB). At this +// tier amdSizing imposes no context cap and keeps ubatch 1024, so the +// pre-v0.8.0 assertions below remain valid verbatim. Low-VRAM behaviour +// has its own dedicated tests. +const vramHigh = 32 + func TestProfileIsAMDDiscrete_MatchesHardwarePackage(t *testing.T) { // Cross-check our inline AMD predicate against hardware.Info's // IsAMDDiscrete. If hardware adds or removes a profile from the @@ -55,7 +61,7 @@ func TestKernelParams_ChatAMDGetsGPUWithConcurrencyDisable(t *testing.T) { cfg := config.Config{MaxContext: 8192} for _, p := range amdProfiles { t.Run(string(p), func(t *testing.T) { - tn := KernelParams(p, gateway.KindChat, cfg) + tn := KernelParams(p, vramHigh, gateway.KindChat, cfg) wantExtra := []string{ "--flash-attn", "off", "--cache-ram", "0", @@ -94,7 +100,7 @@ func TestKernelParams_ChatNonAMDIsEmpty(t *testing.T) { continue } t.Run(string(p), func(t *testing.T) { - tn := KernelParams(p, gateway.KindChat, cfg) + tn := KernelParams(p, vramHigh, gateway.KindChat, cfg) if !slices.Equal(tn.ExtraArgs, nil) && len(tn.ExtraArgs) != 0 { t.Errorf("chat %s should emit no ExtraArgs, got %v", p, tn.ExtraArgs) @@ -115,7 +121,7 @@ func TestKernelParams_EmbedDefaultsByProfile(t *testing.T) { for _, p := range allProfiles { for _, k := range []gateway.SlotKind{gateway.KindEmbed, gateway.KindCodeEmbed} { t.Run(string(p)+"/"+string(k), func(t *testing.T) { - tn := KernelParams(p, k, cfg) + tn := KernelParams(p, vramHigh, k, cfg) wantUbatch := 8192 wantNCB := 0 if profileIsAMDDiscrete(p) { @@ -148,7 +154,7 @@ func 
TestKernelParams_EmbedAMDGetsGPUWithConcurrencyDisable(t *testing.T) { for _, p := range allProfiles { for _, k := range []gateway.SlotKind{gateway.KindEmbed, gateway.KindCodeEmbed} { t.Run(string(p)+"/"+string(k), func(t *testing.T) { - tn := KernelParams(p, k, cfg) + tn := KernelParams(p, vramHigh, k, cfg) hasGPUFlag := containsSubslice(tn.ExtraArgs, []string{"--gpu-layers", "999"}) if profileIsAMDDiscrete(p) { if !hasGPUFlag { @@ -181,7 +187,7 @@ func TestKernelParams_EmbedAMDMultithreading(t *testing.T) { for _, p := range allProfiles { for _, k := range []gateway.SlotKind{gateway.KindEmbed, gateway.KindCodeEmbed, gateway.KindRerank} { t.Run(string(p)+"/"+string(k), func(t *testing.T) { - tn := KernelParams(p, k, cfg) + tn := KernelParams(p, vramHigh, k, cfg) hasThreads := containsArgPair(tn.ExtraArgs, "--threads", "15") hasParallel := containsArgPair(tn.ExtraArgs, "--parallel", "4") if profileIsAMDDiscrete(p) { @@ -244,7 +250,7 @@ func TestKernelParams_RerankAMDGetsGPUWithConcurrencyDisable(t *testing.T) { cfg := config.Config{MaxContext: 8192} for _, p := range allProfiles { t.Run(string(p), func(t *testing.T) { - tn := KernelParams(p, gateway.KindRerank, cfg) + tn := KernelParams(p, vramHigh, gateway.KindRerank, cfg) hasGPUFlag := containsSubslice(tn.ExtraArgs, []string{"--gpu-layers", "999"}) if profileIsAMDDiscrete(p) { if !hasGPUFlag { @@ -273,7 +279,7 @@ func TestKernelParams_RerankAMDGetsMetalNCBDefault(t *testing.T) { cfg := config.Config{MaxContext: 8192} for _, p := range allProfiles { t.Run(string(p), func(t *testing.T) { - tn := KernelParams(p, gateway.KindRerank, cfg) + tn := KernelParams(p, vramHigh, gateway.KindRerank, cfg) wantNCB := 0 if profileIsAMDDiscrete(p) { wantNCB = amdEmbedMetalNCBDefault @@ -289,7 +295,7 @@ func TestKernelParams_RerankAMDGetsMetalNCBDefault(t *testing.T) { func TestKernelParams_EmbedHonoursUbatchOverride(t *testing.T) { // Operator-set QUENCHFORGE_EMBED_UBATCH_SIZE wins. 
cfg := config.Config{MaxContext: 8192, EmbedUbatchSize: 1024} - tn := KernelParams(hardware.ProfileVegaPro, gateway.KindEmbed, cfg) + tn := KernelParams(hardware.ProfileVegaPro, vramHigh, gateway.KindEmbed, cfg) if tn.UbatchSize != 1024 { t.Errorf("UbatchSize = %d, want 1024 (env override)", tn.UbatchSize) } @@ -300,7 +306,7 @@ func TestKernelParams_EmbedHonoursUbatchOverride(t *testing.T) { func TestKernelParams_EmbedHonoursMetalNCBOverride(t *testing.T) { cfg := config.Config{MaxContext: 8192, EmbedMetalNCB: 1} - tn := KernelParams(hardware.ProfileVegaPro, gateway.KindEmbed, cfg) + tn := KernelParams(hardware.ProfileVegaPro, vramHigh, gateway.KindEmbed, cfg) if tn.MetalNCB != 1 { t.Errorf("MetalNCB = %d, want 1 (env override)", tn.MetalNCB) } @@ -311,7 +317,7 @@ func TestKernelParams_EmbedAMDGetsAutoRespawn(t *testing.T) { for _, p := range amdProfiles { for _, k := range []gateway.SlotKind{gateway.KindEmbed, gateway.KindCodeEmbed} { t.Run(string(p)+"/"+string(k), func(t *testing.T) { - tn := KernelParams(p, k, cfg) + tn := KernelParams(p, vramHigh, k, cfg) if !tn.AutoRespawn { t.Errorf("%s %s should request AutoRespawn on AMD", p, k) } @@ -329,7 +335,7 @@ func TestKernelParams_EmbedNonAMDNoAutoRespawn(t *testing.T) { continue } t.Run(string(p), func(t *testing.T) { - tn := KernelParams(p, gateway.KindEmbed, cfg) + tn := KernelParams(p, vramHigh, gateway.KindEmbed, cfg) if tn.AutoRespawn { t.Errorf("%s should NOT request AutoRespawn", p) } @@ -344,7 +350,7 @@ func TestKernelParams_RerankNoBatchOverrideByDefault(t *testing.T) { cfg := config.Config{MaxContext: 8192} for _, p := range allProfiles { t.Run(string(p), func(t *testing.T) { - tn := KernelParams(p, gateway.KindRerank, cfg) + tn := KernelParams(p, vramHigh, gateway.KindRerank, cfg) if tn.BatchSize != 0 { t.Errorf("%s rerank BatchSize = %d, want 0 (no override)", p, tn.BatchSize) @@ -355,7 +361,7 @@ func TestKernelParams_RerankNoBatchOverrideByDefault(t *testing.T) { func 
TestKernelParams_RerankHonoursBatchOverride(t *testing.T) { cfg := config.Config{MaxContext: 8192, RerankBatchSize: 2048} - tn := KernelParams(hardware.ProfileVegaPro, gateway.KindRerank, cfg) + tn := KernelParams(hardware.ProfileVegaPro, vramHigh, gateway.KindRerank, cfg) if tn.BatchSize != 2048 { t.Errorf("BatchSize = %d, want 2048", tn.BatchSize) } @@ -368,7 +374,7 @@ func TestKernelParams_RerankAMDGetsAutoRespawn(t *testing.T) { cfg := config.Config{MaxContext: 8192} for _, p := range amdProfiles { t.Run(string(p), func(t *testing.T) { - tn := KernelParams(p, gateway.KindRerank, cfg) + tn := KernelParams(p, vramHigh, gateway.KindRerank, cfg) if !tn.AutoRespawn { t.Errorf("%s rerank should request AutoRespawn on AMD", p) } @@ -376,6 +382,109 @@ func TestKernelParams_RerankAMDGetsAutoRespawn(t *testing.T) { } } +// --------------------------------------------------------------------------- +// VRAM-tier-adaptive sizing (v0.8.0) +// --------------------------------------------------------------------------- + +func TestAmdSizing_Tiers(t *testing.T) { + // Whitebox: the (contextCap, ubatch) curve over VRAM. <=0 and >=12 + // are the high tier (no cap, validated 1024) so a probe miss or a + // big card never throttles. 8 GB scales to 4096/512; 4 GB to 2048/256. 
+ cases := []struct { + vram int + wantCtx int + wantUbatch int + }{ + {vram: 0, wantCtx: 0, wantUbatch: amdEmbedUbatchDefault}, // probe miss -> high + {vram: -1, wantCtx: 0, wantUbatch: amdEmbedUbatchDefault}, // negative -> high + {vram: 32, wantCtx: 0, wantUbatch: amdEmbedUbatchDefault}, // Vega II + {vram: 16, wantCtx: 0, wantUbatch: amdEmbedUbatchDefault}, // W6800X-class + {vram: 12, wantCtx: 0, wantUbatch: amdEmbedUbatchDefault}, // tier boundary (incl) + {vram: 11, wantCtx: 4096, wantUbatch: 512}, // just below high + {vram: 8, wantCtx: 4096, wantUbatch: 512}, // RX 5700 + {vram: 7, wantCtx: 4096, wantUbatch: 512}, // low boundary (incl) + {vram: 6, wantCtx: 2048, wantUbatch: 256}, // tiny boundary + {vram: 4, wantCtx: 2048, wantUbatch: 256}, // 4 GB MBP dGPU + } + for _, c := range cases { + ctx, ub := amdSizing(c.vram) + if ctx != c.wantCtx || ub != c.wantUbatch { + t.Errorf("amdSizing(%d) = (ctx %d, ubatch %d), want (ctx %d, ubatch %d)", + c.vram, ctx, ub, c.wantCtx, c.wantUbatch) + } + } +} + +func TestKernelParams_EmbedLowVRAMScalesDown(t *testing.T) { + // An 8 GB AMD card must get the reduced embed ubatch (512) and a + // context ceiling (4096) without any operator env var. + cfg := config.Config{MaxContext: 8192} + for _, p := range amdProfiles { + for _, k := range []gateway.SlotKind{gateway.KindEmbed, gateway.KindCodeEmbed} { + t.Run(string(p)+"/"+string(k), func(t *testing.T) { + tn := KernelParams(p, 8, k, cfg) + if tn.UbatchSize != 512 || tn.BatchSize != 512 { + t.Errorf("%s %s ubatch/batch = %d/%d, want 512/512", + p, k, tn.UbatchSize, tn.BatchSize) + } + if tn.ContextSize != 4096 { + t.Errorf("%s %s ContextSize = %d, want 4096", p, k, tn.ContextSize) + } + }) + } + } +} + +func TestKernelParams_ContextCapAppliesToAllAMDSlots(t *testing.T) { + // A 4 GB card caps context to 2048 on every AMD slot kind (chat, + // embed, code-embed, rerank) — the KV cache is the dominant VRAM + // consumer and must shrink uniformly. 
+ cfg := config.Config{MaxContext: 8192} + for _, k := range []gateway.SlotKind{ + gateway.KindChat, gateway.KindEmbed, gateway.KindCodeEmbed, gateway.KindRerank, + } { + t.Run(string(k), func(t *testing.T) { + tn := KernelParams(hardware.ProfileVegaPro, 4, k, cfg) + if tn.ContextSize != 2048 { + t.Errorf("%s ContextSize = %d, want 2048", k, tn.ContextSize) + } + }) + } +} + +func TestKernelParams_HighVRAMAndNonAMDHaveNoContextCap(t *testing.T) { + // >= 12 GB AMD and every non-AMD profile must leave ContextSize 0 so + // buildSlotArgs keeps cfg.MaxContext verbatim (zero regression). + cfg := config.Config{MaxContext: 8192} + for _, p := range allProfiles { + for _, k := range []gateway.SlotKind{ + gateway.KindChat, gateway.KindEmbed, gateway.KindRerank, + } { + t.Run(string(p)+"/"+string(k), func(t *testing.T) { + tn := KernelParams(p, vramHigh, k, cfg) + if tn.ContextSize != 0 { + t.Errorf("%s %s ContextSize = %d, want 0 (no cap)", + p, k, tn.ContextSize) + } + }) + } + } +} + +func TestKernelParams_UbatchOverrideBeatsTierButCapStands(t *testing.T) { + // An explicit QUENCHFORGE_EMBED_UBATCH_SIZE wins over the tier ubatch, + // but the VRAM context cap is independent and still applies — the two + // knobs protect different resources. + cfg := config.Config{MaxContext: 8192, EmbedUbatchSize: 2048} + tn := KernelParams(hardware.ProfileVegaPro, 4, gateway.KindEmbed, cfg) + if tn.UbatchSize != 2048 { + t.Errorf("UbatchSize = %d, want 2048 (operator override wins)", tn.UbatchSize) + } + if tn.ContextSize != 2048 { + t.Errorf("ContextSize = %d, want 2048 (cap independent of ubatch override)", tn.ContextSize) + } +} + func TestKernelParams_UnknownKindsAreEmpty(t *testing.T) { // Whisper / imagegen / future kinds: tuning module shouldn't emit // anything until explicitly added. 
Prevents accidental flag @@ -383,7 +492,7 @@ func TestKernelParams_UnknownKindsAreEmpty(t *testing.T) { cfg := config.Config{MaxContext: 8192} for _, k := range []gateway.SlotKind{gateway.KindWhisper, gateway.KindImageGen} { t.Run(string(k), func(t *testing.T) { - tn := KernelParams(hardware.ProfileVegaPro, k, cfg) + tn := KernelParams(hardware.ProfileVegaPro, vramHigh, k, cfg) if tn.UbatchSize != 0 || tn.BatchSize != 0 || tn.MetalNCB != 0 || len(tn.ExtraArgs) != 0 || tn.AutoRespawn { t.Errorf("%s should emit empty SlotTuning, got %+v", k, tn)