From 9c8765f35349c1c779f97238e4442c5b13cbea19 Mon Sep 17 00:00:00 2001
From: Justin Michaels <cashmoneymail@me.com>
Date: Sun, 31 May 2026 13:50:37 -0400
Subject: [PATCH] feat(tuning): VRAM-tier-adaptive context + embed ubatch for
 AMD discrete

Every AMD-discrete profile previously inherited the Vega-II bench
constants (--ctx-size 8192, embed --ubatch-size 1024) regardless of the
card, so smaller Intel-Mac AMD GPUs (8 GB RX 5700, 4 GB MacBook Pro
dGPUs) could oversubscribe VRAM and forced operators to hand-set
QUENCHFORGE_MAX_CONTEXT / QUENCHFORGE_EMBED_UBATCH_SIZE.

amdSizing(vramGB) now derives both from the detected headline VRAM,
threaded into KernelParams as a new arg:

  >= 12 GB  -> no ctx cap, ubatch 1024   (Vega II/Duo, W6800X, W6900X, W5700X, Pro Vega 64)
  7-11 GB   -> ctx 4096,   ubatch 512    (RX 5700/5700 XT, Pro Vega 56, 5600M)
  <= 6 GB   -> ctx 2048,   ubatch 256    (4 GB MBP dGPUs, Polaris 560X)

Guarantees: the >= 12 GB tier and any VRAM probe miss (0/unknown) keep
the exact Vega-II-validated values (zero regression on the canonical
path); buildSlotArgs applies the context cap as min(MaxContext, cap) so
it only ever lowers; an explicit QUENCHFORGE_EMBED_UBATCH_SIZE still
overrides the tier. Family-agnostic: unlisted/future AMD cards fall
through classifyProfile to vega-pro and size by VRAM like any other.

Adds SlotTuning.ContextSize, the amdSizing curve, six new tuning/cmd
tests, and updates CHANGELOG (v0.8.0 final), README env table, and
CLAUDE.md gotcha #2.
---
 CHANGELOG.md                   |  46 +++++++++++
 CLAUDE.md                      |  18 +++--
 README.md                      |   4 +-
 cmd/quenchforge/main.go        |  16 +++-
 cmd/quenchforge/serve_test.go  |  29 +++++++
 internal/tuning/tuning.go      |  81 +++++++++++++++----
 internal/tuning/tuning_test.go | 139 +++++++++++++++++++++++++++++----
 7 files changed, 290 insertions(+), 43 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 679709f..bdc8fab 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,52 @@ patch bumps fix bugs or polish without behaviour change.
 
 ---
 
+## v0.8.0 — AMD-discrete GPU mode + VRAM-tier-adaptive sizing (2026-05-31)
+
+Promotes the `v0.8.0-rc2` AMD-discrete GPU-mode revival (below) to a
+final release and adds **VRAM-tier-adaptive slot sizing** so the full
+range of Intel-Mac AMD GPUs — not just the 32 GB Vega II — runs
+out-of-the-box without operator hand-tuning.
+
+### VRAM-tier-adaptive sizing
+
+Prior to this release every AMD-discrete profile inherited the Vega II
+bench constants (`--ctx-size 8192`, embed `--ubatch-size 1024`)
+regardless of card. On a smaller card (8 GB RX 5700, 4 GB MacBook Pro
+dGPU) those defaults could oversubscribe VRAM, forcing the operator to
+discover and set `QUENCHFORGE_MAX_CONTEXT` / `QUENCHFORGE_EMBED_UBATCH_SIZE`
+by hand. `internal/tuning/tuning.go::amdSizing` now derives both from the
+detected headline VRAM (`hardware.Info.GPUVRAMGB`, threaded into
+`KernelParams`):
+
+| VRAM | `--ctx-size` cap | embed `--ubatch-size` | example cards |
+|---|---|---|---|
+| ≥ 12 GB | none (keeps `MaxContext`) | 1024 | Vega II/Duo, W6800X, W6900X, W5700X, Pro Vega 64 |
+| 7–11 GB | 4096 | 512 | RX 5700 / 5700 XT, Pro Vega 56, 5600M |
+| ≤ 6 GB | 2048 | 256 | 4 GB MacBook Pro dGPUs (5300M/5500M), Polaris 560X |
+
+Design guarantees:
+
+- **Zero regression on the validated path.** The ≥ 12 GB tier (and any
+  VRAM probe that returns 0/unknown) keeps the exact Vega-II-benched
+  values, so the canonical Mac Pro config is byte-for-byte unchanged.
+- **Caps only ever lower.** `buildSlotArgs` applies the context ceiling
+  as `min(cfg.MaxContext, cap)`, so a tier never raises the context, and
+  the uncapped ≥ 12 GB tier honours a raised `QUENCHFORGE_MAX_CONTEXT`.
+- **Operator overrides still win.** An explicit
+  `QUENCHFORGE_EMBED_UBATCH_SIZE` beats the tier ubatch. The context cap
+  is separate: `QUENCHFORGE_MAX_CONTEXT` can go below it but cannot lift it.
+- The fix is family-agnostic: unlisted/future AMD cards fall through
+  `classifyProfile` to `vega-pro` and are sized by VRAM like any other.
+
+New coverage: `TestAmdSizing_Tiers`, `TestKernelParams_EmbedLowVRAMScalesDown`,
+`TestKernelParams_ContextCapAppliesToAllAMDSlots`,
+`TestKernelParams_HighVRAMAndNonAMDHaveNoContextCap`,
+`TestKernelParams_UbatchOverrideBeatsTierButCapStands`, and
+`TestBuildSlotArgs_LowVRAMAMDCapsContextAndUbatch`.
+
+---
+
 ## v0.8.0-rc2 — AMD-discrete GPU mode revival (2026-05-25)
 
 The Mac Pro 7,1 + Radeon Pro Vega II 32 GB configuration now runs all
diff --git a/CLAUDE.md b/CLAUDE.md
index c99e1d1..0f270c0 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -145,18 +145,24 @@ that copy require a maintainer review.
 2. **Embed/rerank slots have their own AMD safety surface — section 3.**
    The family-B graph-compute buffer-corruption crash hits embed/rerank
    under sustained batch load (eval suites, bulk KB ingest, sustained
-   MCP retrieval). Operators on AMD discrete running those workloads
-   should set:
+   MCP retrieval). As of **v0.8.0 the embed ubatch + context ceiling are
+   VRAM-tier-adaptive** (`internal/tuning/tuning.go::amdSizing`): the
+   detected `GPUVRAMGB` picks 1024/none (≥ 12 GB), 512/4096 (7–11 GB), or
+   256/2048 (≤ 6 GB) automatically, so small cards no longer need
+   hand-tuning. Operators only set the env vars below to *override* the
+   tier (force a value, or apply the same safety knobs on a profile the
+   detector classified as non-AMD):
 
    ```
-   QUENCHFORGE_EMBED_UBATCH_SIZE=1024   # or smaller — caps Metal staging-buffer pressure
+   QUENCHFORGE_EMBED_UBATCH_SIZE=1024   # override tier ubatch — caps Metal staging-buffer pressure
    QUENCHFORGE_EMBED_METAL_N_CB=1       # serialise command-buffer submission
    QUENCHFORGE_AUTO_BACKOFF=true        # auto-503 before SIGABRT
    ```
 
-   Defaults preserve historical behaviour. `quenchforge-bench
-   sustained-embed` is the empirical tuning tool; the follow-up PR
-   will land bench-driven Vega-II defaults.
+   The ≥ 12 GB tier keeps the bench-validated Vega II values verbatim;
+   a VRAM probe miss (0/unknown) is treated as high tier so detection
+   failures never throttle the validated path. `quenchforge-bench
+   sustained-embed` remains the empirical tuning tool for new families.
 
 3. **`internal/tuning/` is the sole owner of per-(profile, kind) slot
    tuning.** `cmd/quenchforge/main.go::buildSlotArgs` and `slotEnv`
diff --git a/README.md b/README.md
index 0954e9a..5d9397d 100644
--- a/README.md
+++ b/README.md
@@ -176,9 +176,9 @@ All settings have sensible defaults. Selected env vars:
 | `QUENCHFORGE_MODELS_DIR` | `~/.quenchforge/models` | Where Quenchforge looks for GGUFs |
 | `QUENCHFORGE_LOG_DIR` | `~/Library/Logs/quenchforge` | Per-slot log files land here |
 | `QUENCHFORGE_PID_DIR` | `~/.config/quenchforge/pids` | Orphan-reaper pidfile dir |
-| `QUENCHFORGE_MAX_CONTEXT` | `8192` | `--ctx-size` passed to every slot |
+| `QUENCHFORGE_MAX_CONTEXT` | `8192` | `--ctx-size` passed to every slot. On AMD-discrete cards with ≤ 11 GB VRAM this is auto-capped by VRAM tier (4096 on 7–11 GB, 2048 on ≤ 6 GB) so the KV cache fits; the cap only lowers, never raises, your value. Cards with ≥ 12 GB (or undetected VRAM) use it verbatim. |
 | `QUENCHFORGE_METAL_N_CB` | `2` | Metal command-buffer count (`GGML_METAL_N_CB`); global default — per-slot overrides below |
-| `QUENCHFORGE_EMBED_UBATCH_SIZE` | `0` (inherit MaxContext) | Per-call `--batch-size` / `--ubatch-size` for embed and code-embed slots. On AMD discrete, lowering this (e.g. `1024`) caps Metal staging-buffer pressure and prevents the family-B sustained-load SIGABRT documented in `patches/README.md` section 3. |
+| `QUENCHFORGE_EMBED_UBATCH_SIZE` | `0` (auto) | Per-call `--batch-size` / `--ubatch-size` for embed and code-embed slots. Zero auto-sizes by VRAM tier on AMD discrete (1024 on ≥ 12 GB or undetected VRAM, 512 on 7–11 GB, 256 on ≤ 6 GB) to cap Metal staging-buffer pressure and prevent the family-B sustained-load SIGABRT (`patches/README.md` section 3); non-AMD inherits MaxContext. An explicit value overrides the tier. |
 | `QUENCHFORGE_EMBED_METAL_N_CB` | `0` (inherit `METAL_N_CB`) | Per-slot `GGML_METAL_N_CB` for embed and code-embed. Set to `1` on AMD discrete to serialise Metal command-buffer submission. |
 | `QUENCHFORGE_RERANK_BATCH_SIZE` | `0` (llama.cpp's 512-token default) | Rerank slot `--batch-size` and `--ubatch-size`. Raise this when the reranker takes (query, doc) pairs longer than 510 tokens (e.g. `bge-reranker-v2-m3` with ≥ 1k-token chunks). |
 | `QUENCHFORGE_RERANK_METAL_N_CB` | `0` (inherit `METAL_N_CB`) | Per-slot `GGML_METAL_N_CB` for the rerank slot. |
diff --git a/cmd/quenchforge/main.go b/cmd/quenchforge/main.go
index 4e55c6a..16753b9 100644
--- a/cmd/quenchforge/main.go
+++ b/cmd/quenchforge/main.go
@@ -908,15 +908,23 @@ type slotSpec struct {
 // only for the base arg shape plus the layering of the tuning result.
 // Move the per-profile decisions there when they change, not here.
 func buildSlotArgs(cfg config.Config, hwInfo hardware.Info, spec slotSpec, modelPath string) []string {
+	tn := tuning.KernelParams(hwInfo.Profile, hwInfo.GPUVRAMGB, spec.Kind, cfg)
+
+	// VRAM-tier-adaptive context ceiling: ContextSize only ever lowers
+	// cfg.MaxContext (small AMD cards), never raises it.
+	ctxSize := cfg.MaxContext
+	if tn.ContextSize > 0 && tn.ContextSize < ctxSize {
+		ctxSize = tn.ContextSize
+	}
+
 	args := []string{
 		"--model", modelPath,
 		"--host", "127.0.0.1",
 		"--port", fmt.Sprintf("%d", spec.Port),
-		"--ctx-size", fmt.Sprintf("%d", cfg.MaxContext),
+		"--ctx-size", fmt.Sprintf("%d", ctxSize),
 	}
 	args = append(args, spec.ExtraArgs...)
 
-	tn := tuning.KernelParams(hwInfo.Profile, spec.Kind, cfg)
 	if tn.BatchSize > 0 {
 		args = append(args, "--batch-size", fmt.Sprintf("%d", tn.BatchSize))
 	}
@@ -936,7 +944,7 @@ func buildSlotArgs(cfg config.Config, hwInfo hardware.Info, spec slotSpec, model
 // on AMD discrete).
 func slotEnv(cfg config.Config, hwInfo hardware.Info, kind gateway.SlotKind) []string {
 	ncb := cfg.MetalNCB
-	tn := tuning.KernelParams(hwInfo.Profile, kind, cfg)
+	tn := tuning.KernelParams(hwInfo.Profile, hwInfo.GPUVRAMGB, kind, cfg)
 	if tn.MetalNCB > 0 {
 		ncb = tn.MetalNCB
 	}
@@ -1022,7 +1030,7 @@ func startSlot(ctx context.Context, cfg config.Config, hwInfo hardware.Info, spe
 	// graph-compute buffer-corruption crash is non-deterministic and the
 	// slot stays dead after SIGABRT until manual restart. Tuning module
 	// owns the decision; we just translate AutoRespawn → RestartPolicy.
-	tn := tuning.KernelParams(hwInfo.Profile, spec.Kind, cfg)
+	tn := tuning.KernelParams(hwInfo.Profile, hwInfo.GPUVRAMGB, spec.Kind, cfg)
 	if tn.AutoRespawn {
 		slot.RestartPolicy = supervisor.PolicyExpBackoff
 	}
diff --git a/cmd/quenchforge/serve_test.go b/cmd/quenchforge/serve_test.go
index 32cefcd..487cc01 100644
--- a/cmd/quenchforge/serve_test.go
+++ b/cmd/quenchforge/serve_test.go
@@ -343,6 +343,35 @@ func TestBuildSlotArgs_EmbedKindsBatchOverride(t *testing.T) {
 	}
 }
 
+func TestBuildSlotArgs_LowVRAMAMDCapsContextAndUbatch(t *testing.T) {
+	// v0.8.0 adaptive sizing: an 8 GB AMD card (e.g. RX 5700) must get a
+	// capped --ctx-size 4096 (down from MaxContext 8192) on embed and chat,
+	// plus --ubatch-size 512 on embed, without any operator env var — the gap
+	// that used to force manual QUENCHFORGE_MAX_CONTEXT / _EMBED_UBATCH_SIZE tuning.
+	cfg := config.Config{MaxContext: 8192}
+	info := hardware.Info{Profile: hardware.ProfileRDNA1, GPUVRAMGB: 8}
+
+	embedArgs := buildSlotArgs(cfg, info, slotSpec{Kind: gateway.KindEmbed, Name: "embed", Port: 11501}, "/tmp/e.gguf")
+	if !containsArgPair(embedArgs, "--ctx-size", "4096") {
+		t.Errorf("8 GB AMD embed missing capped --ctx-size 4096: %v", embedArgs)
+	}
+	if !containsArgPair(embedArgs, "--ubatch-size", "512") {
+		t.Errorf("8 GB AMD embed missing scaled --ubatch-size 512: %v", embedArgs)
+	}
+
+	chatArgs := buildSlotArgs(cfg, info, slotSpec{Kind: gateway.KindChat, Name: "chat", Port: 11500}, "/tmp/c.gguf")
+	if !containsArgPair(chatArgs, "--ctx-size", "4096") {
+		t.Errorf("8 GB AMD chat missing capped --ctx-size 4096: %v", chatArgs)
+	}
+
+	// Regression: a 32 GB card keeps the full configured context.
+	hi := hardware.Info{Profile: hardware.ProfileVegaPro, GPUVRAMGB: 32}
+	hiArgs := buildSlotArgs(cfg, hi, slotSpec{Kind: gateway.KindChat, Name: "chat", Port: 11500}, "/tmp/c.gguf")
+	if !containsArgPair(hiArgs, "--ctx-size", "8192") {
+		t.Errorf("32 GB AMD chat should keep --ctx-size 8192 (no cap): %v", hiArgs)
+	}
+}
+
 func TestBuildSlotArgs_NonEmbedKindsSkipBatchOverride(t *testing.T) {
 	// Chat / rerank / whisper slots don't need the embed batch override
 	// (they decode autoregressively or operate per-pair). Adding it
diff --git a/internal/tuning/tuning.go b/internal/tuning/tuning.go
index f7aee56..3c453ea 100644
--- a/internal/tuning/tuning.go
+++ b/internal/tuning/tuning.go
@@ -16,9 +16,11 @@
 // kind on this hardware profile want", with table-driven tests.
 //
 // The function is intentionally pure (no I/O, no globals): it consumes
-// a profile, a slot kind, and a config snapshot, and returns a
-// `SlotTuning` describing the additional llama-server flags and env
-// vars the supervisor should layer on top of the base argv.
+// a profile, the detected GPU VRAM (GB), a slot kind, and a config
+// snapshot, and returns a `SlotTuning` describing the additional
+// llama-server flags and env vars the supervisor should layer on top of
+// the base argv. VRAM drives the adaptive context/ubatch sizing (see
+// amdSizing) so smaller AMD cards fit without operator hand-tuning.
 //
 // Honors operator overrides: env-driven config fields (cfg.EmbedUbatchSize,
 // cfg.EmbedMetalNCB, cfg.RerankBatchSize, cfg.RerankMetalNCB) win over
@@ -93,6 +95,14 @@ type SlotTuning struct {
 	// models and chat-decode races. See llama.cpp issue #19563 and patch 0002.
 	// Apple Silicon (UMA) does not need this; the concurrent path is correct there.
 	MetalConcurrencyDisable bool
+
+	// ContextSize, when non-zero, is a VRAM-tier-derived ceiling on the
+	// slot's --ctx-size. buildSlotArgs applies it as min(cfg.MaxContext,
+	// ContextSize), so it only ever LOWERS the configured context: small
+	// AMD cards (<= 11 GB) get a KV cache that fits without manual tuning,
+	// while >= 12 GB cards and non-AMD profiles leave this 0 (no cap) and
+	// keep cfg.MaxContext verbatim. See amdSizing.
+	ContextSize int
 }
 
 // KernelParams returns the tuning the supervisor should apply for the
@@ -108,14 +118,14 @@ type SlotTuning struct {
 // `~/Develop/quenchforge/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m:1665-1717`
 // — the `buf->is_shared` fast path uses plain `memcpy`). Adding flags
 // on those profiles would regress throughput without any safety win.
-func KernelParams(profile hardware.Profile, kind gateway.SlotKind, cfg config.Config) SlotTuning {
+func KernelParams(profile hardware.Profile, vramGB int, kind gateway.SlotKind, cfg config.Config) SlotTuning {
 	switch kind {
 	case gateway.KindChat:
-		return chatParams(profile)
+		return chatParams(profile, vramGB)
 	case gateway.KindEmbed, gateway.KindCodeEmbed:
-		return embedParams(profile, cfg)
+		return embedParams(profile, vramGB, cfg)
 	case gateway.KindRerank:
-		return rerankParams(profile, cfg)
+		return rerankParams(profile, vramGB, cfg)
 	}
 	// Whisper / imagegen and any future kinds fall through unchanged.
 	return SlotTuning{}
@@ -129,10 +139,11 @@ func KernelParams(profile hardware.Profile, kind gateway.SlotKind, cfg config.Co
 // theory that chat is naturally bursty; cerid eval workloads broke
 // that assumption (chat.log entry at 2026-05-16T23:14 — task 143
 // hit GGML_ASSERT at `set_tensor` after ~30 successful chat calls).
-func chatParams(profile hardware.Profile) SlotTuning {
+func chatParams(profile hardware.Profile, vramGB int) SlotTuning {
 	if !profileIsAMDDiscrete(profile) {
 		return SlotTuning{}
 	}
+	ctxCap, _ := amdSizing(vramGB)
 	// AMD-discrete chat slot runs on GPU as of v0.8.0. The MTLDispatchTypeConcurrent
 	// race that produced cross-call non-determinism is disabled via
 	// MetalConcurrencyDisable -> GGML_METAL_CONCURRENCY_DISABLE=1. The family-B
@@ -150,6 +161,7 @@ func chatParams(profile hardware.Profile) SlotTuning {
 			"--no-cache-prompt",
 			"--gpu-layers", "999",
 		},
+		ContextSize:             ctxCap,
 		MetalConcurrencyDisable: true,
 		AutoRespawn:             true,
 	}
@@ -174,23 +186,28 @@ func chatParams(profile hardware.Profile) SlotTuning {
 //   - AMD discrete profiles additionally enable AutoRespawn — the
 //     supervisor brings the slot back on a Metal SIGABRT instead of
 //     leaving it dead until manual restart.
-func embedParams(profile hardware.Profile, cfg config.Config) SlotTuning {
+func embedParams(profile hardware.Profile, vramGB int, cfg config.Config) SlotTuning {
 	ubatch := cfg.MaxContext
 	metalNCB := cfg.MetalNCB
+	ctxCap := 0
 	if profileIsAMDDiscrete(profile) {
-		// AMD-discrete on GPU (v0.8.0) needs the 1024 ubatch cap re-enabled —
+		// AMD-discrete on GPU (v0.8.0) needs the ubatch cap re-enabled —
 		// it bounds per-call Metal staging-buffer pressure even with patch 0002's
-		// pool in place. CLAUDE.md operational gotcha #2 documents this knob.
-		ubatch = amdEmbedUbatchDefault
+		// pool in place. As of v0.8.0 the cap is VRAM-tier-adaptive (1024 on
+		// >= 12 GB cards, 512 on 8 GB, 256 on 4 GB) so smaller cards don't OOM
+		// without an operator setting QUENCHFORGE_EMBED_UBATCH_SIZE by hand.
+		// CLAUDE.md operational gotcha #2 documents this knob.
+		ctxCap, ubatch = amdSizing(vramGB)
 		metalNCB = amdEmbedMetalNCBDefault
 	}
 	if cfg.EmbedUbatchSize > 0 {
 		ubatch = cfg.EmbedUbatchSize
 	}
 	t := SlotTuning{
-		UbatchSize: ubatch,
-		BatchSize:  ubatch,
-		MetalNCB:   metalNCB,
+		UbatchSize:  ubatch,
+		BatchSize:   ubatch,
+		MetalNCB:    metalNCB,
+		ContextSize: ctxCap,
 	}
 	if cfg.EmbedMetalNCB > 0 {
 		t.MetalNCB = cfg.EmbedMetalNCB
@@ -224,13 +241,15 @@ func embedParams(profile hardware.Profile, cfg config.Config) SlotTuning {
 //
 // AutoRespawn fires on AMD discrete same as embed. AMD profiles also
 // get the conservative MetalNCB=1 default same as embed.
-func rerankParams(profile hardware.Profile, cfg config.Config) SlotTuning {
+func rerankParams(profile hardware.Profile, vramGB int, cfg config.Config) SlotTuning {
 	t := SlotTuning{}
 	if cfg.RerankBatchSize > 0 {
 		t.BatchSize = cfg.RerankBatchSize
 		t.UbatchSize = cfg.RerankBatchSize
 	}
 	if profileIsAMDDiscrete(profile) {
+		ctxCap, _ := amdSizing(vramGB)
+		t.ContextSize = ctxCap
 		t.MetalNCB = amdEmbedMetalNCBDefault
 	}
 	if cfg.RerankMetalNCB > 0 {
@@ -251,6 +270,36 @@ func rerankParams(profile hardware.Profile, cfg config.Config) SlotTuning {
 	return t
 }
 
+// amdSizing returns the VRAM-tier-adaptive context ceiling and embed
+// ubatch for an AMD-discrete profile. A contextCap of 0 means "no cap —
+// honour cfg.MaxContext as-is".
+//
+// The high tier (>= 12 GB: Vega II/Duo, W6800X, W6900X, W5700X,
+// Pro Vega 64) keeps the Vega-II-validated values: no context cap, ubatch
+// 1024. Smaller cards scale both down so the KV cache + Metal staging
+// buffers fit without an operator hand-tuning QUENCHFORGE_MAX_CONTEXT /
+// QUENCHFORGE_EMBED_UBATCH_SIZE:
+//
+//	VRAM        context cap   embed ubatch   example cards
+//	>= 12 GB    none (0)      1024           Vega II/Duo, W6800X, W6900X, W5700X, Pro Vega 64
+//	7-11 GB     4096          512            RX 5700 / 5700 XT, Pro Vega 56, 5600M
+//	<= 6 GB     2048          256            4 GB MacBook Pro dGPUs (5300M/5500M), Polaris 560X
+//
+// vramGB <= 0 means detection could not read VRAM; treat it as the high
+// tier so a probe miss never throttles the validated Vega II path. The
+// caps only ever LOWER cfg.MaxContext (buildSlotArgs takes the min), so
+// the uncapped high tier leaves a raised QUENCHFORGE_MAX_CONTEXT intact.
+func amdSizing(vramGB int) (contextCap, ubatch int) {
+	switch {
+	case vramGB <= 0 || vramGB >= 12: // probe miss or high tier: no cap
+		return 0, amdEmbedUbatchDefault
+	case vramGB >= 7: // 7-11 GB tier
+		return 4096, 512
+	default: // 1-6 GB tier
+		return 2048, 256
+	}
+}
+
 // profileIsAMDDiscrete inlines hardware.Info.IsAMDDiscrete logic
 // against a raw Profile (we don't have a full Info here). Kept in sync
 // with internal/hardware/hardware.go::IsAMDDiscrete by the test in
diff --git a/internal/tuning/tuning_test.go b/internal/tuning/tuning_test.go
index 20a4521..cd12475 100644
--- a/internal/tuning/tuning_test.go
+++ b/internal/tuning/tuning_test.go
@@ -36,6 +36,12 @@ var amdProfiles = []hardware.Profile{
 	hardware.ProfileRDNA2,
 }
 
+// vramHigh is a >= 12 GB headline VRAM (e.g. Vega II's 32 GB). At this
+// tier amdSizing imposes no context cap and keeps ubatch 1024, so the
+// pre-v0.8.0 assertions below remain valid verbatim. Low-VRAM behaviour
+// has its own dedicated tests.
+const vramHigh = 32
+
 func TestProfileIsAMDDiscrete_MatchesHardwarePackage(t *testing.T) {
 	// Cross-check our inline AMD predicate against hardware.Info's
 	// IsAMDDiscrete. If hardware adds or removes a profile from the
@@ -55,7 +61,7 @@ func TestKernelParams_ChatAMDGetsGPUWithConcurrencyDisable(t *testing.T) {
 	cfg := config.Config{MaxContext: 8192}
 	for _, p := range amdProfiles {
 		t.Run(string(p), func(t *testing.T) {
-			tn := KernelParams(p, gateway.KindChat, cfg)
+			tn := KernelParams(p, vramHigh, gateway.KindChat, cfg)
 			wantExtra := []string{
 				"--flash-attn", "off",
 				"--cache-ram", "0",
@@ -94,7 +100,7 @@ func TestKernelParams_ChatNonAMDIsEmpty(t *testing.T) {
 			continue
 		}
 		t.Run(string(p), func(t *testing.T) {
-			tn := KernelParams(p, gateway.KindChat, cfg)
+			tn := KernelParams(p, vramHigh, gateway.KindChat, cfg)
 			if !slices.Equal(tn.ExtraArgs, nil) && len(tn.ExtraArgs) != 0 {
 				t.Errorf("chat %s should emit no ExtraArgs, got %v",
 					p, tn.ExtraArgs)
@@ -115,7 +121,7 @@ func TestKernelParams_EmbedDefaultsByProfile(t *testing.T) {
 	for _, p := range allProfiles {
 		for _, k := range []gateway.SlotKind{gateway.KindEmbed, gateway.KindCodeEmbed} {
 			t.Run(string(p)+"/"+string(k), func(t *testing.T) {
-				tn := KernelParams(p, k, cfg)
+				tn := KernelParams(p, vramHigh, k, cfg)
 				wantUbatch := 8192
 				wantNCB := 0
 				if profileIsAMDDiscrete(p) {
@@ -148,7 +154,7 @@ func TestKernelParams_EmbedAMDGetsGPUWithConcurrencyDisable(t *testing.T) {
 	for _, p := range allProfiles {
 		for _, k := range []gateway.SlotKind{gateway.KindEmbed, gateway.KindCodeEmbed} {
 			t.Run(string(p)+"/"+string(k), func(t *testing.T) {
-				tn := KernelParams(p, k, cfg)
+				tn := KernelParams(p, vramHigh, k, cfg)
 				hasGPUFlag := containsSubslice(tn.ExtraArgs, []string{"--gpu-layers", "999"})
 				if profileIsAMDDiscrete(p) {
 					if !hasGPUFlag {
@@ -181,7 +187,7 @@ func TestKernelParams_EmbedAMDMultithreading(t *testing.T) {
 	for _, p := range allProfiles {
 		for _, k := range []gateway.SlotKind{gateway.KindEmbed, gateway.KindCodeEmbed, gateway.KindRerank} {
 			t.Run(string(p)+"/"+string(k), func(t *testing.T) {
-				tn := KernelParams(p, k, cfg)
+				tn := KernelParams(p, vramHigh, k, cfg)
 				hasThreads := containsArgPair(tn.ExtraArgs, "--threads", "15")
 				hasParallel := containsArgPair(tn.ExtraArgs, "--parallel", "4")
 				if profileIsAMDDiscrete(p) {
@@ -244,7 +250,7 @@ func TestKernelParams_RerankAMDGetsGPUWithConcurrencyDisable(t *testing.T) {
 	cfg := config.Config{MaxContext: 8192}
 	for _, p := range allProfiles {
 		t.Run(string(p), func(t *testing.T) {
-			tn := KernelParams(p, gateway.KindRerank, cfg)
+			tn := KernelParams(p, vramHigh, gateway.KindRerank, cfg)
 			hasGPUFlag := containsSubslice(tn.ExtraArgs, []string{"--gpu-layers", "999"})
 			if profileIsAMDDiscrete(p) {
 				if !hasGPUFlag {
@@ -273,7 +279,7 @@ func TestKernelParams_RerankAMDGetsMetalNCBDefault(t *testing.T) {
 	cfg := config.Config{MaxContext: 8192}
 	for _, p := range allProfiles {
 		t.Run(string(p), func(t *testing.T) {
-			tn := KernelParams(p, gateway.KindRerank, cfg)
+			tn := KernelParams(p, vramHigh, gateway.KindRerank, cfg)
 			wantNCB := 0
 			if profileIsAMDDiscrete(p) {
 				wantNCB = amdEmbedMetalNCBDefault
@@ -289,7 +295,7 @@ func TestKernelParams_RerankAMDGetsMetalNCBDefault(t *testing.T) {
 func TestKernelParams_EmbedHonoursUbatchOverride(t *testing.T) {
 	// Operator-set QUENCHFORGE_EMBED_UBATCH_SIZE wins.
 	cfg := config.Config{MaxContext: 8192, EmbedUbatchSize: 1024}
-	tn := KernelParams(hardware.ProfileVegaPro, gateway.KindEmbed, cfg)
+	tn := KernelParams(hardware.ProfileVegaPro, vramHigh, gateway.KindEmbed, cfg)
 	if tn.UbatchSize != 1024 {
 		t.Errorf("UbatchSize = %d, want 1024 (env override)", tn.UbatchSize)
 	}
@@ -300,7 +306,7 @@ func TestKernelParams_EmbedHonoursUbatchOverride(t *testing.T) {
 
 func TestKernelParams_EmbedHonoursMetalNCBOverride(t *testing.T) {
 	cfg := config.Config{MaxContext: 8192, EmbedMetalNCB: 1}
-	tn := KernelParams(hardware.ProfileVegaPro, gateway.KindEmbed, cfg)
+	tn := KernelParams(hardware.ProfileVegaPro, vramHigh, gateway.KindEmbed, cfg)
 	if tn.MetalNCB != 1 {
 		t.Errorf("MetalNCB = %d, want 1 (env override)", tn.MetalNCB)
 	}
@@ -311,7 +317,7 @@ func TestKernelParams_EmbedAMDGetsAutoRespawn(t *testing.T) {
 	for _, p := range amdProfiles {
 		for _, k := range []gateway.SlotKind{gateway.KindEmbed, gateway.KindCodeEmbed} {
 			t.Run(string(p)+"/"+string(k), func(t *testing.T) {
-				tn := KernelParams(p, k, cfg)
+				tn := KernelParams(p, vramHigh, k, cfg)
 				if !tn.AutoRespawn {
 					t.Errorf("%s %s should request AutoRespawn on AMD", p, k)
 				}
@@ -329,7 +335,7 @@ func TestKernelParams_EmbedNonAMDNoAutoRespawn(t *testing.T) {
 			continue
 		}
 		t.Run(string(p), func(t *testing.T) {
-			tn := KernelParams(p, gateway.KindEmbed, cfg)
+			tn := KernelParams(p, vramHigh, gateway.KindEmbed, cfg)
 			if tn.AutoRespawn {
 				t.Errorf("%s should NOT request AutoRespawn", p)
 			}
@@ -344,7 +350,7 @@ func TestKernelParams_RerankNoBatchOverrideByDefault(t *testing.T) {
 	cfg := config.Config{MaxContext: 8192}
 	for _, p := range allProfiles {
 		t.Run(string(p), func(t *testing.T) {
-			tn := KernelParams(p, gateway.KindRerank, cfg)
+			tn := KernelParams(p, vramHigh, gateway.KindRerank, cfg)
 			if tn.BatchSize != 0 {
 				t.Errorf("%s rerank BatchSize = %d, want 0 (no override)",
 					p, tn.BatchSize)
@@ -355,7 +361,7 @@ func TestKernelParams_RerankNoBatchOverrideByDefault(t *testing.T) {
 
 func TestKernelParams_RerankHonoursBatchOverride(t *testing.T) {
 	cfg := config.Config{MaxContext: 8192, RerankBatchSize: 2048}
-	tn := KernelParams(hardware.ProfileVegaPro, gateway.KindRerank, cfg)
+	tn := KernelParams(hardware.ProfileVegaPro, vramHigh, gateway.KindRerank, cfg)
 	if tn.BatchSize != 2048 {
 		t.Errorf("BatchSize = %d, want 2048", tn.BatchSize)
 	}
@@ -368,7 +374,7 @@ func TestKernelParams_RerankAMDGetsAutoRespawn(t *testing.T) {
 	cfg := config.Config{MaxContext: 8192}
 	for _, p := range amdProfiles {
 		t.Run(string(p), func(t *testing.T) {
-			tn := KernelParams(p, gateway.KindRerank, cfg)
+			tn := KernelParams(p, vramHigh, gateway.KindRerank, cfg)
 			if !tn.AutoRespawn {
 				t.Errorf("%s rerank should request AutoRespawn on AMD", p)
 			}
@@ -376,6 +382,109 @@ func TestKernelParams_RerankAMDGetsAutoRespawn(t *testing.T) {
 	}
 }
 
+// ---------------------------------------------------------------------------
+// VRAM-tier-adaptive sizing (v0.8.0)
+// ---------------------------------------------------------------------------
+
+func TestAmdSizing_Tiers(t *testing.T) {
+	// Whitebox: the (contextCap, ubatch) curve over VRAM. <=0 and >=12
+	// are the high tier (no cap, validated 1024) so a probe miss or a
+	// big card never throttles. 8 GB scales to 4096/512; 4 GB to 2048/256.
+	cases := []struct {
+		vram       int
+		wantCtx    int
+		wantUbatch int
+	}{
+		{vram: 0, wantCtx: 0, wantUbatch: amdEmbedUbatchDefault},  // probe miss -> high
+		{vram: -1, wantCtx: 0, wantUbatch: amdEmbedUbatchDefault}, // negative -> high
+		{vram: 32, wantCtx: 0, wantUbatch: amdEmbedUbatchDefault}, // Vega II
+		{vram: 16, wantCtx: 0, wantUbatch: amdEmbedUbatchDefault}, // W6800X-class
+		{vram: 12, wantCtx: 0, wantUbatch: amdEmbedUbatchDefault}, // tier boundary (incl)
+		{vram: 11, wantCtx: 4096, wantUbatch: 512},                // just below high
+		{vram: 8, wantCtx: 4096, wantUbatch: 512},                 // RX 5700
+		{vram: 7, wantCtx: 4096, wantUbatch: 512},                 // low boundary (incl)
+		{vram: 6, wantCtx: 2048, wantUbatch: 256},                 // tiny boundary
+		{vram: 4, wantCtx: 2048, wantUbatch: 256},                 // 4 GB MBP dGPU
+	}
+	for _, c := range cases {
+		ctx, ub := amdSizing(c.vram)
+		if ctx != c.wantCtx || ub != c.wantUbatch {
+			t.Errorf("amdSizing(%d) = (ctx %d, ubatch %d), want (ctx %d, ubatch %d)",
+				c.vram, ctx, ub, c.wantCtx, c.wantUbatch)
+		}
+	}
+}
+
+func TestKernelParams_EmbedLowVRAMScalesDown(t *testing.T) {
+	// An 8 GB AMD card must get the reduced embed ubatch (512) and a
+	// context ceiling (4096) without any operator env var.
+	cfg := config.Config{MaxContext: 8192}
+	for _, p := range amdProfiles {
+		for _, k := range []gateway.SlotKind{gateway.KindEmbed, gateway.KindCodeEmbed} {
+			t.Run(string(p)+"/"+string(k), func(t *testing.T) {
+				tn := KernelParams(p, 8, k, cfg)
+				if tn.UbatchSize != 512 || tn.BatchSize != 512 {
+					t.Errorf("%s %s ubatch/batch = %d/%d, want 512/512",
+						p, k, tn.UbatchSize, tn.BatchSize)
+				}
+				if tn.ContextSize != 4096 {
+					t.Errorf("%s %s ContextSize = %d, want 4096", p, k, tn.ContextSize)
+				}
+			})
+		}
+	}
+}
+
+func TestKernelParams_ContextCapAppliesToAllAMDSlots(t *testing.T) {
+	// A 4 GB card caps context to 2048 on every AMD slot kind (chat,
+	// embed, code-embed, rerank) — the KV cache is the dominant VRAM
+	// consumer and must shrink uniformly.
+	cfg := config.Config{MaxContext: 8192}
+	for _, k := range []gateway.SlotKind{
+		gateway.KindChat, gateway.KindEmbed, gateway.KindCodeEmbed, gateway.KindRerank,
+	} {
+		t.Run(string(k), func(t *testing.T) {
+			tn := KernelParams(hardware.ProfileVegaPro, 4, k, cfg)
+			if tn.ContextSize != 2048 {
+				t.Errorf("%s ContextSize = %d, want 2048", k, tn.ContextSize)
+			}
+		})
+	}
+}
+
+func TestKernelParams_HighVRAMAndNonAMDHaveNoContextCap(t *testing.T) {
+	// At vramHigh every profile (AMD or not) must leave ContextSize 0 so
+	// buildSlotArgs keeps cfg.MaxContext. NOTE(review): non-AMD at low VRAM and KindCodeEmbed are not exercised here.
+	cfg := config.Config{MaxContext: 8192}
+	for _, p := range allProfiles {
+		for _, k := range []gateway.SlotKind{
+			gateway.KindChat, gateway.KindEmbed, gateway.KindRerank,
+		} {
+			t.Run(string(p)+"/"+string(k), func(t *testing.T) {
+				tn := KernelParams(p, vramHigh, k, cfg)
+				if tn.ContextSize != 0 {
+					t.Errorf("%s %s ContextSize = %d, want 0 (no cap)",
+						p, k, tn.ContextSize)
+				}
+			})
+		}
+	}
+}
+
+func TestKernelParams_UbatchOverrideBeatsTierButCapStands(t *testing.T) {
+	// An explicit QUENCHFORGE_EMBED_UBATCH_SIZE wins over the tier ubatch,
+	// but the VRAM context cap is independent and still applies — the two
+	// knobs protect different resources.
+	cfg := config.Config{MaxContext: 8192, EmbedUbatchSize: 2048}
+	tn := KernelParams(hardware.ProfileVegaPro, 4, gateway.KindEmbed, cfg)
+	if tn.UbatchSize != 2048 {
+		t.Errorf("UbatchSize = %d, want 2048 (operator override wins)", tn.UbatchSize)
+	}
+	if tn.ContextSize != 2048 {
+		t.Errorf("ContextSize = %d, want 2048 (cap independent of ubatch override)", tn.ContextSize)
+	}
+}
+
 func TestKernelParams_UnknownKindsAreEmpty(t *testing.T) {
 	// Whisper / imagegen / future kinds: tuning module shouldn't emit
 	// anything until explicitly added. Prevents accidental flag
@@ -383,7 +492,7 @@ func TestKernelParams_UnknownKindsAreEmpty(t *testing.T) {
 	cfg := config.Config{MaxContext: 8192}
 	for _, k := range []gateway.SlotKind{gateway.KindWhisper, gateway.KindImageGen} {
 		t.Run(string(k), func(t *testing.T) {
-			tn := KernelParams(hardware.ProfileVegaPro, k, cfg)
+			tn := KernelParams(hardware.ProfileVegaPro, vramHigh, k, cfg)
 			if tn.UbatchSize != 0 || tn.BatchSize != 0 || tn.MetalNCB != 0 ||
 				len(tn.ExtraArgs) != 0 || tn.AutoRespawn {
 				t.Errorf("%s should emit empty SlotTuning, got %+v", k, tn)