diff --git a/omlx/admin/benchmark.py b/omlx/admin/benchmark.py
index 7f0d620b..3788f703 100644
--- a/omlx/admin/benchmark.py
+++ b/omlx/admin/benchmark.py
@@ -152,6 +152,10 @@ def _compute_single_metrics(
     ttft_ms = ttft_s * 1000
     tpot_ms = (gen_duration / max(completion_tokens - 1, 1)) * 1000
     gen_tps = completion_tokens / max(gen_duration, 1e-9)
+    # wall_tg_tps: gen tokens over total wall (includes prefill). Used as the
+    # 1x baseline for Continuous Batching speedup so the ratio is symmetric
+    # with batch tg_tps (which is also total_gen / wall_time).
+    wall_tg_tps = completion_tokens / max(e2e_duration, 1e-9)
     processing_tps = prompt_tokens / max(ttft_s, 1e-9)
     total_throughput = (prompt_tokens + completion_tokens) / max(e2e_duration, 1e-9)
 
@@ -159,6 +163,7 @@ def _compute_single_metrics(
         "ttft_ms": round(ttft_ms, 1),
         "tpot_ms": round(tpot_ms, 2),
         "gen_tps": round(gen_tps, 1),
+        "wall_tg_tps": round(wall_tg_tps, 1),
         "processing_tps": round(processing_tps, 1),
         "e2e_latency_s": round(e2e_duration, 3),
         "total_throughput": round(total_throughput, 1),
@@ -238,6 +243,76 @@ async def _run_single_test(
     )
 
 
+
+async def _run_batch_test_via_stream_generate(
+    engine: Any,
+    prompts: list[str],
+    prompt_tokens: int,
+    max_tokens: int,
+    batch_size: int,
+) -> dict:
+    """Run a batch test using only the engine's high-level stream_generate
+    (no engine_core). Used for DFlashEngine where concurrent requests
+    serialize through _active_request; metrics will reflect serial behavior.
+    """
+
+    async def _single_request(prompt: str) -> dict:  # times one streamed request
+        start = time.perf_counter()
+        first_token = None
+        completion_tokens = 0
+        async for output in engine.stream_generate(
+            prompt=prompt, max_tokens=max_tokens, temperature=0.0
+        ):
+            ct = getattr(output, "completion_tokens", None)  # None when the output lacks the field
+            if ct is not None and ct > 0 and first_token is None:  # first output with ct > 0 marks TTFT
+                first_token = time.perf_counter()
+            if ct is not None:
+                completion_tokens = ct  # treated as a running total: assigned, not added
+            if getattr(output, "finished", False):
+                break
+        end = time.perf_counter()
+        if first_token is None:  # no output ever reported ct > 0: ttft_s equals wall_s
+            first_token = end
+        return {
+            "ttft_s": first_token - start,
+            "first_token_abs": first_token,  # perf_counter timestamp, not a duration
+            "completion_tokens": completion_tokens,
+            "wall_s": end - start,  # per-request wall time; not used by the aggregates below
+        }
+
+    wall_start = time.perf_counter()
+    results = await asyncio.gather(
+        *[_single_request(prompts[i]) for i in range(batch_size)]  # NOTE(review): IndexError if len(prompts) < batch_size
+    )
+    wall_end = time.perf_counter()
+
+    total_gen_tokens = sum(r["completion_tokens"] for r in results)
+    total_prompt_tokens = prompt_tokens * batch_size
+    wall_time = wall_end - wall_start
+    avg_ttft_ms = (sum(r["ttft_s"] for r in results) / batch_size) * 1000  # NOTE(review): ZeroDivisionError if batch_size == 0
+
+    # pp TPS: total prompt tokens / time until ALL requests finish prefill
+    max_first_token = max(r["first_token_abs"] for r in results)
+    prefill_wall_time = max_first_token - wall_start
+    pp_tps = total_prompt_tokens / max(prefill_wall_time, 1e-9)
+
+    # tg TPS: wall-aggregate (total_gen / wall_time). Same formula used in
+    # the engine_core batch path so DFlash ↔ BatchedEngine ratios are
+    # symmetric. The alternative (gen_wall_time = wall_end - max_first_token)
+    # is inflated for any engine that serializes prefill or decode, and
+    # makes cross-engine speedup columns meaningless. See _run_batch_test.
+    tg_tps = total_gen_tokens / max(wall_time, 1e-9)
+
+    return {
+        "pp_tps": round(pp_tps, 1),
+        "tg_tps": round(tg_tps, 1),
+        "avg_ttft_ms": round(avg_ttft_ms, 1),
+        "e2e_latency_s": round(wall_time, 3),
+        "total_gen_tokens": total_gen_tokens,
+        "batch_size": batch_size,
+    }
+
+
 async def _run_batch_test(
     engine: Any,
     prompts: list[str],
@@ -247,7 +322,7 @@ async def _run_batch_test(
 ) -> dict:
     """Run a continuous batching benchmark test.
 
-    Submits batch_size concurrent requests via the engine core and measures
+    Submits batch_size concurrent requests via the engine and measures
     aggregate throughput including pp TPS and tg TPS.
 
     Args:
@@ -256,6 +331,18 @@ async def _run_batch_test(
                  has a unique UUID prefix.
         prompt_tokens: Number of prompt tokens per request (for pp TPS calc).
     """
+    # Dispatch: engines exposing engine_core (BatchedEngine, VLMBatchedEngine)
+    # use add_request/stream_outputs; DFlashEngine uses stream_generate (high
+    # level) and serializes requests via its _active_request lock.
+    if not hasattr(engine, "_engine"):
+        return await _run_batch_test_via_stream_generate(
+            engine=engine,
+            prompts=prompts,
+            prompt_tokens=prompt_tokens,
+            max_tokens=max_tokens,
+            batch_size=batch_size,
+        )
+
     from ..request import SamplingParams
 
     engine_core = engine._engine
@@ -314,10 +401,10 @@ async def _single_request(prompt: str) -> dict:
     prefill_wall_time = max_first_token - wall_start
     pp_tps = total_prompt_tokens / max(prefill_wall_time, 1e-9)
 
-    # tg TPS: total generated tokens / generation wall time
-    # Generation starts when the last request finishes prefill
-    gen_wall_time = wall_end - max_first_token
-    tg_tps = total_gen_tokens / max(gen_wall_time, 1e-9)
+    # tg TPS: wall-aggregate (total_gen / wall_time), same as the DFlash
+    # path. Honest cross-engine ratio with the Single Request wall_tg_tps
+    # baseline — see _run_batch_test_via_stream_generate.
+    tg_tps = total_gen_tokens / max(wall_time, 1e-9)
 
     return {
         "pp_tps": round(pp_tps, 1),
@@ -502,7 +589,11 @@ async def _upload_to_omlx_ai(run: BenchmarkRun, engine_pool: Any) -> None:
         (r for r in single_results if r.get("pp") == 1024), None
     )
     if pp1024_single and batch_results:
-        baseline_tps = pp1024_single["gen_tps"]
+        # Use wall-aggregate baseline so 1x ↔ Nx ratios stay honest across
+        # engine types. Single Request table keeps showing gen_tps (peak
+        # decode rate) for context-length comparisons; Continuous Batching
+        # column needs the symmetric metric.
+        baseline_tps = pp1024_single["wall_tg_tps"]
         batching_results.append({
             "batch_size": 1,
             "tg_tps": baseline_tps,
@@ -536,6 +627,9 @@ async def _upload_to_omlx_ai(run: BenchmarkRun, engine_pool: Any) -> None:
             "quantization": quantization,
             "context_length": context_length,
             "pp_tps": result["processing_tps"],
+            # Community board metric: peak decode rate (gen_tps, gen-only),
+            # NOT the wall-aggregate used for in-UI Continuous Batching
+            # speedup. Do not "tidy" this to wall_tg_tps.
             "tg_tps": result["gen_tps"],
             "ttft_ms": result.get("ttft_ms"),
             "peak_memory_gb": peak_mem_gb,
@@ -760,13 +854,12 @@ async def run_benchmark(run: BenchmarkRun, engine_pool: Any) -> None:
         batch_prompts = [_generate_prompt(tokenizer, 1024) for _ in range(max_batch)]
 
-        # Skip batch tests for engines without scheduler core (e.g. DFlashEngine)
+        # Run batch tests for every engine, including those without a scheduler core.
-        if request.batch_sizes and not hasattr(engine, "_engine"):
-            logger.info(
-                "Batch test skipped: engine does not support concurrent batching"
-            )
-            current_test += len(request.batch_sizes)
-
-        for batch_size in request.batch_sizes if hasattr(engine, "_engine") else []:
+        # NOTE: DFlashEngine doesn't expose engine_core (`_engine`) but does
+        # support concurrent requests by serializing them through its
+        # _active_request lock. Running batch test on DFlash yields valid
+        # metrics that show serialization behavior (4x wall time, no
+        # aggregate gain) — strictly more useful than silently skipping.
+        for batch_size in request.batch_sizes:
             current_test += 1
             await _send_event(run, {
                 "type": "progress",
diff --git a/omlx/admin/i18n/en.json b/omlx/admin/i18n/en.json
index 46c349b3..99f1279d 100644
--- a/omlx/admin/i18n/en.json
+++ b/omlx/admin/i18n/en.json
@@ -688,5 +688,28 @@
   "js.error.delete_model_failed": "Failed to delete model",
   "js.error.delete_model_connection": "Failed to delete model. Check server connection.",
   "js.success.download_started": "Download started: {repo_id}",
-  "js.success.settings_saved": "Settings saved successfully"
-}
\ No newline at end of file
+  "js.success.settings_saved": "Settings saved successfully",
+  "modal.model_settings.dflash_hint": "Block diffusion speculative decoding for 3-4x faster generation. Requires a DFlash draft model checkpoint.<br><strong>Single-stream only: requests run one at a time.</strong><br>* MLX impl by bstnxbt(<a href=\"https://github.com/bstnxbt/dflash-mlx\" target=\"_blank\" rel=\"noopener\" class=\"text-blue-500 hover:text-blue-700 underline\">GitHub</a>)",
+  "modal.model_settings.dflash_draft_model": "Draft Model",
+  "modal.model_settings.dflash_draft_model_placeholder": "Select draft model...",
+  "modal.model_settings.dflash_draft_model_help": "DFlash draft checkpoint (e.g. z-lab/Qwen3-4B-DFlash-b16, z-lab/gemma-4-26B-A4B-it-DFlash)",
+  "modal.model_settings.dflash_draft_quant": "Draft Quantization",
+  "modal.model_settings.dflash_draft_quant_help": "Quantization of the draft model only — independent of target model quantization.",
+  "modal.model_settings.dflash_draft_quant_bf16": "bf16 (default)",
+  "modal.model_settings.dflash_max_ctx": "Max Context (fallback threshold)",
+  "modal.model_settings.dflash_max_ctx_placeholder": "unlimited",
+  "modal.model_settings.dflash_max_ctx_help": "Prompts at or above this token count switch to BatchedEngine. Leave empty for unlimited.",
+  "modal.model_settings.dflash_max_concurrent": "Max Concurrent",
+  "modal.model_settings.dflash_max_concurrent_placeholder": "4 (default)",
+  "modal.model_settings.dflash_max_concurrent_help": "Cap on simultaneous in-flight DFlash requests; excess requests queue at the gate. DFlash decode is single-stream by design — this value does NOT increase throughput. It is a resource admission gate that bounds memory under bursts (each in-flight request holds its own KV cache, hundreds of MB to several GB) and keeps tail latency predictable. Default 4. On tight memory (< 64 GB) set 1–2; on 128 GB or more set 8. Leave empty for unlimited (not recommended unless you trust the upstream load shape).",
+  "modal.model_settings.dflash_l1_cache": "In-memory cache",
+  "modal.model_settings.dflash_l1_cache_hint": "DFlash L1 prefix snapshot cache in RAM. Speeds up multi-turn chats with shared prefixes.",
+  "modal.model_settings.dflash_l1_max_entries": "In-memory cache max entries",
+  "modal.model_settings.dflash_l1_max_entries_help": "Maximum number of prefix snapshots kept in L1 cache. Each entry stores KV + draft GDN state for one conversation prefix.",
+  "modal.model_settings.dflash_l1_max_gib": "In-memory cache size (GiB)",
+  "modal.model_settings.dflash_l1_max_gib_help": "Byte budget for L1 snapshots; LRU evicts when exceeded.",
+  "modal.model_settings.dflash_l2_cache": "SSD cache",
+  "modal.model_settings.dflash_l2_cache_hint": "L2 spill of evicted L1 entries to disk. Uses the oMLX paged SSD cache directory (<code>dflash_l2/</code>).",
+  "modal.model_settings.dflash_l2_unavailable": "Enable oMLX paged SSD cache first (<code>--paged-ssd-cache-dir</code>).",
+  "modal.model_settings.dflash_l2_requires_l1": "Requires in-memory cache to be enabled."
+}
diff --git a/omlx/admin/i18n/zh-TW.json b/omlx/admin/i18n/zh-TW.json
index 95181843..d314877d 100644
--- a/omlx/admin/i18n/zh-TW.json
+++ b/omlx/admin/i18n/zh-TW.json
@@ -688,5 +688,28 @@
   "js.error.delete_model_failed": "刪除模型失敗",
   "js.error.delete_model_connection": "刪除模型失敗，請檢查伺服器連線。",
   "js.success.download_started": "已開始下載：{repo_id}",
-  "js.success.settings_saved": "設定儲存成功"
+  "js.success.settings_saved": "設定儲存成功",
+  "modal.model_settings.dflash_hint": "塊擴散投機解碼，單請求可加速 3-4 倍。需要 DFlash 草稿模型 checkpoint。<br><strong>僅單流：請求按順序處理。</strong><br>* MLX 實作：bstnxbt(<a href=\"https://github.com/bstnxbt/dflash-mlx\" target=\"_blank\" rel=\"noopener\" class=\"text-blue-500 hover:text-blue-700 underline\">GitHub</a>)",
+  "modal.model_settings.dflash_draft_model": "草稿模型",
+  "modal.model_settings.dflash_draft_model_placeholder": "選擇草稿模型...",
+  "modal.model_settings.dflash_draft_model_help": "DFlash 草稿 checkpoint（例如 z-lab/Qwen3-4B-DFlash-b16、z-lab/gemma-4-26B-A4B-it-DFlash）",
+  "modal.model_settings.dflash_draft_quant": "草稿模型量化",
+  "modal.model_settings.dflash_draft_quant_help": "僅影響草稿模型量化，與主模型量化獨立。",
+  "modal.model_settings.dflash_draft_quant_bf16": "bf16（預設）",
+  "modal.model_settings.dflash_max_ctx": "最大上下文（fallback 閾值）",
+  "modal.model_settings.dflash_max_ctx_placeholder": "不限",
+  "modal.model_settings.dflash_max_ctx_help": "提示長度達到或超過此 token 數時切換到 BatchedEngine。留空為不限。",
+  "modal.model_settings.dflash_max_concurrent": "最大並發",
+  "modal.model_settings.dflash_max_concurrent_placeholder": "4（預設）",
+  "modal.model_settings.dflash_max_concurrent_help": "DFlash 同時處理中的請求數上限，超出的請求在入口排隊等空位。DFlash 解碼本身是嚴格單流設計——這個值不會提高吞吐速率，作用是資源 admission 閘門：突發流量下控制記憶體佔用（每個處理中的請求各自持有 KV cache，幾百 MB 到幾 GB），同時讓尾延遲可預測。預設 4。記憶體緊（< 64 GB）建議設 1-2；128 GB+ 可設 8。留空為不限（除非你確信上游流量形態可控，否則不推薦）。",
+  "modal.model_settings.dflash_l1_cache": "記憶體快取",
+  "modal.model_settings.dflash_l1_cache_hint": "DFlash L1 前綴快照記憶體快取。加速共享前綴的多輪對話。",
+  "modal.model_settings.dflash_l1_max_entries": "記憶體快取最大條目數",
+  "modal.model_settings.dflash_l1_max_entries_help": "L1 快取中保留的最大前綴快照數。每條條目儲存一個對話前綴的 KV + 草稿 GDN 狀態。",
+  "modal.model_settings.dflash_l1_max_gib": "記憶體快取大小（GiB）",
+  "modal.model_settings.dflash_l1_max_gib_help": "L1 快照位元組預算；超過時 LRU 淘汰。",
+  "modal.model_settings.dflash_l2_cache": "SSD 快取",
+  "modal.model_settings.dflash_l2_cache_hint": "L1 淘汰條目 spill 到磁碟的 L2 快取。使用 oMLX paged SSD cache 目錄（<code>dflash_l2/</code>）。",
+  "modal.model_settings.dflash_l2_unavailable": "請先啟用 oMLX paged SSD cache（<code>--paged-ssd-cache-dir</code>）。",
+  "modal.model_settings.dflash_l2_requires_l1": "需要先啟用記憶體快取。"
 }
diff --git a/omlx/admin/i18n/zh.json b/omlx/admin/i18n/zh.json
index 135aa192..6224ce6c 100644
--- a/omlx/admin/i18n/zh.json
+++ b/omlx/admin/i18n/zh.json
@@ -688,5 +688,28 @@
   "js.error.delete_model_failed": "删除模型失败",
   "js.error.delete_model_connection": "删除模型失败，请检查服务器连接。",
   "js.success.download_started": "已开始下载：{repo_id}",
-  "js.success.settings_saved": "设置已成功保存"
+  "js.success.settings_saved": "设置已成功保存",
+  "modal.model_settings.dflash_hint": "块扩散投机解码，单请求可加速 3-4 倍。需要 DFlash 草稿模型 checkpoint。<br><strong>仅单流：请求按顺序处理。</strong><br>* MLX 实现：bstnxbt(<a href=\"https://github.com/bstnxbt/dflash-mlx\" target=\"_blank\" rel=\"noopener\" class=\"text-blue-500 hover:text-blue-700 underline\">GitHub</a>)",
+  "modal.model_settings.dflash_draft_model": "草稿模型",
+  "modal.model_settings.dflash_draft_model_placeholder": "选择草稿模型...",
+  "modal.model_settings.dflash_draft_model_help": "DFlash 草稿 checkpoint（例如 z-lab/Qwen3-4B-DFlash-b16、z-lab/gemma-4-26B-A4B-it-DFlash）",
+  "modal.model_settings.dflash_draft_quant": "草稿模型量化",
+  "modal.model_settings.dflash_draft_quant_help": "仅影响草稿模型量化，与主模型量化独立。",
+  "modal.model_settings.dflash_draft_quant_bf16": "bf16（默认）",
+  "modal.model_settings.dflash_max_ctx": "最大上下文（fallback 阈值）",
+  "modal.model_settings.dflash_max_ctx_placeholder": "不限",
+  "modal.model_settings.dflash_max_ctx_help": "提示长度达到或超过此 token 数时切换到 BatchedEngine。留空为不限。",
+  "modal.model_settings.dflash_max_concurrent": "最大并发",
+  "modal.model_settings.dflash_max_concurrent_placeholder": "4（默认）",
+  "modal.model_settings.dflash_max_concurrent_help": "DFlash 同时处理中的请求数上限，超出的请求在入口排队等空位。DFlash 解码本身是严格单流设计——这个值不会提高吞吐速率，作用是资源 admission 闸门：突发流量下控制内存占用（每个处理中的请求各自持有 KV cache，几百 MB 到几 GB），同时让尾延迟可预测。默认 4。内存紧（< 64 GB）建议设 1-2；128 GB+ 可设 8。留空为不限（除非你确信上游流量形态可控，否则不推荐）。",
+  "modal.model_settings.dflash_l1_cache": "内存缓存",
+  "modal.model_settings.dflash_l1_cache_hint": "DFlash L1 前缀快照内存缓存。加速共享前缀的多轮对话。",
+  "modal.model_settings.dflash_l1_max_entries": "内存缓存最大条目数",
+  "modal.model_settings.dflash_l1_max_entries_help": "L1 缓存中保留的最大前缀快照数。每条条目存储一个对话前缀的 KV + 草稿 GDN 状态。",
+  "modal.model_settings.dflash_l1_max_gib": "内存缓存大小（GiB）",
+  "modal.model_settings.dflash_l1_max_gib_help": "L1 快照字节预算；超过时 LRU 淘汰。",
+  "modal.model_settings.dflash_l2_cache": "SSD 缓存",
+  "modal.model_settings.dflash_l2_cache_hint": "L1 淘汰条目 spill 到磁盘的 L2 缓存。使用 oMLX paged SSD cache 目录（<code>dflash_l2/</code>）。",
+  "modal.model_settings.dflash_l2_unavailable": "请先启用 oMLX paged SSD cache（<code>--paged-ssd-cache-dir</code>）。",
+  "modal.model_settings.dflash_l2_requires_l1": "需要先启用内存缓存。"
 }
diff --git a/omlx/admin/routes.py b/omlx/admin/routes.py
index 3c0851a8..276f632e 100644
--- a/omlx/admin/routes.py
+++ b/omlx/admin/routes.py
@@ -135,6 +135,7 @@ class ModelSettingsRequest(BaseModel):
     dflash_draft_quant_activation_bits: Optional[int] = None
     dflash_draft_quant_group_size: Optional[int] = None
     dflash_max_ctx: Optional[int] = None
+    dflash_max_concurrent: Optional[int] = None
     dflash_in_memory_cache: Optional[bool] = None
     dflash_in_memory_cache_max_entries: Optional[int] = None
     dflash_in_memory_cache_max_bytes: Optional[int] = None
@@ -1662,6 +1663,7 @@ async def list_models(is_admin: bool = Depends(require_admin)):
                 "dflash_draft_quant_activation_bits": settings.dflash_draft_quant_activation_bits,
                 "dflash_draft_quant_group_size": settings.dflash_draft_quant_group_size,
                 "dflash_max_ctx": settings.dflash_max_ctx,
+                "dflash_max_concurrent": settings.dflash_max_concurrent,
                 "dflash_in_memory_cache": settings.dflash_in_memory_cache,
                 "dflash_in_memory_cache_max_entries": settings.dflash_in_memory_cache_max_entries,
                 "dflash_in_memory_cache_max_bytes": settings.dflash_in_memory_cache_max_bytes,
@@ -1946,6 +1948,10 @@ async def update_model_settings(
         # 0/None means "unlimited" — the engine treats None as no fallback threshold
         value = request.dflash_max_ctx
         current_settings.dflash_max_ctx = value if value and value > 0 else None
+    if "dflash_max_concurrent" in sent:
+        # 0/None means "unlimited" — the engine treats None as no concurrent cap
+        value = request.dflash_max_concurrent
+        current_settings.dflash_max_concurrent = value if value and value > 0 else None
     if "dflash_in_memory_cache" in sent:
         current_settings.dflash_in_memory_cache = bool(request.dflash_in_memory_cache)
     if "dflash_in_memory_cache_max_entries" in sent:
diff --git a/omlx/admin/static/js/dashboard.js b/omlx/admin/static/js/dashboard.js
index 94f1841f..e53c7331 100644
--- a/omlx/admin/static/js/dashboard.js
+++ b/omlx/admin/static/js/dashboard.js
@@ -1568,6 +1568,7 @@
                     dflash_draft_quant_activation_bits: settings.dflash_draft_quant_activation_bits || 16,
                     dflash_draft_quant_group_size: settings.dflash_draft_quant_group_size || 64,
                     dflash_max_ctx: settings.dflash_max_ctx ?? null,
+                    dflash_max_concurrent: settings.dflash_max_concurrent ?? null,
                     dflash_in_memory_cache: settings.dflash_in_memory_cache !== false,
                     dflash_in_memory_cache_max_entries: settings.dflash_in_memory_cache_max_entries || 4,
                     dflash_in_memory_cache_max_gib: settings.dflash_in_memory_cache_max_bytes
@@ -1676,6 +1677,9 @@
                                 dflash_max_ctx: this.modelSettings.dflash_enabled && this.modelSettings.dflash_max_ctx
                                     ? parseInt(this.modelSettings.dflash_max_ctx)
                                     : null,
+                                dflash_max_concurrent: this.modelSettings.dflash_enabled && this.modelSettings.dflash_max_concurrent
+                                    ? parseInt(this.modelSettings.dflash_max_concurrent)
+                                    : null,
                                 dflash_in_memory_cache: this.modelSettings.dflash_enabled
                                     ? !!this.modelSettings.dflash_in_memory_cache
                                     : true,
@@ -1771,6 +1775,7 @@
                         this.modelSettings.dflash_draft_quant_activation_bits = null;
                         this.modelSettings.dflash_draft_quant_group_size = null;
                         this.modelSettings.dflash_max_ctx = null;
+                        this.modelSettings.dflash_max_concurrent = null;
                         this.modelSettings.dflash_in_memory_cache = true;
                         this.modelSettings.dflash_in_memory_cache_max_entries = 4;
                         this.modelSettings.dflash_in_memory_cache_max_gib = 8;
@@ -2432,9 +2437,14 @@
             },
 
             benchGetSpeedup(batchResult) {
+                // Symmetric metric: batch tg_tps is wall-aggregate (total_gen /
+                // wall_time), so baseline must also be wall-aggregate. Using
+                // gen_tps (gen-only, excludes prefill) makes the ratio look
+                // sub-1x for any engine that doesn't parallelize prefill.
                 const baseline = this.benchSingleResults.find(r => r.pp === 1024);
-                if (!baseline || !baseline.gen_tps || baseline.gen_tps <= 0) return 0;
-                return batchResult.tg_tps / baseline.gen_tps;
+                const base = baseline && baseline.wall_tg_tps ? baseline.wall_tg_tps : (baseline ? baseline.gen_tps : 0);
+                if (!base || base <= 0) return 0;
+                return batchResult.tg_tps / base;
             },
 
             benchFormatMemory(bytes) {
@@ -2468,7 +2478,7 @@
                             pad(r.ttft_ms.toFixed(1), 10),
                             pad(r.tpot_ms.toFixed(2), 10),
                             pad(r.processing_tps.toFixed(1) + ' tok/s', 12),
-                            pad(r.gen_tps.toFixed(1) + ' tok/s', 12),
+                            pad(((r.wall_tg_tps ?? r.gen_tps)).toFixed(1) + ' tok/s', 12),
                             pad(r.e2e_latency_s.toFixed(3), 10),
                             pad(r.total_throughput.toFixed(1) + ' tok/s', 12),
                             pad(this.benchFormatMemory(r.peak_memory_bytes), 10),
@@ -2487,10 +2497,13 @@
                     lines.push('-'.repeat(80));
                     const hdr = [rpad('Batch', 8), pad('tg TPS', 12), pad('Speedup', 8), pad('pp TPS', 12), pad('pp TPS/req', 12), pad('TTFT(ms)', 10), pad('E2E(s)', 10)];
                     lines.push(hdr.join('  '));
+                    // 1x baseline uses wall_tg_tps (wall-aggregate) to stay
+                    // symmetric with batch tg_tps below. See benchGetSpeedup.
+                    const baseTg = baseline && baseline.wall_tg_tps ? baseline.wall_tg_tps : (baseline ? baseline.gen_tps : 0);
                     if (baseline) {
                         const row = [
                             rpad('1x', 8),
-                            pad(baseline.gen_tps.toFixed(1) + ' tok/s', 12),
+                            pad(baseTg.toFixed(1) + ' tok/s', 12),
                             pad('1.00x', 8),
                             pad(baseline.processing_tps.toFixed(1) + ' tok/s', 12),
                             pad(baseline.processing_tps.toFixed(1) + ' tok/s', 12),
@@ -2500,7 +2513,7 @@
                         lines.push(row.join('  '));
                     }
                     for (const r of results) {
-                        const speedup = baseline && baseline.gen_tps > 0 ? (r.tg_tps / baseline.gen_tps).toFixed(2) + 'x' : '-';
+                        const speedup = baseTg > 0 ? (r.tg_tps / baseTg).toFixed(2) + 'x' : '-';
                         const row = [
                             rpad(r.batch_size + 'x', 8),
                             pad(r.tg_tps.toFixed(1) + ' tok/s', 12),
diff --git a/omlx/admin/templates/dashboard/_bench.html b/omlx/admin/templates/dashboard/_bench.html
index e2a31432..49f846a0 100644
--- a/omlx/admin/templates/dashboard/_bench.html
+++ b/omlx/admin/templates/dashboard/_bench.html
@@ -188,7 +188,7 @@ <h3 class="text-2xl font-bold tracking-tight text-neutral-900">{{ t('bench.headi
                                             <td class="px-4 py-3 text-right tabular-nums text-neutral-700" x-text="r.ttft_ms.toFixed(1)"></td>
                                             <td class="px-4 py-3 text-right tabular-nums text-neutral-700" x-text="r.tpot_ms.toFixed(2)"></td>
                                             <td class="px-4 py-3 text-right tabular-nums text-neutral-700" x-text="r.processing_tps.toFixed(1) + ' tok/s'"></td>
-                                            <td class="px-4 py-3 text-right tabular-nums font-medium text-neutral-900" x-text="r.gen_tps.toFixed(1) + ' tok/s'"></td>
+                                            <td class="px-4 py-3 text-right tabular-nums font-medium text-neutral-900" x-text="(r.wall_tg_tps ?? r.gen_tps).toFixed(1) + ' tok/s'"></td>
                                             <td class="px-4 py-3 text-right tabular-nums text-neutral-700" x-text="r.e2e_latency_s.toFixed(3) + 's'"></td>
                                             <td class="px-4 py-3 text-right tabular-nums text-neutral-700" x-text="r.total_throughput.toFixed(1) + ' tok/s'"></td>
                                             <td class="px-4 py-3 text-right tabular-nums text-neutral-700" x-text="benchFormatMemory(r.peak_memory_bytes)"></td>
@@ -225,7 +225,7 @@ <h3 class="text-2xl font-bold tracking-tight text-neutral-900">{{ t('bench.headi
                                     <!-- 1x baseline from single request -->
                                     <tr x-show="benchSingleResults.some(r => r.pp === 1024)" class="border-b border-neutral-100 bg-neutral-50/50">
                                         <td class="px-4 py-3 font-medium text-neutral-500">{{ t('bench.results.batch.baseline') }}</td>
-                                        <td class="px-4 py-3 text-right tabular-nums text-neutral-500" x-text="(benchSingleResults.find(r => r.pp === 1024)?.gen_tps?.toFixed(1) || '-') + ' tok/s'"></td>
+                                        <td class="px-4 py-3 text-right tabular-nums text-neutral-500" x-text="((benchSingleResults.find(r => r.pp === 1024)?.wall_tg_tps ?? benchSingleResults.find(r => r.pp === 1024)?.gen_tps)?.toFixed(1) || '-') + ' tok/s'"></td>
                                         <td class="px-4 py-3 text-right tabular-nums text-neutral-500">1.00x</td>
                                         <td class="px-4 py-3 text-right tabular-nums text-neutral-500" x-text="(benchSingleResults.find(r => r.pp === 1024)?.processing_tps?.toFixed(1) || '-') + ' tok/s'"></td>
                                         <td class="px-4 py-3 text-right tabular-nums text-neutral-500" x-text="(benchSingleResults.find(r => r.pp === 1024)?.processing_tps?.toFixed(1) || '-') + ' tok/s'"></td>
diff --git a/omlx/admin/templates/dashboard/_modal_model_settings.html b/omlx/admin/templates/dashboard/_modal_model_settings.html
index e5c262e6..39e122f1 100644
--- a/omlx/admin/templates/dashboard/_modal_model_settings.html
+++ b/omlx/admin/templates/dashboard/_modal_model_settings.html
@@ -661,10 +661,10 @@ <h4 class="text-xs font-bold uppercase tracking-widest text-neutral-400 mb-3">{{
                                     </div>
                                     <div x-show="modelSettings.specprefill_enabled" x-transition class="space-y-3 pt-1">
                                         <div>
-                                            <label class="block text-xs font-bold uppercase tracking-wider text-neutral-500 mb-2">Draft Model</label>
+                                            <label class="block text-xs font-bold uppercase tracking-wider text-neutral-500 mb-2">{{ t('modal.model_settings.dflash_draft_model') }}</label>
                                             <select x-model="modelSettings.specprefill_draft_model"
                                                     class="w-full px-4 py-2.5 border border-neutral-200 rounded-xl text-sm focus:ring-2 focus:ring-neutral-900 focus:border-transparent transition-all bg-white">
-                                                <option value="">Select draft model...</option>
+                                                <option value="">{{ t('modal.model_settings.dflash_draft_model_placeholder') }}</option>
                                                 <template x-for="m in models.filter(m => m.id !== selectedModel?.id)" :key="m.id">
                                                     <option :value="m.model_path || m.id" x-text="m.id" :selected="modelSettings.specprefill_draft_model === (m.model_path || m.id)"></option>
                                                 </template>
@@ -770,15 +770,21 @@ <h4 class="text-xs font-bold uppercase tracking-widest text-neutral-400 mb-3">{{
                                             </div>
                                         </div>
                                         <div>
-                                            <label class="block text-xs font-bold uppercase tracking-wider text-neutral-500 mb-2">Max Context (fallback threshold)</label>
-                                            <input type="number" x-model.number="modelSettings.dflash_max_ctx" min="1" step="1024" placeholder="unlimited"
+                                            <label class="block text-xs font-bold uppercase tracking-wider text-neutral-500 mb-2">{{ t('modal.model_settings.dflash_max_ctx') }}</label>
+                                            <input type="number" x-model.number="modelSettings.dflash_max_ctx" min="1" step="1024" :placeholder="t('modal.model_settings.dflash_max_ctx_placeholder')"
+                                                   class="w-full px-4 py-2.5 border border-neutral-200 rounded-xl text-sm focus:ring-2 focus:ring-neutral-900 focus:border-transparent transition-all">
+                                            <p class="text-xs text-neutral-400 mt-1">{{ t('modal.model_settings.dflash_max_ctx_help') }}</p>
+                                        </div>
+                                        <div>
+                                            <label class="block text-xs font-bold uppercase tracking-wider text-neutral-500 mb-2">{{ t('modal.model_settings.dflash_max_concurrent') }}</label>
+                                            <input type="number" x-model.number="modelSettings.dflash_max_concurrent" min="1" step="1" :placeholder="t('modal.model_settings.dflash_max_concurrent_placeholder')"
                                                    class="w-full px-4 py-2.5 border border-neutral-200 rounded-xl text-sm focus:ring-2 focus:ring-neutral-900 focus:border-transparent transition-all">
-                                            <p class="text-xs text-neutral-400 mt-1">Prompts at or above this token count switch to BatchedEngine. Leave empty for unlimited.</p>
+                                            <p class="text-xs text-neutral-400 mt-1">{{ t('modal.model_settings.dflash_max_concurrent_help') }}</p>
                                         </div>
                                         <div class="flex items-start justify-between gap-3 pt-2 border-t border-neutral-200">
                                             <div class="min-w-0">
-                                                <span class="text-sm font-medium text-neutral-700">In-memory cache</span>
-                                                <p class="text-xs text-neutral-500 mt-0.5">DFlash L1 prefix snapshot cache in RAM. Speeds up multi-turn chats with shared prefixes.</p>
+                                                <span class="text-sm font-medium text-neutral-700">{{ t('modal.model_settings.dflash_l1_cache') }}</span>
+                                                <p class="text-xs text-neutral-500 mt-0.5">{{ t('modal.model_settings.dflash_l1_cache_hint') }}</p>
                                             </div>
                                             <button type="button" @click="modelSettings.dflash_in_memory_cache = !modelSettings.dflash_in_memory_cache; if (!modelSettings.dflash_in_memory_cache) modelSettings.dflash_ssd_cache = false"
                                                     :class="modelSettings.dflash_in_memory_cache ? 'bg-black' : 'bg-neutral-200'"
@@ -788,23 +794,23 @@ <h4 class="text-xs font-bold uppercase tracking-widest text-neutral-400 mb-3">{{
                                             </button>
                                         </div>
                                         <div x-show="modelSettings.dflash_in_memory_cache" x-transition>
-                                            <label class="block text-xs font-bold uppercase tracking-wider text-neutral-500 mb-2">In-memory cache max entries</label>
+                                            <label class="block text-xs font-bold uppercase tracking-wider text-neutral-500 mb-2">{{ t('modal.model_settings.dflash_l1_max_entries') }}</label>
                                             <input type="number" x-model.number="modelSettings.dflash_in_memory_cache_max_entries" min="1" max="128" step="1" placeholder="4"
                                                    class="w-full px-4 py-2.5 border border-neutral-200 rounded-xl text-sm focus:ring-2 focus:ring-neutral-900 focus:border-transparent transition-all">
-                                            <p class="text-xs text-neutral-400 mt-1">Maximum number of prefix snapshots kept in L1 cache. Each entry stores KV + draft GDN state for one conversation prefix.</p>
+                                            <p class="text-xs text-neutral-400 mt-1">{{ t('modal.model_settings.dflash_l1_max_entries_help') }}</p>
                                         </div>
                                         <div x-show="modelSettings.dflash_in_memory_cache" x-transition>
-                                            <label class="block text-xs font-bold uppercase tracking-wider text-neutral-500 mb-2">In-memory cache size (GiB)</label>
+                                            <label class="block text-xs font-bold uppercase tracking-wider text-neutral-500 mb-2">{{ t('modal.model_settings.dflash_l1_max_gib') }}</label>
                                             <input type="number" x-model.number="modelSettings.dflash_in_memory_cache_max_gib" min="1" max="256" step="1" placeholder="8"
                                                    class="w-full px-4 py-2.5 border border-neutral-200 rounded-xl text-sm focus:ring-2 focus:ring-neutral-900 focus:border-transparent transition-all">
-                                            <p class="text-xs text-neutral-400 mt-1">Byte budget for L1 snapshots; LRU evicts when exceeded.</p>
+                                            <p class="text-xs text-neutral-400 mt-1">{{ t('modal.model_settings.dflash_l1_max_gib_help') }}</p>
                                         </div>
                                         <div class="flex items-start justify-between gap-3">
                                             <div class="min-w-0">
-                                                <span class="text-sm font-medium text-neutral-700">SSD cache</span>
-                                                <p class="text-xs text-neutral-500 mt-0.5">L2 spill of evicted L1 entries to disk. Uses the oMLX paged SSD cache directory (<code>dflash_l2/</code>).</p>
-                                                <p x-show="!modelSettings.dflash_ssd_cache_available" class="text-xs text-amber-600 mt-1">Enable oMLX paged SSD cache first (<code>--paged-ssd-cache-dir</code>).</p>
-                                                <p x-show="modelSettings.dflash_ssd_cache_available && !modelSettings.dflash_in_memory_cache" class="text-xs text-amber-600 mt-1">Requires in-memory cache to be enabled.</p>
+                                                <span class="text-sm font-medium text-neutral-700">{{ t('modal.model_settings.dflash_l2_cache') }}</span>
+                                                <p class="text-xs text-neutral-500 mt-0.5">{{ t('modal.model_settings.dflash_l2_cache_hint') | safe }}</p>
+                                                <p x-show="!modelSettings.dflash_ssd_cache_available" class="text-xs text-amber-600 mt-1">{{ t('modal.model_settings.dflash_l2_unavailable') | safe }}</p>
+                                                <p x-show="modelSettings.dflash_ssd_cache_available && !modelSettings.dflash_in_memory_cache" class="text-xs text-amber-600 mt-1">{{ t('modal.model_settings.dflash_l2_requires_l1') }}</p>
                                             </div>
                                             <button type="button"
                                                     @click="if (modelSettings.dflash_ssd_cache_available && modelSettings.dflash_in_memory_cache) modelSettings.dflash_ssd_cache = !modelSettings.dflash_ssd_cache"
diff --git a/omlx/engine/dflash.py b/omlx/engine/dflash.py
index a2388f3b..66c6ab38 100644
--- a/omlx/engine/dflash.py
+++ b/omlx/engine/dflash.py
@@ -1,12 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 """
-DFlash engine for block diffusion speculative decoding.
-
-This engine wraps dflash-mlx (>= 0.1.5) to provide 3-4x faster decoding on
-Apple Silicon for Qwen and Gemma4 model families. By default it serves all
-requests through dflash; setting ``model_settings.dflash_max_ctx`` opts into
-evicting the dflash models and delegating long-context requests to omlx's
-BatchedEngine/VLMBatchedEngine (paged cache, SSD cache, continuous batching).
+DFlash engine for block diffusion speculative decoding (Path A layout).
+
+Wraps dflash-mlx (>= 0.1.5) to provide 3-4x faster decoding on Apple Silicon
+for Qwen and Gemma4 model families.
+
+Path A layout: eagerly stands up **both** an embedded ``VLMBatchedEngine``
+(BG path: paged cache + SSD cache + continuous batching) **and** a DFlash
+drafter attached to the same target weights (dflash path: speculative
+decode). Per-request the engine routes between them based on concurrency,
+KV pressure, and context length; weights are shared (not re-loaded), so
+the only extra cost over plain VLM is the small drafter checkpoint.
+
+This replaces the pre-Path-A one-way eviction layout where exceeding
+``dflash_max_ctx`` permanently tore down dflash and started a fallback
+engine. The ``_in_fallback_mode`` flag and ``_evict_dflash_and_start_fallback``
+helper are gone; both paths coexist for the engine's lifetime.
 """
 
 import asyncio
@@ -65,14 +74,15 @@ def is_dflash_compatible(model_path: str | Path) -> tuple[bool, str]:
 
 class DFlashEngine(BaseEngine):
     """
-    DFlash speculative decoding engine with optional batched fallback.
-
-    For prompts within ``model_settings.dflash_max_ctx`` (or always, when the
-    threshold is None), uses block diffusion speculative decoding for 3-4x
-    faster generation. When the threshold is exceeded, evicts dflash models
-    from memory and delegates to a fallback engine (BatchedEngine or
-    VLMBatchedEngine) that provides paged cache, SSD cache, and continuous
-    batching.
+    DFlash speculative decoding engine with a long-lived embedded BG engine.
+
+    Path A layout: ``start()`` brings up both an embedded
+    ``VLMBatchedEngine``/``BatchedEngine`` (paged cache, SSD cache,
+    continuous batching) AND a DFlash drafter attached to the **same**
+    target weights via ``DFlashVLMTargetWrapper``. Per-request, ``_route``
+    decides between the dflash decode path (fast, capped concurrency,
+    bounded context) and the BG path (everything else). Weights are not
+    duplicated; the only extra memory cost is the small drafter.
     """
 
     def __init__(
@@ -84,9 +94,12 @@ def __init__(
         draft_quant_activation_bits: int | None = None,
         draft_quant_group_size: int | None = None,
         model_settings: Any | None = None,
-        fallback_engine_type: str = "batched",
+        fallback_engine_type: str = "auto",
         scheduler_config: Any | None = None,
         omlx_ssd_cache_dir: str | Path | None = None,
+        dflash_max_concurrent: int = 4,
+        dflash_kv_pressure_threshold: float = 0.7,
+        dflash_lazy_drafter: bool = False,
     ):
         self._model_name = model_name
         self._draft_model_path = draft_model_path
@@ -95,7 +108,7 @@ def __init__(
         self._draft_quant_activation_bits = draft_quant_activation_bits
         self._draft_quant_group_size = draft_quant_group_size
         self._model_settings = model_settings
-        self._fallback_engine_type = fallback_engine_type
+        self._fallback_engine_type = self._resolve_fallback_engine_type(fallback_engine_type, model_name)
         self._scheduler_config = scheduler_config
         self._omlx_ssd_cache_dir = (
             Path(omlx_ssd_cache_dir) if omlx_ssd_cache_dir else None
@@ -108,16 +121,51 @@ def __init__(
         self._tokenizer_obj = None
         self._executor_tokenizer = None
         self._loaded = False
-        self._active_request = False
+        self._active_count = 0
         self._model_type_str = None
-        self._fallback_engine: BaseEngine | None = None
-        self._in_fallback_mode = False
+        self._target_ops: Any | None = None
+        self._draft_backend: Any | None = None
+        self._draft_meta: dict[str, Any] | None = None
+        # Path A double-engine layout: embedded VLM stays up for the
+        # engine's lifetime; the dflash bundle hooks into the same
+        # already-loaded target weights.
+        self._embedded_vlm: BaseEngine | None = None
+        self._dflash_bundle: Any | None = None
         self._runtime_context: Any | None = None
         self._dflash_prefix_cache: Any | None = None
+        # Routing counters (read by get_stats; also used by smoke test).
+        self._dflash_routed_count = 0
+        self._bg_routed_count = 0
+        self._last_route: str | None = None
 
         self._max_dflash_ctx = (
             getattr(model_settings, "dflash_max_ctx", None) if model_settings else None
         )
+        # Path A behavioural change: previously this defaulted to None
+        # (unlimited concurrent dflash requests, only context fallback
+        # ever bumped them off); Path A applies a finite cap (constructor
+        # default: 4) so the BG path is actually exercised under
+        # concurrency. The model_settings value still wins when set.
+        settings_concurrent = (
+            getattr(model_settings, "dflash_max_concurrent", None) if model_settings else None
+        )
+        self._max_dflash_concurrent = (
+            int(settings_concurrent) if settings_concurrent is not None
+            else int(dflash_max_concurrent)
+        )
+        self._kv_pressure_threshold = float(dflash_kv_pressure_threshold)
+        # Lazy drafter loading: defer wrapper + factory call until first
+        # dflash-routed request. Avoids a ~28% throughput loss when the
+        # workload is bg-heavy (drafter co-loaded in Metal causes contention
+        # even when idle, confirmed by D5 bench 2026-05-12). Cold-start
+        # cost: first dflash request includes ~3s drafter load latency.
+        self._dflash_lazy_drafter = bool(dflash_lazy_drafter)
+        # Lazily created in start() — asyncio.Semaphore needs a running event
+        # loop in some Python versions, and __init__ is sync.
+        self._concurrent_sem: asyncio.Semaphore | None = None
+        # Created in start() too; guards lazy-drafter race when multiple
+        # concurrent requests trigger first load simultaneously.
+        self._drafter_load_lock: asyncio.Lock | None = None
         self._in_memory_cache_enabled = (
             bool(getattr(model_settings, "dflash_in_memory_cache", True))
             if model_settings
@@ -151,6 +199,21 @@ def tokenizer(self) -> Any:
     def model_type(self) -> str | None:
         return self._model_type_str
 
+    @staticmethod
+    def _resolve_fallback_engine_type(requested: str, model_name: str) -> str:
+        """Resolve fallback_engine_type='auto' by inspecting model config.
+
+        Path A is initially Gemma 4 focused (multimodal). Defaulting to
+        'batched' was wrong for VLM models because mlx_lm cannot load
+        Gemma 4 ConditionalGeneration architecture. Auto-detect via
+        omlx.speculative.detect_fallback_engine_type (vision_config /
+        audio_config markers in config.json).
+        """
+        if requested != "auto":
+            return requested
+        from ..speculative import detect_fallback_engine_type
+        return detect_fallback_engine_type(model_name)
+
     @staticmethod
     def _build_quant_spec(
         weight_bits: int | None,
@@ -214,144 +277,232 @@ async def start(self) -> None:
 
         loop = asyncio.get_running_loop()
 
-        def _load_models():
-            from dflash_mlx.draft_backend import make_draft_backend
-            from dflash_mlx.runtime.loading import (
-                load_draft_bundle,
-                load_target_bundle,
-            )
+        # Build runtime context first — the dflash factory consults it for
+        # verify_config and cache setup.
+        self._runtime_context = self._build_runtime_context()
 
-            target_bundle = load_target_bundle(self._model_name)
-            draft, draft_meta = load_draft_bundle(
-                self._draft_model_path,
-                draft_quant=self._build_quant_spec(
-                    self._draft_quant_weight_bits,
-                    self._draft_quant_activation_bits,
-                    self._draft_quant_group_size,
-                ) if self._draft_quant_enabled else None,
+        # 1) Bring up the embedded BG engine. This is the canonical owner
+        #    of the target weights; the dflash drafter will attach to the
+        #    SAME ``_vlm_model`` instance, so memory does not double.
+        if self._fallback_engine_type == "vlm":
+            from .vlm import VLMBatchedEngine
+            self._embedded_vlm = VLMBatchedEngine(
+                model_name=self._model_name,
+                scheduler_config=self._scheduler_config,
+                model_settings=self._model_settings,
             )
-            draft_backend = make_draft_backend()
-            return target_bundle, draft, draft_backend
-
-        result = await loop.run_in_executor(get_mlx_executor(), _load_models)
-        target_bundle, self._draft_model, self._draft_backend = result
-        self._target_model = target_bundle.model
-        self._tokenizer_obj = target_bundle.tokenizer
-        self._target_ops = target_bundle.target_ops
-        target_meta = target_bundle.meta
-
+        else:
+            from .batched import BatchedEngine
+            self._embedded_vlm = BatchedEngine(
+                model_name=self._model_name,
+                scheduler_config=self._scheduler_config,
+                model_settings=self._model_settings,
+            )
+        await self._embedded_vlm.start()
+
+        # Discover the loaded model + tokenizer on the embedded engine.
+        # VLMBatchedEngine: ``_vlm_model`` / ``_tokenizer``.
+        # BatchedEngine: ``_model`` / ``_tokenizer``.
+        embedded_model = getattr(self._embedded_vlm, "_vlm_model", None) \
+            or getattr(self._embedded_vlm, "_model", None)
+        if embedded_model is None:
+            raise RuntimeError(
+                "DFlashEngine: embedded engine did not expose a loaded model "
+                "after start() — neither _vlm_model nor _model is set"
+            )
+        self._tokenizer_obj = getattr(self._embedded_vlm, "_tokenizer", None) \
+            or getattr(self._embedded_vlm, "tokenizer", None)
         # Deep-copy tokenizer for executor-thread usage (dflash generation).
-        # The original self._tokenizer_obj stays for event-loop operations
-        # (encode, apply_chat_template, count_chat_tokens).
         # See: https://github.com/huggingface/tokenizers/issues/537
         self._executor_tokenizer = copy.deepcopy(self._tokenizer_obj)
 
-        # Extract model_type from config
-        config = target_meta.get("config", {})
-        if isinstance(config, dict):
-            self._model_type_str = config.get("model_type")
-        elif hasattr(config, "model_type"):
-            self._model_type_str = config.model_type
-
-        self._runtime_context = self._build_runtime_context()
+        # 2-3) Drafter loading. Eager path runs now; lazy path defers
+        # until first dflash-routed request via _ensure_drafter_loaded.
+        self._drafter_load_lock = asyncio.Lock()
+        if self._dflash_lazy_drafter:
+            logger.info(
+                "DFlashEngine: lazy_drafter mode — drafter NOT loaded yet "
+                "(loads on first dflash-routed request)"
+            )
+        else:
+            await self._load_drafter_bundle(embedded_model)
+
+        # Extract model_type from the embedded engine's config so the API
+        # layer's reasoning detection still works.
+        cfg = getattr(embedded_model, "config", None)
+        if cfg is not None:
+            if isinstance(cfg, dict):
+                self._model_type_str = cfg.get("model_type")
+            else:
+                self._model_type_str = getattr(cfg, "model_type", None)
 
         self._loaded = True
-        self._in_fallback_mode = False
+        if self._max_dflash_concurrent:
+            self._concurrent_sem = asyncio.Semaphore(self._max_dflash_concurrent)
         max_ctx_display = "unlimited" if self._max_dflash_ctx is None else self._max_dflash_ctx
         logger.info(
-            f"DFlashEngine loaded: target={self._model_name}, "
+            f"DFlashEngine loaded (Path A double-engine): target={self._model_name}, "
             f"draft={self._draft_model_path}, "
+            f"embedded_engine={self._fallback_engine_type}, "
             f"max_ctx={max_ctx_display}, "
-            f"fallback={self._fallback_engine_type}, "
+            f"max_concurrent={self._max_dflash_concurrent}, "
+            f"kv_pressure_threshold={self._kv_pressure_threshold}, "
             f"l1_cache={self._in_memory_cache_enabled}, "
             f"l2_cache={self._resolve_dflash_l2_dir() is not None}"
         )
 
-    async def _evict_dflash_and_start_fallback(self) -> None:
-        """Evict dflash models from memory, verify release, then start fallback engine."""
-        from dflash_mlx.cache.manager import shutdown_runtime_cache_manager
+    async def _load_drafter_bundle(self, embedded_model: Any | None = None) -> None:
+        """Load the dflash drafter bundle (wrapper + factory attach).
 
-        from ..engine_core import get_mlx_executor
+        Used by start() in eager mode and _ensure_drafter_loaded() in lazy
+        mode. Caller is responsible for serialization (start() runs once;
+        lazy path holds self._drafter_load_lock).
 
-        loop = asyncio.get_running_loop()
-        pre_active = mx.get_active_memory()
+        ``embedded_model`` is passed when called from start() (avoids a
+        second getattr); lazy path resolves it from self._embedded_vlm.
+        """
+        from ..speculative.dflash_factory import attach_dflash_to_loaded_target
+        from ..engine_core import get_mlx_executor
 
-        # Release dflash model and cache references
-        shutdown_runtime_cache_manager()
-        self._dflash_prefix_cache = None
-        self._runtime_context = None
-        self._target_model = None
-        self._target_ops = None
-        self._draft_model = None
-        self._draft_backend = None
-        self._executor_tokenizer = None
+        if embedded_model is None:
+            embedded_model = getattr(self._embedded_vlm, "_vlm_model", None) \
+                or getattr(self._embedded_vlm, "_model", None)
+            if embedded_model is None:
+                raise RuntimeError(
+                    "DFlashEngine._load_drafter_bundle: embedded engine "
+                    "did not expose a loaded model"
+                )
 
-        # Force memory reclaim with settle barrier
-        gc.collect()
-        await loop.run_in_executor(
-            get_mlx_executor(),
-            lambda: (mx.synchronize(), mx.clear_cache()),
+        # Path A generalization: probe upstream dflash_mlx target_ops directly
+        # first. Apply the mlx_vlm→mlx_lm shape wrapper only when the upstream
+        # ops can't recognize the model.
+        #
+        # Currently:
+        #   - QwenGdnTargetOps walks `target.language_model` + uses structural
+        #     hasattr checks, so it accepts both mlx_lm-loaded Qwen and
+        #     mlx_vlm-loaded Qwen 3.x natively. NO wrapper.
+        #   - Gemma4TargetOps reads `text_wrapper.args.layer_types` and
+        #     `inner._get_per_layer_inputs` (mlx_lm-only attribute names), so
+        #     mlx_vlm-loaded Gemma 4 falls through. WRAPPER required.
+        #
+        # The try/except below auto-routes each family without family
+        # hardcoding here. When `bstnxbt/dflash-mlx` upstream generalizes
+        # Gemma4TargetOps to match QwenGdnTargetOps's VLM-aware pattern,
+        # the wrapper path goes idle for Gemma 4 too and we can eventually
+        # delete the wrapper module.
+        target_for_dflash = embedded_model
+        try:
+            from dflash_mlx.engine.target_ops import resolve_target_ops
+            resolve_target_ops(target_for_dflash)
+            logger.info(
+                "DFlashEngine: upstream dflash_mlx ops resolved embedded model "
+                "directly — no wrapper needed (family: %s)",
+                type(target_for_dflash).__name__,
+            )
+        except Exception as e:  # NotImplementedError or other rejection
+            from ..speculative.dflash_vlm_target_wrap import DFlashVLMTargetWrapper
+            logger.info(
+                "DFlashEngine: upstream ops rejected embedded model "
+                "(%s: %s); applying DFlashVLMTargetWrapper for mlx_vlm→mlx_lm "
+                "shape bridge",
+                type(e).__name__, str(e)[:120],
+            )
+            target_for_dflash = DFlashVLMTargetWrapper(embedded_model)
+
+        self._target_model = target_for_dflash
+        draft_quant_spec = (
+            self._build_quant_spec(
+                self._draft_quant_weight_bits,
+                self._draft_quant_activation_bits,
+                self._draft_quant_group_size,
+            )
+            if self._draft_quant_enabled
+            else None
         )
 
-        # Poll for actual memory release (same pattern as engine_pool._unload_engine)
-        for settle_round in range(10):
-            active_now = mx.get_active_memory()
-            freed = pre_active - active_now
-            if freed > 0:
-                logger.info(
-                    f"DFlash models evicted: freed={freed / 1024**3:.2f}GB "
-                    f"(round {settle_round + 1})"
-                )
-                break
-            await asyncio.sleep(0.5)
-            gc.collect()
-            await loop.run_in_executor(
-                get_mlx_executor(),
-                lambda: (mx.synchronize(), mx.clear_cache()),
+        def _attach_drafter() -> Any:
+            return attach_dflash_to_loaded_target(
+                target_model=target_for_dflash,
+                draft_path=self._draft_model_path,
+                draft_quant=draft_quant_spec,
+                runtime_context=self._runtime_context,
             )
-        else:
-            logger.warning("DFlash model eviction: memory settle timed out")
 
-        # Start fallback engine
-        if self._fallback_engine_type == "vlm":
-            from .vlm import VLMBatchedEngine
-            self._fallback_engine = VLMBatchedEngine(
-                model_name=self._model_name,
-                scheduler_config=self._scheduler_config,
-                model_settings=self._model_settings,
+        loop = asyncio.get_running_loop()
+        self._dflash_bundle = await loop.run_in_executor(
+            get_mlx_executor(), _attach_drafter
+        )
+        self._draft_model = self._dflash_bundle.draft_model
+        self._target_ops = self._dflash_bundle.target_ops
+        self._draft_backend = self._dflash_bundle.draft_backend
+
+    async def _ensure_drafter_loaded(self) -> None:
+        """Lazy-load drafter on first dflash-routed request. Idempotent and
+        concurrent-safe via self._drafter_load_lock (double-checked).
+
+        Called from generate / stream_generate just before entering the
+        DFlash decode path. No-op if drafter already loaded (eager mode or
+        previous lazy invocation).
+        """
+        if self._dflash_bundle is not None:
+            return
+        # _drafter_load_lock is created in start(); if we got here without
+        # start() having run, the engine is in an invalid state.
+        if self._drafter_load_lock is None:
+            raise RuntimeError(
+                "DFlashEngine._ensure_drafter_loaded called before start()"
             )
-        else:
-            from .batched import BatchedEngine
-            self._fallback_engine = BatchedEngine(
-                model_name=self._model_name,
-                scheduler_config=self._scheduler_config,
-                model_settings=self._model_settings,
+        async with self._drafter_load_lock:
+            if self._dflash_bundle is not None:  # double-check after lock
+                return
+            logger.info(
+                "DFlashEngine: loading drafter on first dflash-routed request "
+                "(lazy mode)"
             )
-        await self._fallback_engine.start()
-        self._in_fallback_mode = True
-        logger.info(
-            f"DFlash fallback engine started: {self._fallback_engine_type}"
-        )
+            await self._load_drafter_bundle()
+            logger.info("DFlashEngine: drafter loaded")
 
     async def stop(self) -> None:
         from dflash_mlx.cache.manager import shutdown_runtime_cache_manager
 
-        if self._fallback_engine is not None:
-            await self._fallback_engine.stop()
-            self._fallback_engine = None
+        # Tear down dflash drafter side first (releases prefix-cache /
+        # snapshot service / kernel state) before the embedded engine
+        # disposes of the shared target weights.
         try:
             shutdown_runtime_cache_manager()
         except Exception as exc:
             logger.debug(f"shutdown_runtime_cache_manager: {exc}")
         self._dflash_prefix_cache = None
         self._runtime_context = None
-        self._target_model = None
-        self._target_ops = None
+        self._dflash_bundle = None
         self._draft_model = None
+        self._target_ops = None
         self._draft_backend = None
+        # Wrapper (or the embedded model itself); the underlying weights
+        # belong to embedded_vlm and get torn down when it stops below.
+        self._target_model = None
         self._tokenizer_obj = None
         self._executor_tokenizer = None
-        self._in_fallback_mode = False
+
+        # Force a barrier so MLX releases any draft buffers before the
+        # embedded engine starts its own teardown.
+        gc.collect()
+        try:
+            from ..engine_core import get_mlx_executor
+            loop = asyncio.get_running_loop()
+            await loop.run_in_executor(
+                get_mlx_executor(),
+                lambda: (mx.synchronize(), mx.clear_cache()),
+            )
+        except Exception as exc:
+            logger.debug(f"DFlashEngine.stop barrier: {exc}")
+
+        if self._embedded_vlm is not None:
+            await self._embedded_vlm.stop()
+            self._embedded_vlm = None
+
+        self._runtime_context = None
+        self._tokenizer_obj = None
         self._loaded = False
         logger.info("DFlashEngine stopped")
 
@@ -436,10 +587,123 @@ def count_chat_tokens(
         )
         return len(self._tokenizer_obj.encode(prompt))
 
-    def _should_fallback(self, prompt_tokens: list[int]) -> bool:
-        if self._max_dflash_ctx is None:
-            return False
-        return len(prompt_tokens) >= self._max_dflash_ctx
+    def _kv_pressure(self) -> float | None:
+        """Estimate the embedded engine's paged KV cache usage ratio.
+
+        Returns ``allocated_blocks / max_blocks`` (nominally in [0.0, 1.0],
+        not clamped), or None if the cache isn't exposed yet (engine still
+        starting, or accessor path drifted on an omlx upgrade). The manager
+        is reached via ``_embedded_vlm._engine.engine.scheduler.
+        paged_cache_manager``; its ``usage`` property is deliberately NOT
+        used (see the note in the body). The allocated count is read from
+        ``_current_allocated_count``, falling back to ``len(allocated_blocks)``.
+
+        Lookup failures map to None: this is best-effort telemetry feeding
+        a routing heuristic; it must never break the inference path.
+        """
+        try:
+            scheduler = self._embedded_vlm._engine.engine.scheduler  # type: ignore[union-attr]
+        except (AttributeError, RuntimeError):
+            return None
+        # Derive usage from PagedCacheManager allocation counters (omlx 0.x).
+        mgr = getattr(scheduler, "paged_cache_manager", None)
+        if mgr is not None:
+            # NOTE: do NOT use mgr.usage property — it computes
+            # 1 - free_block_queue.num_free_blocks/(max_blocks-1), but
+            # num_free_blocks is the bounded FREE QUEUE size (capped ~256),
+            # not the unallocated block count. On a near-empty cache it
+            # still returns ~0.997 because the free queue size << max_blocks.
+            # Correct: allocated_count / max_blocks.
+            try:
+                max_blocks = getattr(mgr, "max_blocks", None)
+                if max_blocks and max_blocks > 0:
+                    alloc_count = getattr(mgr, "_current_allocated_count", None)
+                    if alloc_count is None:
+                        allocated = getattr(mgr, "allocated_blocks", None) or {}
+                        alloc_count = len(allocated)
+                    if alloc_count is not None:
+                        return float(alloc_count) / float(max_blocks)
+            except (AttributeError, TypeError, ZeroDivisionError):
+                pass
+        return None
+
+    def _route(self, prompt_tokens: list[int]) -> str:
+        """Decide whether this request runs on dflash or the embedded BG engine.
+
+        Path A signals (D3 layout):
+
+          * If we're already at the dflash concurrency cap, route to BG.
+          * If the embedded engine's paged KV cache usage exceeds the
+            configured pressure threshold, route to BG (avoid evicting
+            unrelated requests just to fit a dflash decode).
+          * If the prompt is at or past ``dflash_max_ctx``, route to BG.
+          * Otherwise route to dflash.
+
+        ``_active_count`` is sampled (not held); the increment for the
+        accepted request happens in the dflash decode path under
+        ``_concurrent_sem``, so the cap is enforced even when several
+        requests race here. Reading without locking is OK: the worst case
+        is a marginal request getting routed to dflash and then blocking
+        on the semaphore, which is the same behaviour the BG-route would
+        produce.
+
+        Side effect: records the decision via ``_record_route`` (counters
+        + jsonl metric line). Callers should NOT call ``_record_route``
+        again from ``generate`` / ``stream_generate``.
+        """
+        ctx_len = len(prompt_tokens)
+        if self._max_dflash_concurrent is not None \
+                and self._active_count >= self._max_dflash_concurrent:
+            self._record_route("bg", "concurrency", ctx_len, None)
+            return "bg"
+
+        kv_pressure = self._kv_pressure()
+        if kv_pressure is not None and kv_pressure > self._kv_pressure_threshold:
+            self._record_route("bg", "kv_pressure", ctx_len, kv_pressure)
+            return "bg"
+
+        if self._max_dflash_ctx is not None and ctx_len >= self._max_dflash_ctx:
+            self._record_route("bg", "max_ctx", ctx_len, kv_pressure)
+            return "bg"
+
+        self._record_route("dflash", "default", ctx_len, kv_pressure)
+        return "dflash"
+
+    def _record_route(
+        self,
+        routed_to: str,
+        reason: str,
+        ctx_len: int,
+        kv_pressure: float | None,
+    ) -> None:
+        """Bookkeeping for one routing decision: counters + jsonl metric.
+
+        Metric write is best-effort (size-capped, env-disable-able); see
+        ``omlx.metrics.dflash_routing`` for the size guard and disable
+        knobs. ``request_id`` is intentionally omitted — dflash.py has no
+        handle on the API-layer request id; threading one through is a
+        D3.x cleanup.
+        """
+        import time
+
+        from ..metrics.dflash_routing import write_routing_decision
+
+        self._last_route = routed_to
+        if routed_to == "dflash":
+            self._dflash_routed_count += 1
+        else:
+            self._bg_routed_count += 1
+
+        write_routing_decision({
+            "ts": time.time(),
+            "model_name": self._model_name,
+            "ctx_len": ctx_len,
+            "active_count": self._active_count,
+            "kv_usage_ratio": kv_pressure,
+            "projected_kv_after": None,  # spec A.2 — D3.x placeholder.
+            "routed_to": routed_to,
+            "reason": reason,
+        })
 
     def _get_think_token_id(self, attr: str) -> int | None:
         """Safely read think_start_id / think_end_id from the tokenizer."""
@@ -578,6 +842,9 @@ def _run_generate_streaming(
             except ImportError:
                 pass
 
+            from dflash_mlx.engine.events import TokenEvent, SummaryEvent
+
+            prompt_token_count = 0
             for event in event_iter:
                 if stop_event.is_set():
                     logger.info("DFlash generation aborted by client")
@@ -622,6 +889,9 @@ def _run_generate_streaming(
                     asyncio.run_coroutine_threadsafe(
                         queue.put(("", [], True, metrics)), loop
                     )
+                # Other event types (PrefillProgressEvent, PrefillCompleteEvent,
+                # SnapshotPublishedEvent, etc.) are informational — skip silently.
+                # snapshot_service handles snapshot lifecycle automatically.
 
                 # Cycle, memory, prefill, and snapshot events are consumed by the
                 # runtime cache manager and metrics layers — omlx does not surface
@@ -648,7 +918,7 @@ def _run_generate_streaming(
                 queue.put(("", [], True, {"aborted": stop_event.is_set()})),
                 loop,
             )
-            self._active_request = False
+            self._active_count -= 1
 
     async def generate(
         self,
@@ -668,108 +938,118 @@ async def generate(
 
         prompt_tokens = self._tokenizer_obj.encode(prompt)
 
-        # Fallback: evict dflash models, start LLM/VLM engine
-        if self._should_fallback(prompt_tokens):
-            if not self._in_fallback_mode:
-                logger.info(
-                    f"DFlash context fallback: {len(prompt_tokens)} >= {self._max_dflash_ctx}, "
-                    f"evicting dflash models and switching to {self._fallback_engine_type} engine"
-                )
-                await self._evict_dflash_and_start_fallback()
-            return await self._fallback_engine.generate(
+        # Path A routing: decide between dflash and the long-lived embedded
+        # BG engine. Both stay loaded for the engine's lifetime, so no
+        # eviction / reload is involved on either branch. ``_route`` records
+        # the decision internally (counters + jsonl metric); callers must
+        # NOT invoke ``_record_route`` again.
+        route = self._route(prompt_tokens)
+        if route == "bg":
+            return await self._embedded_vlm.generate(
                 prompt=prompt, max_tokens=max_tokens, temperature=temperature,
                 top_p=top_p, top_k=top_k, min_p=min_p,
                 repetition_penalty=repetition_penalty,
                 presence_penalty=presence_penalty, stop=stop, **kwargs,
             )
 
-        # Already in fallback mode but short context came in.
-        # Stay in fallback mode (reloading dflash models is expensive).
-        if self._in_fallback_mode:
-            return await self._fallback_engine.generate(
-                prompt=prompt, max_tokens=max_tokens, temperature=temperature,
-                top_p=top_p, top_k=top_k, min_p=min_p,
-                repetition_penalty=repetition_penalty,
-                presence_penalty=presence_penalty, stop=stop, **kwargs,
-            )
+        # Lazy drafter: load now if not yet loaded (no-op in eager mode).
+        await self._ensure_drafter_loaded()
 
-        from ..engine_core import get_mlx_executor
+        # Concurrent cap: hold at most ``dflash_max_concurrent`` requests
+        # inside the DFlash decode path. ``_route`` already bounced excess
+        # callers to the BG engine, but the semaphore stays as a guard
+        # against in-flight races where ``_active_count`` was sampled
+        # before the previous request incremented it.
+        if self._concurrent_sem is not None:
+            await self._concurrent_sem.acquire()
 
-        loop = asyncio.get_running_loop()
-        stop_event = threading.Event()
+        try:
+            from ..engine_core import get_mlx_executor
 
-        def _run():
-            from dflash_mlx.engine.events import SummaryEvent, TokenEvent
+            loop = asyncio.get_running_loop()
+            stop_event = threading.Event()
 
-            event_iter = None
-            try:
-                event_iter, prefix_flow, stop_ids = self._stream_dflash_events(
-                    prompt_tokens=prompt_tokens,
-                    max_tokens=max_tokens,
-                )
-                tokens: list[int] = []
-                summary: SummaryEvent | None = None
-                for event in event_iter:
-                    if stop_event.is_set():
-                        logger.info("DFlash generation aborted by client")
-                        break
-                    if isinstance(event, TokenEvent):
-                        token_id = int(event.token_id)
-                        if token_id in stop_ids:
-                            continue
-                        tokens.append(token_id)
-                    elif isinstance(event, SummaryEvent):
-                        summary = event
-                return summary, tokens
-            finally:
-                if event_iter is not None:
-                    close = getattr(event_iter, "close", None)
-                    if close is not None:
-                        try:
-                            close()
-                        except Exception as exc:
-                            logger.debug(f"event_iter.close() raised: {exc}")
-                self._active_request = False
-
-        self._active_request = True
-        future = loop.run_in_executor(get_mlx_executor(), _run)
-        try:
-            summary, generated = await asyncio.shield(asyncio.wrap_future(future))
-        except asyncio.CancelledError:
-            stop_event.set()
-            logger.info("DFlash generate cancelled, waiting for executor to drain")
+            def _run():
+                event_iter = None
+                try:
+                    event_iter, prefix_flow, stop_ids = self._stream_dflash_events(
+                        prompt_tokens=prompt_tokens,
+                        max_tokens=max_tokens,
+                    )
+                    from dflash_mlx.engine.events import TokenEvent, SummaryEvent
+
+                    tokens: list[int] = []
+                    summary: Any = None
+                    for event in event_iter:
+                        if stop_event.is_set():
+                            logger.info("DFlash generation aborted by client")
+                            break
+                        if isinstance(event, TokenEvent):
+                            token_id = int(event.token_id)
+                            if token_id in stop_ids:
+                                continue
+                            tokens.append(token_id)
+                        elif isinstance(event, SummaryEvent):
+                            summary = event
+                        # Other events (progress, snapshots) are informational.
+                    return summary, tokens
+                finally:
+                    if event_iter is not None:
+                        close = getattr(event_iter, "close", None)
+                        if close is not None:
+                            try:
+                                close()
+                            except Exception as exc:
+                                logger.debug(f"event_iter.close() raised: {exc}")
+                    self._active_count -= 1
+
+            self._active_count += 1
+            future = loop.run_in_executor(get_mlx_executor(), _run)
             try:
-                await asyncio.wait_for(asyncio.wrap_future(future), timeout=10.0)
-            except asyncio.TimeoutError:
-                logger.warning("DFlash executor did not exit within 10s after abort")
-            except Exception:
-                pass
-            raise
-        text = self._tokenizer_obj.decode(generated, skip_special_tokens=True)
-        text = clean_special_tokens(text)
-
-        # Reasoning models (Qwen3.x with enable_thinking, DeepSeek, MiniMax, ...)
-        # have <think>\n at the END of the prompt, so the model's first
-        # generated token is already INSIDE the thinking block. The opening
-        # tag never appears in the output, which would prevent extract_thinking
-        # / ThinkingParser from separating reasoning from content. Prepend
-        # the tag here so the API layer can split them correctly.
-        if self._detect_needs_think_prefix(prompt_tokens):
-            text = self._think_prefix_text() + text
-
-        prompt_token_count = (
-            int(summary.prompt_token_count) if summary is not None else len(prompt_tokens)
-        )
-        completion_token_count = (
-            int(summary.generation_tokens) if summary is not None else len(generated)
-        )
-        return GenerationOutput(
-            text=text,
-            tokens=generated,
-            prompt_tokens=prompt_token_count,
-            completion_tokens=completion_token_count,
-            finish_reason="stop",
-        )
+                summary, generated = await asyncio.shield(asyncio.wrap_future(future))
+            except asyncio.CancelledError:
+                stop_event.set()
+                logger.info("DFlash generate cancelled, waiting for executor to drain")
+                try:
+                    await asyncio.wait_for(asyncio.wrap_future(future), timeout=10.0)
+                except asyncio.TimeoutError:
+                    logger.warning("DFlash executor did not exit within 10s after abort")
+                except Exception:
+                    pass
+                raise
+            # summary is a SummaryEvent dataclass (upstream API) or None.
+
+            text = self._tokenizer_obj.decode(generated, skip_special_tokens=True)
+            text = clean_special_tokens(text)
+
+            # Reasoning models (Qwen3.x with enable_thinking, DeepSeek, MiniMax, ...)
+            # have <think>\n at the END of the prompt, so the model's first
+            # generated token is already INSIDE the thinking block. The opening
+            # tag never appears in the output, which would prevent extract_thinking
+            # / ThinkingParser from separating reasoning from content. Prepend
+            # the tag here so the API layer can split them correctly.
+            if self._detect_needs_think_prefix(prompt_tokens):
+                text = self._think_prefix_text() + text
+
+            # summary is a SummaryEvent dataclass (upstream API) or None if
+            # generation ended before reaching the summary event.
+            prompt_tokens_count = (
+                int(summary.prompt_token_count) if summary is not None else len(prompt_tokens)
+            )
+            completion_tokens_count = (
+                int(summary.generation_tokens) if summary is not None else len(generated)
+            )
+
+            return GenerationOutput(
+                text=text,
+                tokens=generated,
+                prompt_tokens=prompt_tokens_count,
+                completion_tokens=completion_tokens_count,
+                finish_reason="stop",
+            )
+        finally:
+            if self._concurrent_sem is not None:
+                self._concurrent_sem.release()
 
     async def stream_generate(
         self,
@@ -789,15 +1069,13 @@ async def stream_generate(
 
         prompt_tokens = self._tokenizer_obj.encode(prompt)
 
-        # Fallback: evict dflash models, start LLM/VLM engine
-        if self._should_fallback(prompt_tokens):
-            if not self._in_fallback_mode:
-                logger.info(
-                    f"DFlash context fallback: {len(prompt_tokens)} >= {self._max_dflash_ctx}, "
-                    f"evicting dflash models and switching to {self._fallback_engine_type} engine"
-                )
-                await self._evict_dflash_and_start_fallback()
-            async for output in self._fallback_engine.stream_generate(
+        # Path A routing: see ``generate``. Streaming mirrors the same
+        # routing decision; the dflash side keeps its concurrency cap via
+        # the semaphore released in the finally clause below. ``_route``
+        # records the decision; do not call ``_record_route`` again here.
+        route = self._route(prompt_tokens)
+        if route == "bg":
+            async for output in self._embedded_vlm.stream_generate(
                 prompt=prompt, max_tokens=max_tokens, temperature=temperature,
                 top_p=top_p, top_k=top_k, min_p=min_p,
                 repetition_penalty=repetition_penalty,
@@ -806,16 +1084,15 @@ async def stream_generate(
                 yield output
             return
 
-        # Already in fallback mode — stay there
-        if self._in_fallback_mode:
-            async for output in self._fallback_engine.stream_generate(
-                prompt=prompt, max_tokens=max_tokens, temperature=temperature,
-                top_p=top_p, top_k=top_k, min_p=min_p,
-                repetition_penalty=repetition_penalty,
-                presence_penalty=presence_penalty, stop=stop, **kwargs,
-            ):
-                yield output
-            return
+        # Lazy drafter: load now if not yet loaded (no-op in eager mode).
+        await self._ensure_drafter_loaded()
+
+        # Concurrent cap: hold at most ``dflash_max_concurrent`` requests
+        # inside the DFlash streaming path. Released in the finally clause
+        # below so it fires even when the async generator is cancelled
+        # mid-iteration.
+        if self._concurrent_sem is not None:
+            await self._concurrent_sem.acquire()
 
         prompt_len = len(prompt_tokens)
         loop = asyncio.get_running_loop()
@@ -831,7 +1108,7 @@ async def stream_generate(
         think_prefix_pending = needs_think_prefix
 
         from ..engine_core import get_mlx_executor
-        self._active_request = True
+        self._active_count += 1
         future = loop.run_in_executor(
             get_mlx_executor(),
             self._run_generate_streaming,
@@ -894,6 +1171,8 @@ async def stream_generate(
                 )
             except Exception as exc:
                 logger.debug(f"DFlash executor future raised: {exc}")
+            if self._concurrent_sem is not None:
+                self._concurrent_sem.release()
 
     async def chat(
         self,
@@ -959,9 +1238,9 @@ async def stream_chat(
             yield output
 
     def has_active_requests(self) -> bool:
-        if self._fallback_engine is not None and self._fallback_engine.has_active_requests():
+        if self._embedded_vlm is not None and self._embedded_vlm.has_active_requests():
             return True
-        return self._active_request
+        return self._active_count > 0
 
     def get_stats(self) -> dict[str, Any]:
         return {
@@ -969,14 +1248,22 @@ def get_stats(self) -> dict[str, Any]:
             "model_name": self._model_name,
             "draft_model": self._draft_model_path,
             "max_dflash_ctx": self._max_dflash_ctx,
-            "fallback_engine_type": self._fallback_engine_type,
-            "in_fallback_mode": self._in_fallback_mode,
+            "max_dflash_concurrent": self._max_dflash_concurrent,
+            "kv_pressure_threshold": self._kv_pressure_threshold,
+            "active_count": self._active_count,
+            "embedded_engine_type": self._fallback_engine_type,
+            "last_route": self._last_route,
+            "dflash_routed_count": self._dflash_routed_count,
+            "bg_routed_count": self._bg_routed_count,
+            "concurrent_sem_locked": (
+                self._concurrent_sem.locked() if self._concurrent_sem is not None else False
+            ),
             "loaded": self._loaded,
             "in_memory_cache": self._in_memory_cache_enabled,
             "ssd_cache": self._resolve_dflash_l2_dir() is not None,
         }
 
     def get_cache_stats(self) -> dict[str, Any] | None:
-        if self._fallback_engine is not None:
-            return self._fallback_engine.get_cache_stats()
+        if self._embedded_vlm is not None:
+            return self._embedded_vlm.get_cache_stats()
         return None
diff --git a/omlx/engine_pool.py b/omlx/engine_pool.py
index 319dfa78..348c66f4 100644
--- a/omlx/engine_pool.py
+++ b/omlx/engine_pool.py
@@ -642,6 +642,20 @@ async def _load_engine(self, model_id: str, force_lm: bool = False) -> None:
                             omlx_ssd_cache_dir=getattr(
                                 self._scheduler_config, "paged_ssd_cache_dir", None
                             ),
+                            # Path A: pass routing knobs explicitly. ``model_settings``
+                            # values still win inside DFlashEngine.__init__; these
+                            # are the engine-pool-level defaults when the per-model
+                            # settings don't set them. D3 fix: ``or 1`` collapsed
+                            # an explicit 0 (user disables dflash decode path) into
+                            # the default 1. Now None -> 1, explicit 0 stays 0.
+                            dflash_max_concurrent=(
+                                1
+                                if getattr(model_settings, "dflash_max_concurrent", None) is None
+                                else int(getattr(model_settings, "dflash_max_concurrent"))
+                            ),
+                            dflash_kv_pressure_threshold=getattr(
+                                model_settings, "dflash_kv_pressure_threshold", 0.7
+                            ) or 0.7,
                         )
                         logger.info(f"DFlash enabled for {model_id}, draft={dflash_draft}")
                     except ImportError:
@@ -697,7 +711,10 @@ async def _load_engine(self, model_id: str, force_lm: bool = False) -> None:
                         model_settings=model_settings,
                     )
 
-            _is_dflash_engine = engine is not None and type(engine).__name__ == "DFlashEngine"
+            # D3: replace string-name sniff with duck typing on the Path A
+            # ``_dflash_bundle`` attr. Survives renames / subclassing and is a
+            # stronger contract than ``type().__name__``.
+            _is_dflash_engine = engine is not None and hasattr(engine, "_dflash_bundle")
 
             try:
                 await engine.start()
diff --git a/omlx/metrics/__init__.py b/omlx/metrics/__init__.py
new file mode 100644
index 00000000..2dc05956
--- /dev/null
+++ b/omlx/metrics/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+"""omlx metrics helpers (jsonl writers with size guard)."""
diff --git a/omlx/metrics/dflash_routing.py b/omlx/metrics/dflash_routing.py
new file mode 100644
index 00000000..06041067
--- /dev/null
+++ b/omlx/metrics/dflash_routing.py
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: Apache-2.0
+"""DFlash routing decision metric writer with size guard.
+
+Path A produces one routing decision per request (dflash vs bg, plus reason
+and a sampled KV pressure). D3 lands a minimal append-only jsonl writer so
+the choice can be analyzed offline (acceptance ratio under load, KV pressure
+trip rates, etc.) without taking a dependency on a metrics backend.
+
+Guards:
+- Size guard: once the file exceeds ``DFLASH_METRIC_MAX_SIZE`` bytes
+  (default 500 MiB) we stop writing and log once. Rotation is a deferred
+  D3.x story — for now, ``rm`` or archive the file to reset.
+- Test/disable guard: ``DFLASH_METRIC_DISABLE=1`` short-circuits the write.
+  Used by pytest to avoid filesystem writes during unit tests that
+  exercise ``_route()`` directly.
+- I/O errors are swallowed at DEBUG level — metric writes must never
+  break the inference path.
+
+Spec reference: ``docs/dflash-pathA-spec.md`` §2 (A.1 record schema) and
+§11.1 (size guard rationale).
+"""
+
+import json
+import logging
+import os
+import threading
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_MAX_SIZE = 500 * 1024 * 1024  # 500 MiB
+_lock = threading.Lock()
+_size_warned = False
+
+
+def get_metric_path() -> Path:
+    """Return the jsonl file that routing decisions are appended to.
+
+    The parent directory comes from ``DFLASH_METRIC_DIR`` when that variable
+    is set, otherwise ``~/.omlx/metrics``. It is created (with parents) on
+    every call, so no manual ``mkdir`` is needed before the first write.
+    """
+    # Default location lives under the user's home directory.
+    default_dir = Path.home() / ".omlx" / "metrics"
+    metric_dir = Path(os.environ.get("DFLASH_METRIC_DIR", str(default_dir)))
+
+    # exist_ok=True makes repeated calls a no-op on an existing directory.
+    metric_dir.mkdir(parents=True, exist_ok=True)
+
+    return metric_dir / "dflash_routing.jsonl"
+
+
+def write_routing_decision(record: dict[str, Any]) -> None:
+    """Append one routing decision to the jsonl log (best-effort).
+
+    Args:
+        record: Dict dumped verbatim as one JSON line (not validated).
+
+    A malformed ``DFLASH_METRIC_MAX_SIZE`` falls back to the default cap;
+    unserializable records and OS errors are logged at DEBUG and dropped.
+    """
+    global _size_warned
+    if os.environ.get("DFLASH_METRIC_DISABLE"):
+        return
+
+    try:
+        max_size = int(os.environ.get("DFLASH_METRIC_MAX_SIZE", _DEFAULT_MAX_SIZE))
+    except ValueError:
+        max_size = _DEFAULT_MAX_SIZE  # unparsable override: keep the default cap
+    try:
+        path = get_metric_path()
+        with _lock:
+            if path.exists() and path.stat().st_size > max_size:
+                if not _size_warned:
+                    logger.warning(
+                        "dflash_routing.jsonl exceeded %d bytes — STOPPED "
+                        "writing routing metrics. Rotate the file (rm or "
+                        "archive) or set DFLASH_METRIC_MAX_SIZE higher.",
+                        max_size,
+                    )
+                    _size_warned = True
+                return
+            with path.open("a") as f:
+                f.write(json.dumps(record) + "\n")
+    except (OSError, TypeError, ValueError) as e:
+        logger.debug("dflash metric write failed: %s", e)
diff --git a/omlx/model_settings.py b/omlx/model_settings.py
index 61031062..e6153613 100644
--- a/omlx/model_settings.py
+++ b/omlx/model_settings.py
@@ -67,6 +67,13 @@ class ModelSettings:
         dflash_draft_quant_activation_bits: Quantization activation bits (16, 32).
         dflash_draft_quant_group_size: Quantization group size (32, 64, 128).
         dflash_max_ctx: Token threshold to fall back to BatchedEngine (None = unlimited).
+        dflash_max_concurrent: Cap on simultaneously in-flight DFlash requests (default 4,
+            None = unlimited). At or above this count, new requests are routed to the embedded
+            BG engine (an asyncio.Semaphore only guards in-flight races). DFlash decode is
+            single-stream by design — this is NOT a throughput knob; it is an admission-control
+            gate that bounds memory under bursts (each in-flight request holds its own KV cache,
+            hundreds of MB to several GB) and keeps tail latency predictable. Lower (1-2) on
+            tight RAM (< 64 GB); 8 on 128 GB+; leave None only if upstream load shape is trusted.
         dflash_in_memory_cache: Enable DFlash L1 (RAM) prefix cache.
         dflash_in_memory_cache_max_entries: L1 cache max entries (default 4, matches dflash balanced profile).
         dflash_in_memory_cache_max_bytes: L1 cache byte budget.
@@ -130,6 +137,8 @@ class ModelSettings:
     dflash_draft_quant_activation_bits: Optional[int] = None  # 16, 32
     dflash_draft_quant_group_size: Optional[int] = None  # 32, 64, 128
     dflash_max_ctx: Optional[int] = None  # None = unlimited; trigger BatchedEngine fallback when prompt_len >= this
+    dflash_max_concurrent: Optional[int] = 4  # In-flight admission cap (semaphore queue); None = unlimited. Default 4 bounds memory under bursts; raise on 128GB+ machines, lower on tight RAM.
+    dflash_kv_pressure_threshold: Optional[float] = None  # Float in (0,1] — embedded VLM KV cache usage above this routes new requests to BG path. None defers to DFlashEngine default 0.7.
     # DFlash prefix cache (private to dflash; separate from omlx tiered cache because
     # snapshots include draft model GDN state and target hidden chunks omlx never tracks)
     dflash_in_memory_cache: bool = True
diff --git a/omlx/speculative/__init__.py b/omlx/speculative/__init__.py
index 8875a2b5..219afb5b 100644
--- a/omlx/speculative/__init__.py
+++ b/omlx/speculative/__init__.py
@@ -5,4 +5,63 @@
 infrastructure with upstream speculative-decoding implementations in mlx-lm
 and mlx-vlm. Pure helpers (no business logic of their own) so the surface
 of internal-API dependencies is easy to audit on each upstream bump.
+
+Upstream compatibility patches (applied at import time):
+
+- ``dflash_mlx.runtime.get_stop_token_ids``: HF GemmaTokenizer's
+  ``eos_token_ids`` (plural) attribute returns ``int`` rather than a list.
+  Upstream wraps in ``list(...)`` which raises ``TypeError: 'int' object is
+  not iterable``. We monkey-patch to coerce int → ``[int]``. Discovered
+  during D3 spike6 end-to-end on m5max with Gemma 4. The patch is idempotent
+  and only modifies the upstream module attribute at runtime, not the source.
 """
+
+def _patch_dflash_get_stop_token_ids() -> None:
+    """Coerce int eos_token_ids to list; no-op if upstream is absent or patched."""
+    try:
+        import dflash_mlx.runtime as _dflash_runtime
+    except ImportError:
+        return
+    # getattr: a renamed/removed upstream symbol must not break this import.
+    _original = getattr(_dflash_runtime, "get_stop_token_ids", None)
+    if _original is None or getattr(_original, "_omlx_patched", False):
+        return
+    # Only the int case is intercepted; everything else defers to upstream.
+    def _patched(tokenizer):
+        eid = getattr(tokenizer, "eos_token_ids", None)
+        return [eid] if isinstance(eid, int) else _original(tokenizer)
+    _patched._omlx_patched = True
+    _dflash_runtime.get_stop_token_ids = _patched
+
+
+_patch_dflash_get_stop_token_ids()
+
+
+def detect_fallback_engine_type(model_name: str) -> str:
+    """Return ``"vlm"`` if model has vision/audio capability, else ``"batched"``.
+
+    Uses ``vision_config``/``audio_config`` keys in ``config.json`` as the
+    canonical multimodal marker. ``processor_config.json`` alone is NOT a
+    reliable signal because text-only mlx-community wrappers may include
+    one for tokenizer setup (e.g. some Qwen 3.5 packaging).
+
+    For HF repo IDs (non-path), returns ``"batched"`` — caller should pass
+    a resolved local model directory. A missing, unreadable, malformed or
+    non-object ``config.json`` also yields ``"batched"``.
+    """
+    import json
+    from pathlib import Path
+
+    model_dir = Path(model_name)
+    if not model_dir.is_dir():
+        return "batched"
+    try:
+        # Opening directly (EAFP) covers the missing-file case via OSError.
+        with (model_dir / "config.json").open(encoding="utf-8") as f:
+            cfg = json.load(f)
+    except (OSError, ValueError):  # JSONDecodeError subclasses ValueError
+        return "batched"
+    # Top-level JSON may be null/number/list; only an object carries the keys.
+    if isinstance(cfg, dict) and ("vision_config" in cfg or "audio_config" in cfg):
+        return "vlm"
+    return "batched"
diff --git a/omlx/speculative/dflash_factory.py b/omlx/speculative/dflash_factory.py
new file mode 100644
index 00000000..376a84c1
--- /dev/null
+++ b/omlx/speculative/dflash_factory.py
@@ -0,0 +1,203 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Factory for attaching a DFlash drafter to an already-loaded VLM target.
+
+The vanilla ``dflash_mlx.runtime.bundle.load_runtime_bundle`` invokes
+``mlx_lm.utils.load(target_ref)`` internally — when the omlx DFlashEngine
+also runs an embedded ``VLMBatchedEngine`` (Path A double-engine layout),
+that means the target weights are materialized twice and Gemma 4 model
+memory doubles. This module skips the second load: it accepts a target
+model that the embedded VLM has already loaded (wrapped through
+``DFlashVLMTargetWrapper`` so dflash's mlx_lm-shaped TargetOps can see it)
+and only loads the small drafter.
+
+The mlx_vlm Gemma 4 model never matches ``Gemma4TargetOps.supports_model``
+(its inner module exposes ``model.embed_tokens`` rather than
+``inner.embed_tokens``); we bypass ``resolve_target_ops`` dispatch by
+constructing ``Gemma4TargetOps`` directly. For Qwen GDN targets we still
+go through ``resolve_target_ops`` since the wrapper-equivalent isn't
+needed there — Path A's first cut is Gemma 4 only, but we keep the
+factory generic so a future Qwen wrapper drops in.
+
+Mirrored side-effects from ``load_target_bundle`` that the dflash decode
+path relies on (Gemma 4 specifics):
+
+  * ``install_speculative_hooks`` — no-op on Gemma 4, kept for symmetry.
+  * ``configure_full_attention_split`` — no-op on Gemma 4.
+  * ``install_verify_linears`` — applied to the real mlx_vlm model
+    (``wrapped._vlm``) since it walks ``leaf_modules()`` and the wrapper
+    is not an ``nn.Module``. ``runtime_context.verify`` decides whether
+    this runs.
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class AttachedDFlashBundle:
+    """Drafter-side bundle built by ``attach_dflash_to_loaded_target``.
+
+    Subset of ``dflash_mlx.runtime.bundle.RuntimeBundle`` without the target
+    model, which the embedded VLM engine already owns. ``target_wrapper``
+    keeps the wrapped target referenced for the bundle's lifetime so its
+    proxy views are not GC'd while dflash is mid-decode.
+    """
+
+    target_ops: Any
+    draft_model: Any
+    draft_backend: Any
+    draft_meta: dict[str, Any]
+    runtime_context: Any
+    target_wrapper: Any
+    resolved_draft_ref: str
+    effective_draft_quant: str | None
+
+
+def _construct_target_ops_for(wrapped_target: Any) -> Any:
+    """Build the TargetOps for ``wrapped_target``, bypassing dispatch for Gemma 4.
+
+    ``Gemma4TargetOps.supports_model`` returns False for the wrapped mlx_vlm
+    model (it probes ``inner.embed_tokens``, which lives at
+    ``model.embed_tokens`` on the vlm model), so ``resolve_target_ops`` would
+    not select it. The family is therefore chosen from the model_type
+    string and the Gemma 4 ops are instantiated directly.
+
+    Returns:
+        ``Gemma4TargetOps()`` for gemma4 model types, else ``resolve_target_ops(...)``.
+    """
+    # Preferred source: language_model.args; getattr on None just yields None.
+    lm_args = getattr(getattr(wrapped_target, "language_model", None), "args", None)
+    model_type = getattr(lm_args, "model_type", None)
+    if model_type is None:
+        # Fallback source: the top-level config object.
+        top_cfg = getattr(wrapped_target, "config", None)
+        model_type = getattr(top_cfg, "model_type", None)
+
+    is_gemma4 = isinstance(model_type, str) and "gemma4" in model_type.lower()
+    if not is_gemma4:
+        # Any other family (e.g. Qwen GDN once a wrapper exists) goes through
+        # regular dispatch, which is expected to raise NotImplementedError
+        # for unsupported models.
+        from dflash_mlx.engine.target_ops import resolve_target_ops
+        return resolve_target_ops(wrapped_target)
+
+    # Direct instantiation: dispatch would reject the wrapped vlm model.
+    from dflash_mlx.engine.target_gemma4 import Gemma4TargetOps
+    return Gemma4TargetOps()
+
+
+def attach_dflash_to_loaded_target(
+    target_model: Any,
+    draft_path: str,
+    draft_quant: str | None,
+    runtime_context: Any,
+) -> AttachedDFlashBundle:
+    """Attach a DFlash drafter to a pre-loaded (and wrapped) target.
+
+    Args:
+        target_model: An mlx_vlm model already loaded by the embedded VLM
+            engine, wrapped through ``DFlashVLMTargetWrapper`` so dflash's
+            TargetOps see an mlx_lm-shaped surface.
+        draft_path: HF ref / local path of the DFlash drafter checkpoint.
+        draft_quant: Draft quantization spec string (e.g. ``"w4"``,
+            ``"w8"``) or None for the model default.
+        runtime_context: dflash runtime context (carries verify config,
+            cache config, etc.).
+
+    Returns:
+        ``AttachedDFlashBundle`` with the drafter, backend, ops, and
+        wrapper attached. The wrapper is retained so its proxy lifetime
+        ties to the bundle's.
+    """
+    from dflash_mlx.draft_backend import make_draft_backend
+    from dflash_mlx.engine.target_ops import bind_draft_to_target
+    from dflash_mlx.runtime.loading import load_draft_bundle
+    from dflash_mlx.runtime.registry import (
+        resolve_effective_draft_quant,
+        resolve_model_support_spec,
+    )
+
+    target_ops = _construct_target_ops_for(target_model)
+
+    # Mirror load_target_bundle's optional install_verify_linears step.
+    # Gemma 4 reports supports_verify_linear=True; install_speculative_hooks
+    # and configure_full_attention_split are no-ops on Gemma 4 but we still
+    # call them for symmetry (and so a future non-Gemma family works).
+    capabilities = target_ops.capabilities_for(target_model)
+    target_ops.install_speculative_hooks(target_model)
+    # Gemma 4's configure_full_attention_split takes the wrapped model and
+    # does nothing; pass enabled=False to match the safe default when we
+    # don't know quantize_kv_cache config from the embedded engine.
+    target_ops.configure_full_attention_split(
+        target_model, enabled=False, chunk_size=8,
+    )
+
+    verify_cfg = getattr(runtime_context, "verify", None)
+    verify_mode = getattr(verify_cfg, "mode", "auto") if verify_cfg else "auto"
+    verify_enabled = (
+        bool(capabilities.supports_verify_linear)
+        and verify_mode != "off"
+    )
+    if verify_enabled:
+        # install_verify_linears walks model.leaf_modules(); the proxy
+        # wrapper is not an nn.Module so swap on the underlying mlx_vlm
+        # model. The proxy continues to see swapped layers via __getattr__.
+        real_model = getattr(target_model, "_vlm", target_model)
+        from dflash_mlx.verify_linear import install_verify_linears
+        enable_qmm = bool(getattr(verify_cfg, "enable_qmm", True)) if verify_cfg else True
+        n_swapped = install_verify_linears(real_model, enable_qmm=enable_qmm)
+        logger.info(
+            "DFlash factory: installed verify_linear on %s QuantizedLinear "
+            "modules of %s", n_swapped, type(real_model).__name__,
+        )
+
+    # The draft path is used verbatim as the drafter ref. The registry is
+    # consulted only for a support spec that feeds the effective draft
+    # quant; a failed lookup is non-fatal and leaves support_spec as None.
+    resolved_draft_ref = draft_path
+    support_spec = None
+    try:
+        support_spec = resolve_model_support_spec(draft_path)
+    except Exception as exc:
+        logger.debug("DFlash factory: support spec lookup failed for %r: %s", draft_path, exc)
+    effective_draft_quant = resolve_effective_draft_quant(
+        draft_quant=draft_quant,
+        resolved_draft_ref=resolved_draft_ref,
+        support_spec=support_spec,
+    )
+
+    draft_model, draft_meta = load_draft_bundle(
+        resolved_draft_ref,
+        lazy=True,
+        draft_quant=effective_draft_quant,
+    )
+    draft_meta = dict(draft_meta)
+    draft_meta["draft_quant_spec"] = effective_draft_quant
+    # Provenance of the quant spec: caller-supplied, model default, or none.
+    requested_quant = (draft_quant or "").strip().lower()
+    if requested_quant and requested_quant != "none":
+        draft_meta["draft_quant_source"] = "explicit"
+    elif effective_draft_quant is not None:
+        draft_meta["draft_quant_source"] = "model_default"
+    else:
+        draft_meta["draft_quant_source"] = "none"
+
+    draft_backend = make_draft_backend()
+    bind_draft_to_target(draft_model, target_model, target_ops=target_ops)
+
+    return AttachedDFlashBundle(
+        target_ops=target_ops,
+        draft_model=draft_model,
+        draft_backend=draft_backend,
+        draft_meta=draft_meta,
+        runtime_context=runtime_context,
+        target_wrapper=target_model,
+        resolved_draft_ref=str(resolved_draft_ref),
+        effective_draft_quant=effective_draft_quant,
+    )
diff --git a/omlx/speculative/dflash_vlm_target_wrap.py b/omlx/speculative/dflash_vlm_target_wrap.py
new file mode 100644
index 00000000..94ec94fa
--- /dev/null
+++ b/omlx/speculative/dflash_vlm_target_wrap.py
@@ -0,0 +1,140 @@
+"""mlx_vlm -> dflash_mlx Gemma4TargetOps adapter wrapper.
+
+Non-destructive proxy view: wraps an mlx_vlm-loaded Gemma 4 model and
+exposes the mlx_lm-shaped surface that ``dflash_mlx.engine.target_gemma4.
+Gemma4TargetOps`` consumes.
+
+Spike validation (tmp_spike/spike{1,2,3}_*.py @ 2026-05-11) confirmed:
+  - DecoderLayer.__call__ signature is identical between mlx_vlm and mlx_lm
+    Gemma 4 implementations.
+  - install_verify_linears only touches QuantizedLinear modules; the vlm
+    vision projector (plain nn.Linear) is left alone.
+  - make_cache(enable_speculative_linear_cache=True) ignores the kwarg for
+    Gemma 4 (it just calls ``wrapper.make_cache()``).
+
+After cross-reading both source trees the surface drift is:
+
+  1. ``lm.args.X``                          -> ``lm.config.X``           (rename)
+  2. ``lm.args.num_kv_shared_layers``       -> covered by (1)
+  3. ``inner._get_per_layer_inputs``        SIGNATURE MISMATCH:
+        mlx_lm:   (input_ids, input_embeddings=None)   two args
+        mlx_vlm:  (input_ids)                          one arg
+     This is not just a rename - we adapt by dropping the embeddings arg.
+     CAVEAT: dflash's input_ids=None / embeddings-only path (nearest-vocab
+     reconstruction in mlx_lm) is unsupported here. A forward pass fed only
+     multimodal embeddings raises NotImplementedError. This is a known
+     limitation; it only matters if Gemma4TargetOps is invoked with
+     image-tower embeddings directly (Path A target=text only, so safe).
+  4. ``inner._project_per_layer_inputs``    -> ``inner.project_per_layer_inputs``
+     (pure rename - signatures match)
+  5. ``lm.final_logit_softcapping``         attribute already exists on
+     mlx_vlm LanguageModel; also reachable via args view (1).
+  6. ``Gemma4TargetOps.supports_model``     returns False on vlm models
+     (model_type == "gemma4", inner.embed_tokens does not exist on the
+     vlm Model -> language_model.model.embed_tokens does). We bypass by
+     constructing ``Gemma4TargetOps()`` directly instead of routing through
+     ``resolve_target_ops`` dispatch.
+
+Use:
+    wrapped = DFlashVLMTargetWrapper(vlm_model)
+    ops = Gemma4TargetOps()                       # skip dispatch
+    ops.install_speculative_hooks(wrapped)        # no-op for Gemma 4
+    cache = ops.make_cache(wrapped, enable_speculative_linear_cache=True)
+"""
+from __future__ import annotations
+
+from typing import Any, Optional
+
+import mlx.core as mx
+
+
+class _ArgsView:
+    """Forward ``lm.args.X`` attribute reads to the wrapped ``lm.config``."""
+
+    __slots__ = ("_config",)
+
+    def __init__(self, config: Any) -> None:
+        object.__setattr__(self, "_config", config)
+
+    def __getattr__(self, name: str) -> Any:  # raw slot read: cannot recurse
+        return getattr(object.__getattribute__(self, "_config"), name)
+
+
+class _InnerView:
+    """Non-destructive proxy for ``LanguageModel.model`` (the Gemma4TextModel).
+
+    Adds the mlx_lm private names over the wrapped object's public
+    ``get_per_layer_inputs`` / ``project_per_layer_inputs``; forwards the rest.
+    """
+
+    __slots__ = ("_inner",)
+
+    def __init__(self, inner: Any) -> None:
+        object.__setattr__(self, "_inner", inner)
+
+    def _get_per_layer_inputs(
+        self,
+        input_ids: Optional[mx.array],
+        input_embeddings: Optional[mx.array] = None,
+    ) -> mx.array:
+        # ``input_embeddings`` only mirrors the mlx_lm two-arg call shape and
+        # is ignored; input_ids=None is refused rather than miscomputed.
+        if input_ids is None:
+            raise NotImplementedError(
+                "DFlashVLMTargetWrapper: mlx_vlm Gemma 4 cannot derive "
+                "per-layer inputs from input_embeddings alone. Provide "
+                "input_ids (text-only target path)."
+            )
+        return self._inner.get_per_layer_inputs(input_ids)
+
+    @property
+    def _project_per_layer_inputs(self):
+        # Signature already matches (input_embeddings, per_layer_inputs=None).
+        return self._inner.project_per_layer_inputs
+
+    def __getattr__(self, name: str) -> Any:
+        # Raw slot read: ``self._inner`` would recurse forever while unset.
+        return getattr(object.__getattribute__(self, "_inner"), name)
+
+
+class _LangModelView:
+    """``LanguageModel`` proxy exposing ``args`` and a wrapped ``model``."""
+
+    __slots__ = ("_lm", "_inner_view", "_args_view")
+
+    def __init__(self, lm: Any) -> None:
+        object.__setattr__(self, "_lm", lm)
+        object.__setattr__(self, "_inner_view", _InnerView(lm.model))
+        object.__setattr__(self, "_args_view", _ArgsView(lm.config))
+
+    @property
+    def args(self) -> Any:
+        return self._args_view
+
+    @property
+    def model(self) -> Any:
+        return self._inner_view
+
+    def __getattr__(self, name: str) -> Any:  # raw slot read: cannot recurse
+        return getattr(object.__getattribute__(self, "_lm"), name)
+
+
+class DFlashVLMTargetWrapper:
+    """Wrap an mlx_vlm Gemma 4 model so it presents an mlx_lm-shaped surface
+    to ``dflash_mlx.engine.target_gemma4.Gemma4TargetOps``.
+    """
+
+    __slots__ = ("_vlm", "_lm_view")
+
+    def __init__(self, vlm_model: Any) -> None:
+        lm_view = _LangModelView(vlm_model.language_model)
+        object.__setattr__(self, "_vlm", vlm_model)
+        object.__setattr__(self, "_lm_view", lm_view)
+
+    @property
+    def language_model(self) -> Any:
+        return self._lm_view
+
+    def __getattr__(self, name: str) -> Any:
+        # Raw slot read: ``self._vlm`` would recurse forever while unset.
+        return getattr(object.__getattribute__(self, "_vlm"), name)
diff --git a/tests/test_dflash_engine.py b/tests/test_dflash_engine.py
index 011e76aa..1bd047d3 100644
--- a/tests/test_dflash_engine.py
+++ b/tests/test_dflash_engine.py
@@ -216,8 +216,14 @@ def test_cache_stats_returns_none(self):
         )
         assert engine.get_cache_stats() is None
 
-    def test_should_fallback_unlimited_when_max_ctx_none(self):
-        """A None threshold means dflash handles every prompt size."""
+    def test_route_unlimited_when_max_ctx_none(self):
+        """A None threshold means dflash handles every prompt size.
+
+        Path A renamed ``_should_fallback`` to ``_route`` and made it return
+        a string (``"dflash"`` vs ``"bg"``) instead of a bool. The underlying
+        ``dflash_max_ctx`` gate still exists; this test keeps the same
+        coverage against the new string return value.
+        """
         try:
             from omlx.engine.dflash import DFlashEngine
         except ImportError:
@@ -228,9 +234,13 @@ def test_should_fallback_unlimited_when_max_ctx_none(self):
             draft_model_path="test-draft",
             model_settings=ModelSettings(dflash_max_ctx=None),
         )
-        assert engine._should_fallback([0] * 10_000) is False
+        assert engine._route([0] * 10_000) == "dflash"
 
-    def test_should_fallback_triggers_at_threshold(self):
+    def test_route_triggers_bg_at_max_ctx_threshold(self):
+        """Path A: prompts >= dflash_max_ctx route to the BG engine.
+
+        Successor to ``test_should_fallback_triggers_at_threshold``.
+        """
         try:
             from omlx.engine.dflash import DFlashEngine
         except ImportError:
@@ -241,8 +251,8 @@ def test_should_fallback_triggers_at_threshold(self):
             draft_model_path="test-draft",
             model_settings=ModelSettings(dflash_max_ctx=4096),
         )
-        assert engine._should_fallback([0] * 4095) is False
-        assert engine._should_fallback([0] * 4096) is True
+        assert engine._route([0] * 4095) == "dflash"
+        assert engine._route([0] * 4096) == "bg"
 
     def test_build_quant_spec(self):
         try: