jundot · panwudi · May 12, 2026
diff --git a/omlx/admin/benchmark.py b/omlx/admin/benchmark.py
@@ -152,13 +152,18 @@ def _compute_single_metrics(
     ttft_ms = ttft_s * 1000
     tpot_ms = (gen_duration / max(completion_tokens - 1, 1)) * 1000
     gen_tps = completion_tokens / max(gen_duration, 1e-9)
+    # wall_tg_tps: gen tokens over total wall (includes prefill). Used as the
+    # 1x baseline for Continuous Batching speedup so the ratio is symmetric
+    # with batch tg_tps (which is also total_gen / wall_time).
+    wall_tg_tps = completion_tokens / max(e2e_duration, 1e-9)
     processing_tps = prompt_tokens / max(ttft_s, 1e-9)
     total_throughput = (prompt_tokens + completion_tokens) / max(e2e_duration, 1e-9)
 
     return {
         "ttft_ms": round(ttft_ms, 1),
         "tpot_ms": round(tpot_ms, 2),
         "gen_tps": round(gen_tps, 1),
+        "wall_tg_tps": round(wall_tg_tps, 1),
         "processing_tps": round(processing_tps, 1),
         "e2e_latency_s": round(e2e_duration, 3),
         "total_throughput": round(total_throughput, 1),
@@ -238,6 +243,76 @@ async def _run_single_test(
     )
 
 
+
+async def _run_batch_test_via_stream_generate(
+    engine: Any,
+    prompts: list[str],
+    prompt_tokens: int,
+    max_tokens: int,
+    batch_size: int,
+) -> dict:
+    """Batch test path for engines that only expose high-level stream_generate
+    (no engine_core). Used for DFlashEngine where concurrent requests
+    serialize through _active_request; metrics will reflect serial behavior.
+    """
+
+    async def _single_request(prompt: str) -> dict:
+        start = time.perf_counter()
+        first_token = None
+        completion_tokens = 0
+        async for output in engine.stream_generate(
+            prompt=prompt, max_tokens=max_tokens, temperature=0.0
+        ):
+            ct = getattr(output, "completion_tokens", None)
+            if ct is not None and ct > 0 and first_token is None:
+                first_token = time.perf_counter()
+            if ct is not None:
+                completion_tokens = ct
+            if getattr(output, "finished", False):
+                break
+        end = time.perf_counter()
+        if first_token is None:
+            first_token = end
+        return {
+            "ttft_s": first_token - start,
+            "first_token_abs": first_token,
+            "completion_tokens": completion_tokens,
+            "wall_s": end - start,
+        }
+
+    wall_start = time.perf_counter()
+    results = await asyncio.gather(
+        *[_single_request(prompts[i]) for i in range(batch_size)]
+    )
+    wall_end = time.perf_counter()
+
+    total_gen_tokens = sum(r["completion_tokens"] for r in results)
+    total_prompt_tokens = prompt_tokens * batch_size
+    wall_time = wall_end - wall_start
+    avg_ttft_ms = (sum(r["ttft_s"] for r in results) / batch_size) * 1000
+
+    # pp TPS: total prompt tokens / time until ALL requests finish prefill
+    max_first_token = max(r["first_token_abs"] for r in results)
+    prefill_wall_time = max_first_token - wall_start
+    pp_tps = total_prompt_tokens / max(prefill_wall_time, 1e-9)
+
+    # tg TPS: wall-aggregate (total_gen / wall_time). Same formula used in
+    # the engine_core batch path so DFlash ↔ BatchedEngine ratios are
+    # symmetric. The alternative (gen_wall_time = wall_end - max_first_token)
+    # is inflated for any engine that serializes prefill or decode, and
+    # makes cross-engine speedup columns meaningless. See _run_batch_test.
+    tg_tps = total_gen_tokens / max(wall_time, 1e-9)
+
+    return {
+        "pp_tps": round(pp_tps, 1),
+        "tg_tps": round(tg_tps, 1),
+        "avg_ttft_ms": round(avg_ttft_ms, 1),
+        "e2e_latency_s": round(wall_time, 3),
+        "total_gen_tokens": total_gen_tokens,
+        "batch_size": batch_size,
+    }
+
+
 async def _run_batch_test(
     engine: Any,
     prompts: list[str],
@@ -247,7 +322,7 @@ async def _run_batch_test(
 ) -> dict:
     """Run a continuous batching benchmark test.
 
-    Submits batch_size concurrent requests via the engine core and measures
+    Submits batch_size concurrent requests via the engine and measures
     aggregate throughput including pp TPS and tg TPS.
 
     Args:
@@ -256,6 +331,18 @@ async def _run_batch_test(
                  has a unique UUID prefix.
         prompt_tokens: Number of prompt tokens per request (for pp TPS calc).
     """
+    # Dispatch: engines exposing engine_core (BatchedEngine, VLMBatchedEngine)
+    # use add_request/stream_outputs; DFlashEngine uses stream_generate (high
+    # level) and serializes requests via its _active_request lock.
+    if not hasattr(engine, "_engine"):
+        return await _run_batch_test_via_stream_generate(
+            engine=engine,
+            prompts=prompts,
+            prompt_tokens=prompt_tokens,
+            max_tokens=max_tokens,
+            batch_size=batch_size,
+        )
+
     from ..request import SamplingParams
 
     engine_core = engine._engine
@@ -314,10 +401,10 @@ async def _single_request(prompt: str) -> dict:
     prefill_wall_time = max_first_token - wall_start
     pp_tps = total_prompt_tokens / max(prefill_wall_time, 1e-9)
 
-    # tg TPS: total generated tokens / generation wall time
-    # Generation starts when the last request finishes prefill
-    gen_wall_time = wall_end - max_first_token
-    tg_tps = total_gen_tokens / max(gen_wall_time, 1e-9)
+    # tg TPS: wall-aggregate (total_gen / wall_time), same as the DFlash
+    # path. Honest cross-engine ratio with the Single Request wall_tg_tps
+    # baseline — see _run_batch_test_via_stream_generate.
+    tg_tps = total_gen_tokens / max(wall_time, 1e-9)
 
     return {
         "pp_tps": round(pp_tps, 1),
@@ -502,7 +589,11 @@ async def _upload_to_omlx_ai(run: BenchmarkRun, engine_pool: Any) -> None:
         (r for r in single_results if r.get("pp") == 1024), None
     )
     if pp1024_single and batch_results:
-        baseline_tps = pp1024_single["gen_tps"]
+        # Use wall-aggregate baseline so 1x ↔ Nx ratios stay honest across
+        # engine types. Single Request table keeps showing gen_tps (peak
+        # decode rate) for context-length comparisons; Continuous Batching
+        # column needs the symmetric metric.
+        baseline_tps = pp1024_single["wall_tg_tps"]
         batching_results.append({
             "batch_size": 1,
             "tg_tps": baseline_tps,
@@ -536,6 +627,9 @@ async def _upload_to_omlx_ai(run: BenchmarkRun, engine_pool: Any) -> None:
             "quantization": quantization,
             "context_length": context_length,
             "pp_tps": result["processing_tps"],
+            # Community board metric: peak decode rate (gen_tps, gen-only),
+            # NOT the wall-aggregate used for in-UI Continuous Batching
+            # speedup. Do not "tidy" this to wall_tg_tps.
             "tg_tps": result["gen_tps"],
             "ttft_ms": result.get("ttft_ms"),
             "peak_memory_gb": peak_mem_gb,
@@ -760,13 +854,12 @@ async def run_benchmark(run: BenchmarkRun, engine_pool: Any) -> None:
         batch_prompts = [_generate_prompt(tokenizer, 1024) for _ in range(max_batch)]
 
-        # Skip batch tests for engines without scheduler core (e.g. DFlashEngine)
+        # Batch tests run for every engine, with or without a scheduler core.
-        if request.batch_sizes and not hasattr(engine, "_engine"):
-            logger.info(
-                "Batch test skipped: engine does not support concurrent batching"
-            )
-            current_test += len(request.batch_sizes)
-
-        for batch_size in request.batch_sizes if hasattr(engine, "_engine") else []:
+        # NOTE: DFlashEngine doesn't expose engine_core (`_engine`) but does
+        # support concurrent requests by serializing them through its
+        # _active_request lock. Running batch test on DFlash yields valid
+        # metrics that show serialization behavior (wall time scales ~linearly
+        # with batch size, no aggregate gain) — better than silently skipping.
+        for batch_size in request.batch_sizes:
             current_test += 1
             await _send_event(run, {
                 "type": "progress",

diff --git a/omlx/admin/i18n/en.json b/omlx/admin/i18n/en.json
@@ -688,5 +688,28 @@
   "js.error.delete_model_failed": "Failed to delete model",
   "js.error.delete_model_connection": "Failed to delete model. Check server connection.",
   "js.success.download_started": "Download started: {repo_id}",
-  "js.success.settings_saved": "Settings saved successfully"
-}
+  "js.success.settings_saved": "Settings saved successfully",
+  "modal.model_settings.dflash_hint": "Block diffusion speculative decoding for 3-4x faster generation. Requires a DFlash draft model checkpoint.<br><strong>Single-stream only: requests run one at a time.</strong><br>* MLX impl by bstnxbt(<a href=\"https://github.com/bstnxbt/dflash-mlx\" target=\"_blank\" rel=\"noopener\" class=\"text-blue-500 hover:text-blue-700 underline\">GitHub</a>)",
+  "modal.model_settings.dflash_draft_model": "Draft Model",
+  "modal.model_settings.dflash_draft_model_placeholder": "Select draft model...",
+  "modal.model_settings.dflash_draft_model_help": "DFlash draft checkpoint (e.g. z-lab/Qwen3-4B-DFlash-b16, z-lab/gemma-4-26B-A4B-it-DFlash)",
+  "modal.model_settings.dflash_draft_quant": "Draft Quantization",
+  "modal.model_settings.dflash_draft_quant_help": "Quantization of the draft model only — independent of target model quantization.",
+  "modal.model_settings.dflash_draft_quant_bf16": "bf16 (default)",
+  "modal.model_settings.dflash_max_ctx": "Max Context (fallback threshold)",
+  "modal.model_settings.dflash_max_ctx_placeholder": "unlimited",
+  "modal.model_settings.dflash_max_ctx_help": "Prompts at or above this token count switch to BatchedEngine. Leave empty for unlimited.",
+  "modal.model_settings.dflash_max_concurrent": "Max Concurrent",
+  "modal.model_settings.dflash_max_concurrent_placeholder": "4 (default)",
+  "modal.model_settings.dflash_max_concurrent_help": "Cap on simultaneous in-flight DFlash requests; excess requests queue at the gate. DFlash decode is single-stream by design — this value does NOT increase throughput. It is a resource admission gate that bounds memory under bursts (each in-flight request holds its own KV cache, hundreds of MB to several GB) and keeps tail latency predictable. Default 4. On tight memory (< 64 GB) set 1–2; on 128 GB or more set 8. Leave empty for unlimited (not recommended unless you trust the upstream load shape).",
+  "modal.model_settings.dflash_l1_cache": "In-memory cache",
+  "modal.model_settings.dflash_l1_cache_hint": "DFlash L1 prefix snapshot cache in RAM. Speeds up multi-turn chats with shared prefixes.",
+  "modal.model_settings.dflash_l1_max_entries": "In-memory cache max entries",
+  "modal.model_settings.dflash_l1_max_entries_help": "Maximum number of prefix snapshots kept in L1 cache. Each entry stores KV + draft GDN state for one conversation prefix.",
+  "modal.model_settings.dflash_l1_max_gib": "In-memory cache size (GiB)",
+  "modal.model_settings.dflash_l1_max_gib_help": "Byte budget for L1 snapshots; LRU evicts when exceeded.",
+  "modal.model_settings.dflash_l2_cache": "SSD cache",
+  "modal.model_settings.dflash_l2_cache_hint": "L2 spill of evicted L1 entries to disk. Uses the oMLX paged SSD cache directory (<code>dflash_l2/</code>).",
+  "modal.model_settings.dflash_l2_unavailable": "Enable oMLX paged SSD cache first (<code>--paged-ssd-cache-dir</code>).",
+  "modal.model_settings.dflash_l2_requires_l1": "Requires in-memory cache to be enabled."
+}
diff --git a/omlx/admin/i18n/zh-TW.json b/omlx/admin/i18n/zh-TW.json
@@ -688,5 +688,28 @@
   "js.error.delete_model_failed": "刪除模型失敗",
   "js.error.delete_model_connection": "刪除模型失敗，請檢查伺服器連線。",
   "js.success.download_started": "已開始下載：{repo_id}",
-  "js.success.settings_saved": "設定儲存成功"
+  "js.success.settings_saved": "設定儲存成功",
+  "modal.model_settings.dflash_hint": "區塊擴散推測解碼，單請求可加速 3-4 倍。需要 DFlash 草稿模型 checkpoint。<br><strong>僅單流：請求按順序處理。</strong><br>* MLX 實作：bstnxbt(<a href=\"https://github.com/bstnxbt/dflash-mlx\" target=\"_blank\" rel=\"noopener\" class=\"text-blue-500 hover:text-blue-700 underline\">GitHub</a>)",
+  "modal.model_settings.dflash_draft_model": "草稿模型",
+  "modal.model_settings.dflash_draft_model_placeholder": "選擇草稿模型...",
+  "modal.model_settings.dflash_draft_model_help": "DFlash 草稿 checkpoint（例如 z-lab/Qwen3-4B-DFlash-b16、z-lab/gemma-4-26B-A4B-it-DFlash）",
+  "modal.model_settings.dflash_draft_quant": "草稿模型量化",
+  "modal.model_settings.dflash_draft_quant_help": "僅影響草稿模型量化，與主模型量化獨立。",
+  "modal.model_settings.dflash_draft_quant_bf16": "bf16（預設）",
+  "modal.model_settings.dflash_max_ctx": "最大上下文（fallback 閾值）",
+  "modal.model_settings.dflash_max_ctx_placeholder": "不限",
+  "modal.model_settings.dflash_max_ctx_help": "提示長度達到或超過此 token 數時切換到 BatchedEngine。留空為不限。",
+  "modal.model_settings.dflash_max_concurrent": "最大並發",
+  "modal.model_settings.dflash_max_concurrent_placeholder": "4（預設）",
+  "modal.model_settings.dflash_max_concurrent_help": "DFlash 同時處理中的請求數上限，超出的請求在入口排隊等空位。DFlash 解碼本身是嚴格單流設計——這個值不會提高吞吐速率，作用是資源 admission 閘門：突發流量下控制記憶體佔用（每個處理中的請求各自持有 KV cache，幾百 MB 到幾 GB），同時讓尾延遲可預測。預設 4。記憶體緊（< 64 GB）建議設 1-2；128 GB+ 可設 8。留空為不限（除非你確信上游流量形態可控，否則不推薦）。",
+  "modal.model_settings.dflash_l1_cache": "記憶體快取",
+  "modal.model_settings.dflash_l1_cache_hint": "DFlash L1 前綴快照記憶體快取。加速共享前綴的多輪對話。",
+  "modal.model_settings.dflash_l1_max_entries": "記憶體快取最大項目數",
+  "modal.model_settings.dflash_l1_max_entries_help": "L1 快取中保留的最大前綴快照數。每個項目儲存一個對話前綴的 KV + 草稿 GDN 狀態。",
+  "modal.model_settings.dflash_l1_max_gib": "記憶體快取大小（GiB）",
+  "modal.model_settings.dflash_l1_max_gib_help": "L1 快照位元組預算；超過時 LRU 淘汰。",
+  "modal.model_settings.dflash_l2_cache": "SSD 快取",
+  "modal.model_settings.dflash_l2_cache_hint": "L1 淘汰項目 spill 到磁碟的 L2 快取。使用 oMLX paged SSD cache 目錄（<code>dflash_l2/</code>）。",
+  "modal.model_settings.dflash_l2_unavailable": "請先啟用 oMLX paged SSD cache（<code>--paged-ssd-cache-dir</code>）。",
+  "modal.model_settings.dflash_l2_requires_l1": "需要先啟用記憶體快取。"
 }
diff --git a/omlx/admin/i18n/zh.json b/omlx/admin/i18n/zh.json
@@ -688,5 +688,28 @@
   "js.error.delete_model_failed": "删除模型失败",
   "js.error.delete_model_connection": "删除模型失败，请检查服务器连接。",
   "js.success.download_started": "已开始下载：{repo_id}",
-  "js.success.settings_saved": "设置已成功保存"
+  "js.success.settings_saved": "设置已成功保存",
+  "modal.model_settings.dflash_hint": "块扩散投机解码，单请求可加速 3-4 倍。需要 DFlash 草稿模型 checkpoint。<br><strong>仅单流：请求按顺序处理。</strong><br>* MLX 实现：bstnxbt(<a href=\"https://github.com/bstnxbt/dflash-mlx\" target=\"_blank\" rel=\"noopener\" class=\"text-blue-500 hover:text-blue-700 underline\">GitHub</a>)",
+  "modal.model_settings.dflash_draft_model": "草稿模型",
+  "modal.model_settings.dflash_draft_model_placeholder": "选择草稿模型...",
+  "modal.model_settings.dflash_draft_model_help": "DFlash 草稿 checkpoint（例如 z-lab/Qwen3-4B-DFlash-b16、z-lab/gemma-4-26B-A4B-it-DFlash）",
+  "modal.model_settings.dflash_draft_quant": "草稿模型量化",
+  "modal.model_settings.dflash_draft_quant_help": "仅影响草稿模型量化，与主模型量化独立。",
+  "modal.model_settings.dflash_draft_quant_bf16": "bf16（默认）",
+  "modal.model_settings.dflash_max_ctx": "最大上下文（fallback 阈值）",
+  "modal.model_settings.dflash_max_ctx_placeholder": "不限",
+  "modal.model_settings.dflash_max_ctx_help": "提示长度达到或超过此 token 数时切换到 BatchedEngine。留空为不限。",
+  "modal.model_settings.dflash_max_concurrent": "最大并发",
+  "modal.model_settings.dflash_max_concurrent_placeholder": "4（默认）",
+  "modal.model_settings.dflash_max_concurrent_help": "DFlash 同时处理中的请求数上限，超出的请求在入口排队等空位。DFlash 解码本身是严格单流设计——这个值不会提高吞吐速率，作用是资源 admission 闸门：突发流量下控制内存占用（每个处理中的请求各自持有 KV cache，几百 MB 到几 GB），同时让尾延迟可预测。默认 4。内存紧（< 64 GB）建议设 1-2；128 GB+ 可设 8。留空为不限（除非你确信上游流量形态可控，否则不推荐）。",
+  "modal.model_settings.dflash_l1_cache": "内存缓存",
+  "modal.model_settings.dflash_l1_cache_hint": "DFlash L1 前缀快照内存缓存。加速共享前缀的多轮对话。",
+  "modal.model_settings.dflash_l1_max_entries": "内存缓存最大条目数",
+  "modal.model_settings.dflash_l1_max_entries_help": "L1 缓存中保留的最大前缀快照数。每条条目存储一个对话前缀的 KV + 草稿 GDN 状态。",
+  "modal.model_settings.dflash_l1_max_gib": "内存缓存大小（GiB）",
+  "modal.model_settings.dflash_l1_max_gib_help": "L1 快照字节预算；超过时 LRU 淘汰。",
+  "modal.model_settings.dflash_l2_cache": "SSD 缓存",
+  "modal.model_settings.dflash_l2_cache_hint": "L1 淘汰条目 spill 到磁盘的 L2 缓存。使用 oMLX paged SSD cache 目录（<code>dflash_l2/</code>）。",
+  "modal.model_settings.dflash_l2_unavailable": "请先启用 oMLX paged SSD cache（<code>--paged-ssd-cache-dir</code>）。",
+  "modal.model_settings.dflash_l2_requires_l1": "需要先启用内存缓存。"
 }
diff --git a/omlx/admin/routes.py b/omlx/admin/routes.py
@@ -135,6 +135,7 @@ class ModelSettingsRequest(BaseModel):
     dflash_draft_quant_activation_bits: Optional[int] = None
     dflash_draft_quant_group_size: Optional[int] = None
     dflash_max_ctx: Optional[int] = None
+    dflash_max_concurrent: Optional[int] = None
     dflash_in_memory_cache: Optional[bool] = None
     dflash_in_memory_cache_max_entries: Optional[int] = None
     dflash_in_memory_cache_max_bytes: Optional[int] = None
@@ -1662,6 +1663,7 @@ async def list_models(is_admin: bool = Depends(require_admin)):
                 "dflash_draft_quant_activation_bits": settings.dflash_draft_quant_activation_bits,
                 "dflash_draft_quant_group_size": settings.dflash_draft_quant_group_size,
                 "dflash_max_ctx": settings.dflash_max_ctx,
+                "dflash_max_concurrent": settings.dflash_max_concurrent,
                 "dflash_in_memory_cache": settings.dflash_in_memory_cache,
                 "dflash_in_memory_cache_max_entries": settings.dflash_in_memory_cache_max_entries,
                 "dflash_in_memory_cache_max_bytes": settings.dflash_in_memory_cache_max_bytes,
@@ -1946,6 +1948,10 @@ async def update_model_settings(
         # 0/None means "unlimited" — the engine treats None as no fallback threshold
         value = request.dflash_max_ctx
         current_settings.dflash_max_ctx = value if value and value > 0 else None
+    if "dflash_max_concurrent" in sent:
+        # 0/negative/None means "unlimited" — the engine treats None as no concurrency cap
+        value = request.dflash_max_concurrent
+        current_settings.dflash_max_concurrent = value if value and value > 0 else None
     if "dflash_in_memory_cache" in sent:
         current_settings.dflash_in_memory_cache = bool(request.dflash_in_memory_cache)
     if "dflash_in_memory_cache_max_entries" in sent: