diff --git a/omlx/admin/benchmark.py b/omlx/admin/benchmark.py
index 7f0d620b..3788f703 100644
--- a/omlx/admin/benchmark.py
+++ b/omlx/admin/benchmark.py
@@ -152,6 +152,10 @@ def _compute_single_metrics(
ttft_ms = ttft_s * 1000
tpot_ms = (gen_duration / max(completion_tokens - 1, 1)) * 1000
gen_tps = completion_tokens / max(gen_duration, 1e-9)
+ # wall_tg_tps: gen tokens over total wall (includes prefill). Used as the
+ # 1x baseline for Continuous Batching speedup so the ratio is symmetric
+ # with batch tg_tps (which is also total_gen / wall_time).
+ wall_tg_tps = completion_tokens / max(e2e_duration, 1e-9)
processing_tps = prompt_tokens / max(ttft_s, 1e-9)
total_throughput = (prompt_tokens + completion_tokens) / max(e2e_duration, 1e-9)
@@ -159,6 +163,7 @@ def _compute_single_metrics(
"ttft_ms": round(ttft_ms, 1),
"tpot_ms": round(tpot_ms, 2),
"gen_tps": round(gen_tps, 1),
+ "wall_tg_tps": round(wall_tg_tps, 1),
"processing_tps": round(processing_tps, 1),
"e2e_latency_s": round(e2e_duration, 3),
"total_throughput": round(total_throughput, 1),
@@ -238,6 +243,76 @@ async def _run_single_test(
)
+
+async def _run_batch_test_via_stream_generate(
+ engine: Any,
+ prompts: list[str],
+ prompt_tokens: int,
+ max_tokens: int,
+ batch_size: int,
+) -> dict:
+ """Run a batch test through the high-level stream_generate API (no engine_core).
+ Used for DFlashEngine, where concurrent requests serialize through
+ _active_request, so metrics reflect serial behavior. Returns a dict with
+ pp_tps, tg_tps, avg_ttft_ms, e2e_latency_s, total_gen_tokens, batch_size."""
+
+ async def _single_request(prompt: str) -> dict:  # one streamed request, timed on its own clock
+ start = time.perf_counter()
+ first_token = None  # perf_counter() reading when the first token is observed
+ completion_tokens = 0
+ async for output in engine.stream_generate(
+ prompt=prompt, max_tokens=max_tokens, temperature=0.0
+ ):
+ ct = getattr(output, "completion_tokens", None)  # treated as a running total (overwritten, not summed)
+ if ct is not None and ct > 0 and first_token is None:
+ first_token = time.perf_counter()
+ if ct is not None:
+ completion_tokens = ct
+ if getattr(output, "finished", False):
+ break
+ end = time.perf_counter()
+ if first_token is None:  # no output ever reported a positive token count
+ first_token = end
+ return {
+ "ttft_s": first_token - start,
+ "first_token_abs": first_token,
+ "completion_tokens": completion_tokens,
+ "wall_s": end - start,
+ }
+
+ wall_start = time.perf_counter()
+ results = await asyncio.gather(
+ *[_single_request(prompts[i]) for i in range(batch_size)]  # IndexError if len(prompts) < batch_size
+ )
+ wall_end = time.perf_counter()
+
+ total_gen_tokens = sum(r["completion_tokens"] for r in results)
+ total_prompt_tokens = prompt_tokens * batch_size
+ wall_time = wall_end - wall_start
+ avg_ttft_ms = (sum(r["ttft_s"] for r in results) / batch_size) * 1000  # mean TTFT across requests, in ms
+
+ # pp TPS: total prompt tokens / time until ALL requests finish prefill
+ max_first_token = max(r["first_token_abs"] for r in results)
+ prefill_wall_time = max_first_token - wall_start  # NOTE(review): if requests serialize, this span also covers earlier requests' decode — confirm intended
+ pp_tps = total_prompt_tokens / max(prefill_wall_time, 1e-9)
+
+ # tg TPS: wall-aggregate (total_gen / wall_time). Same formula used in
+ # the engine_core batch path so DFlash ↔ BatchedEngine ratios are
+ # symmetric. The alternative (gen_wall_time = wall_end - max_first_token)
+ # is inflated for any engine that serializes prefill or decode, and
+ # makes cross-engine speedup columns meaningless. See _run_batch_test.
+ tg_tps = total_gen_tokens / max(wall_time, 1e-9)
+
+ return {
+ "pp_tps": round(pp_tps, 1),
+ "tg_tps": round(tg_tps, 1),
+ "avg_ttft_ms": round(avg_ttft_ms, 1),
+ "e2e_latency_s": round(wall_time, 3),
+ "total_gen_tokens": total_gen_tokens,
+ "batch_size": batch_size,
+ }
+
+
async def _run_batch_test(
engine: Any,
prompts: list[str],
@@ -247,7 +322,7 @@ async def _run_batch_test(
) -> dict:
"""Run a continuous batching benchmark test.
- Submits batch_size concurrent requests via the engine core and measures
+ Submits batch_size concurrent requests via the engine and measures
aggregate throughput including pp TPS and tg TPS.
Args:
@@ -256,6 +331,18 @@ async def _run_batch_test(
has a unique UUID prefix.
prompt_tokens: Number of prompt tokens per request (for pp TPS calc).
"""
+ # Dispatch: engines exposing engine_core (BatchedEngine, VLMBatchedEngine)
+ # use add_request/stream_outputs; DFlashEngine uses stream_generate (high
+ # level) and serializes requests via its _active_request lock.
+ if not hasattr(engine, "_engine"):
+ return await _run_batch_test_via_stream_generate(
+ engine=engine,
+ prompts=prompts,
+ prompt_tokens=prompt_tokens,
+ max_tokens=max_tokens,
+ batch_size=batch_size,
+ )
+
from ..request import SamplingParams
engine_core = engine._engine
@@ -314,10 +401,10 @@ async def _single_request(prompt: str) -> dict:
prefill_wall_time = max_first_token - wall_start
pp_tps = total_prompt_tokens / max(prefill_wall_time, 1e-9)
- # tg TPS: total generated tokens / generation wall time
- # Generation starts when the last request finishes prefill
- gen_wall_time = wall_end - max_first_token
- tg_tps = total_gen_tokens / max(gen_wall_time, 1e-9)
+ # tg TPS: wall-aggregate (total_gen / wall_time), same as the DFlash
+ # path. Honest cross-engine ratio with the Single Request wall_tg_tps
+ # baseline — see _run_batch_test_via_stream_generate.
+ tg_tps = total_gen_tokens / max(wall_time, 1e-9)
return {
"pp_tps": round(pp_tps, 1),
@@ -502,7 +589,11 @@ async def _upload_to_omlx_ai(run: BenchmarkRun, engine_pool: Any) -> None:
(r for r in single_results if r.get("pp") == 1024), None
)
if pp1024_single and batch_results:
- baseline_tps = pp1024_single["gen_tps"]
+ # Use wall-aggregate baseline so 1x ↔ Nx ratios stay honest across
+ # engine types. Single Request table keeps showing gen_tps (peak
+ # decode rate) for context-length comparisons; Continuous Batching
+ # column needs the symmetric metric.
+ baseline_tps = pp1024_single["wall_tg_tps"]
batching_results.append({
"batch_size": 1,
"tg_tps": baseline_tps,
@@ -536,6 +627,9 @@ async def _upload_to_omlx_ai(run: BenchmarkRun, engine_pool: Any) -> None:
"quantization": quantization,
"context_length": context_length,
"pp_tps": result["processing_tps"],
+ # Community board metric: peak decode rate (gen_tps, gen-only),
+ # NOT the wall-aggregate used for in-UI Continuous Batching
+ # speedup. Do not "tidy" this to wall_tg_tps.
"tg_tps": result["gen_tps"],
"ttft_ms": result.get("ttft_ms"),
"peak_memory_gb": peak_mem_gb,
@@ -760,13 +854,12 @@ async def run_benchmark(run: BenchmarkRun, engine_pool: Any) -> None:
batch_prompts = [_generate_prompt(tokenizer, 1024) for _ in range(max_batch)]
- # Skip batch tests for engines without scheduler core (e.g. DFlashEngine)
+ # Batch tests run for every engine, including those without a scheduler core.
- if request.batch_sizes and not hasattr(engine, "_engine"):
- logger.info(
- "Batch test skipped: engine does not support concurrent batching"
- )
- current_test += len(request.batch_sizes)
-
- for batch_size in request.batch_sizes if hasattr(engine, "_engine") else []:
+ # NOTE: DFlashEngine doesn't expose engine_core (`_engine`) but does
+ # support concurrent requests by serializing them through its
+ # _active_request lock. Running batch test on DFlash yields valid
+ # metrics that show serialization behavior (4x wall time, no
+ # aggregate gain) — strictly more useful than silently skipping.
+ for batch_size in request.batch_sizes:
current_test += 1
await _send_event(run, {
"type": "progress",
diff --git a/omlx/admin/i18n/en.json b/omlx/admin/i18n/en.json
index 46c349b3..99f1279d 100644
--- a/omlx/admin/i18n/en.json
+++ b/omlx/admin/i18n/en.json
@@ -688,5 +688,28 @@
"js.error.delete_model_failed": "Failed to delete model",
"js.error.delete_model_connection": "Failed to delete model. Check server connection.",
"js.success.download_started": "Download started: {repo_id}",
- "js.success.settings_saved": "Settings saved successfully"
-}
\ No newline at end of file
+ "js.success.settings_saved": "Settings saved successfully",
+ "modal.model_settings.dflash_hint": "Block diffusion speculative decoding for 3-4x faster generation. Requires a DFlash draft model checkpoint.<br>Single-stream only: requests run one at a time.<br>* MLX impl by bstnxbt(GitHub)",
+ "modal.model_settings.dflash_draft_model": "Draft Model",
+ "modal.model_settings.dflash_draft_model_placeholder": "Select draft model...",
+ "modal.model_settings.dflash_draft_model_help": "DFlash draft checkpoint (e.g. z-lab/Qwen3-4B-DFlash-b16, z-lab/gemma-4-26B-A4B-it-DFlash)",
+ "modal.model_settings.dflash_draft_quant": "Draft Quantization",
+ "modal.model_settings.dflash_draft_quant_help": "Quantization of the draft model only — independent of target model quantization.",
+ "modal.model_settings.dflash_draft_quant_bf16": "bf16 (default)",
+ "modal.model_settings.dflash_max_ctx": "Max Context (fallback threshold)",
+ "modal.model_settings.dflash_max_ctx_placeholder": "unlimited",
+ "modal.model_settings.dflash_max_ctx_help": "Prompts at or above this token count switch to BatchedEngine. Leave empty for unlimited.",
+ "modal.model_settings.dflash_max_concurrent": "Max Concurrent",
+ "modal.model_settings.dflash_max_concurrent_placeholder": "4 (default)",
+ "modal.model_settings.dflash_max_concurrent_help": "Cap on simultaneous in-flight DFlash requests; excess requests queue at the gate. DFlash decode is single-stream by design — this value does NOT increase throughput. It is a resource admission gate that bounds memory under bursts (each in-flight request holds its own KV cache, hundreds of MB to several GB) and keeps tail latency predictable. Default 4. On tight memory (< 64 GB) set 1–2; on 128 GB or more set 8. Leave empty for unlimited (not recommended unless you trust the upstream load shape).",
+ "modal.model_settings.dflash_l1_cache": "In-memory cache",
+ "modal.model_settings.dflash_l1_cache_hint": "DFlash L1 prefix snapshot cache in RAM. Speeds up multi-turn chats with shared prefixes.",
+ "modal.model_settings.dflash_l1_max_entries": "In-memory cache max entries",
+ "modal.model_settings.dflash_l1_max_entries_help": "Maximum number of prefix snapshots kept in L1 cache. Each entry stores KV + draft GDN state for one conversation prefix.",
+ "modal.model_settings.dflash_l1_max_gib": "In-memory cache size (GiB)",
+ "modal.model_settings.dflash_l1_max_gib_help": "Byte budget for L1 snapshots; LRU evicts when exceeded.",
+ "modal.model_settings.dflash_l2_cache": "SSD cache",
+ "modal.model_settings.dflash_l2_cache_hint": "L2 spill of evicted L1 entries to disk. Uses the oMLX paged SSD cache directory (dflash_l2/).",
+ "modal.model_settings.dflash_l2_unavailable": "Enable oMLX paged SSD cache first (--paged-ssd-cache-dir).",
+ "modal.model_settings.dflash_l2_requires_l1": "Requires in-memory cache to be enabled."
+}
diff --git a/omlx/admin/i18n/zh-TW.json b/omlx/admin/i18n/zh-TW.json
index 95181843..d314877d 100644
--- a/omlx/admin/i18n/zh-TW.json
+++ b/omlx/admin/i18n/zh-TW.json
@@ -688,5 +688,28 @@
"js.error.delete_model_failed": "刪除模型失敗",
"js.error.delete_model_connection": "刪除模型失敗,請檢查伺服器連線。",
"js.success.download_started": "已開始下載:{repo_id}",
- "js.success.settings_saved": "設定儲存成功"
+ "js.success.settings_saved": "設定儲存成功",
+ "modal.model_settings.dflash_hint": "塊擴散投機解碼,單請求可加速 3-4 倍。需要 DFlash 草稿模型 checkpoint。<br>僅單流:請求按順序處理。<br>* MLX 實作:bstnxbt(GitHub)",
+ "modal.model_settings.dflash_draft_model": "草稿模型",
+ "modal.model_settings.dflash_draft_model_placeholder": "選擇草稿模型...",
+ "modal.model_settings.dflash_draft_model_help": "DFlash 草稿 checkpoint(例如 z-lab/Qwen3-4B-DFlash-b16、z-lab/gemma-4-26B-A4B-it-DFlash)",
+ "modal.model_settings.dflash_draft_quant": "草稿模型量化",
+ "modal.model_settings.dflash_draft_quant_help": "僅影響草稿模型量化,與主模型量化獨立。",
+ "modal.model_settings.dflash_draft_quant_bf16": "bf16(預設)",
+ "modal.model_settings.dflash_max_ctx": "最大上下文(fallback 閾值)",
+ "modal.model_settings.dflash_max_ctx_placeholder": "不限",
+ "modal.model_settings.dflash_max_ctx_help": "提示長度達到或超過此 token 數時切換到 BatchedEngine。留空為不限。",
+ "modal.model_settings.dflash_max_concurrent": "最大並發",
+ "modal.model_settings.dflash_max_concurrent_placeholder": "4(預設)",
+ "modal.model_settings.dflash_max_concurrent_help": "DFlash 同時處理中的請求數上限,超出的請求在入口排隊等空位。DFlash 解碼本身是嚴格單流設計——這個值不會提高吞吐速率,作用是資源 admission 閘門:突發流量下控制記憶體佔用(每個處理中的請求各自持有 KV cache,幾百 MB 到幾 GB),同時讓尾延遲可預測。預設 4。記憶體緊(< 64 GB)建議設 1-2;128 GB+ 可設 8。留空為不限(除非你確信上游流量形態可控,否則不推薦)。",
+ "modal.model_settings.dflash_l1_cache": "記憶體快取",
+ "modal.model_settings.dflash_l1_cache_hint": "DFlash L1 前綴快照記憶體快取。加速共享前綴的多輪對話。",
+ "modal.model_settings.dflash_l1_max_entries": "記憶體快取最大條目數",
+ "modal.model_settings.dflash_l1_max_entries_help": "L1 快取中保留的最大前綴快照數。每條條目儲存一個對話前綴的 KV + 草稿 GDN 狀態。",
+ "modal.model_settings.dflash_l1_max_gib": "記憶體快取大小(GiB)",
+ "modal.model_settings.dflash_l1_max_gib_help": "L1 快照位元組預算;超過時 LRU 淘汰。",
+ "modal.model_settings.dflash_l2_cache": "SSD 快取",
+ "modal.model_settings.dflash_l2_cache_hint": "L1 淘汰條目 spill 到磁碟的 L2 快取。使用 oMLX paged SSD cache 目錄(dflash_l2/)。",
+ "modal.model_settings.dflash_l2_unavailable": "請先啟用 oMLX paged SSD cache(--paged-ssd-cache-dir)。",
+ "modal.model_settings.dflash_l2_requires_l1": "需要先啟用記憶體快取。"
}
diff --git a/omlx/admin/i18n/zh.json b/omlx/admin/i18n/zh.json
index 135aa192..6224ce6c 100644
--- a/omlx/admin/i18n/zh.json
+++ b/omlx/admin/i18n/zh.json
@@ -688,5 +688,28 @@
"js.error.delete_model_failed": "删除模型失败",
"js.error.delete_model_connection": "删除模型失败,请检查服务器连接。",
"js.success.download_started": "已开始下载:{repo_id}",
- "js.success.settings_saved": "设置已成功保存"
+ "js.success.settings_saved": "设置已成功保存",
+ "modal.model_settings.dflash_hint": "块扩散投机解码,单请求可加速 3-4 倍。需要 DFlash 草稿模型 checkpoint。<br>仅单流:请求按顺序处理。<br>* MLX 实现:bstnxbt(GitHub)",
+ "modal.model_settings.dflash_draft_model": "草稿模型",
+ "modal.model_settings.dflash_draft_model_placeholder": "选择草稿模型...",
+ "modal.model_settings.dflash_draft_model_help": "DFlash 草稿 checkpoint(例如 z-lab/Qwen3-4B-DFlash-b16、z-lab/gemma-4-26B-A4B-it-DFlash)",
+ "modal.model_settings.dflash_draft_quant": "草稿模型量化",
+ "modal.model_settings.dflash_draft_quant_help": "仅影响草稿模型量化,与主模型量化独立。",
+ "modal.model_settings.dflash_draft_quant_bf16": "bf16(默认)",
+ "modal.model_settings.dflash_max_ctx": "最大上下文(fallback 阈值)",
+ "modal.model_settings.dflash_max_ctx_placeholder": "不限",
+ "modal.model_settings.dflash_max_ctx_help": "提示长度达到或超过此 token 数时切换到 BatchedEngine。留空为不限。",
+ "modal.model_settings.dflash_max_concurrent": "最大并发",
+ "modal.model_settings.dflash_max_concurrent_placeholder": "4(默认)",
+ "modal.model_settings.dflash_max_concurrent_help": "DFlash 同时处理中的请求数上限,超出的请求在入口排队等空位。DFlash 解码本身是严格单流设计——这个值不会提高吞吐速率,作用是资源 admission 闸门:突发流量下控制内存占用(每个处理中的请求各自持有 KV cache,几百 MB 到几 GB),同时让尾延迟可预测。默认 4。内存紧(< 64 GB)建议设 1-2;128 GB+ 可设 8。留空为不限(除非你确信上游流量形态可控,否则不推荐)。",
+ "modal.model_settings.dflash_l1_cache": "内存缓存",
+ "modal.model_settings.dflash_l1_cache_hint": "DFlash L1 前缀快照内存缓存。加速共享前缀的多轮对话。",
+ "modal.model_settings.dflash_l1_max_entries": "内存缓存最大条目数",
+ "modal.model_settings.dflash_l1_max_entries_help": "L1 缓存中保留的最大前缀快照数。每条条目存储一个对话前缀的 KV + 草稿 GDN 状态。",
+ "modal.model_settings.dflash_l1_max_gib": "内存缓存大小(GiB)",
+ "modal.model_settings.dflash_l1_max_gib_help": "L1 快照字节预算;超过时 LRU 淘汰。",
+ "modal.model_settings.dflash_l2_cache": "SSD 缓存",
+ "modal.model_settings.dflash_l2_cache_hint": "L1 淘汰条目 spill 到磁盘的 L2 缓存。使用 oMLX paged SSD cache 目录(dflash_l2/)。",
+ "modal.model_settings.dflash_l2_unavailable": "请先启用 oMLX paged SSD cache(--paged-ssd-cache-dir)。",
+ "modal.model_settings.dflash_l2_requires_l1": "需要先启用内存缓存。"
}
diff --git a/omlx/admin/routes.py b/omlx/admin/routes.py
index 3c0851a8..276f632e 100644
--- a/omlx/admin/routes.py
+++ b/omlx/admin/routes.py
@@ -135,6 +135,7 @@ class ModelSettingsRequest(BaseModel):
dflash_draft_quant_activation_bits: Optional[int] = None
dflash_draft_quant_group_size: Optional[int] = None
dflash_max_ctx: Optional[int] = None
+ dflash_max_concurrent: Optional[int] = None
dflash_in_memory_cache: Optional[bool] = None
dflash_in_memory_cache_max_entries: Optional[int] = None
dflash_in_memory_cache_max_bytes: Optional[int] = None
@@ -1662,6 +1663,7 @@ async def list_models(is_admin: bool = Depends(require_admin)):
"dflash_draft_quant_activation_bits": settings.dflash_draft_quant_activation_bits,
"dflash_draft_quant_group_size": settings.dflash_draft_quant_group_size,
"dflash_max_ctx": settings.dflash_max_ctx,
+ "dflash_max_concurrent": settings.dflash_max_concurrent,
"dflash_in_memory_cache": settings.dflash_in_memory_cache,
"dflash_in_memory_cache_max_entries": settings.dflash_in_memory_cache_max_entries,
"dflash_in_memory_cache_max_bytes": settings.dflash_in_memory_cache_max_bytes,
@@ -1946,6 +1948,10 @@ async def update_model_settings(
# 0/None means "unlimited" — the engine treats None as no fallback threshold
value = request.dflash_max_ctx
current_settings.dflash_max_ctx = value if value and value > 0 else None
+ if "dflash_max_concurrent" in sent:
+ # 0/None means "unlimited" — the engine treats None as no concurrent cap
+ value = request.dflash_max_concurrent
+ current_settings.dflash_max_concurrent = value if value and value > 0 else None
if "dflash_in_memory_cache" in sent:
current_settings.dflash_in_memory_cache = bool(request.dflash_in_memory_cache)
if "dflash_in_memory_cache_max_entries" in sent:
diff --git a/omlx/admin/static/js/dashboard.js b/omlx/admin/static/js/dashboard.js
index 94f1841f..e53c7331 100644
--- a/omlx/admin/static/js/dashboard.js
+++ b/omlx/admin/static/js/dashboard.js
@@ -1568,6 +1568,7 @@
dflash_draft_quant_activation_bits: settings.dflash_draft_quant_activation_bits || 16,
dflash_draft_quant_group_size: settings.dflash_draft_quant_group_size || 64,
dflash_max_ctx: settings.dflash_max_ctx ?? null,
+ dflash_max_concurrent: settings.dflash_max_concurrent ?? null,
dflash_in_memory_cache: settings.dflash_in_memory_cache !== false,
dflash_in_memory_cache_max_entries: settings.dflash_in_memory_cache_max_entries || 4,
dflash_in_memory_cache_max_gib: settings.dflash_in_memory_cache_max_bytes
@@ -1676,6 +1677,9 @@
dflash_max_ctx: this.modelSettings.dflash_enabled && this.modelSettings.dflash_max_ctx
? parseInt(this.modelSettings.dflash_max_ctx)
: null,
+ dflash_max_concurrent: this.modelSettings.dflash_enabled && this.modelSettings.dflash_max_concurrent
+ ? parseInt(this.modelSettings.dflash_max_concurrent)
+ : null,
dflash_in_memory_cache: this.modelSettings.dflash_enabled
? !!this.modelSettings.dflash_in_memory_cache
: true,
@@ -1771,6 +1775,7 @@
this.modelSettings.dflash_draft_quant_activation_bits = null;
this.modelSettings.dflash_draft_quant_group_size = null;
this.modelSettings.dflash_max_ctx = null;
+ this.modelSettings.dflash_max_concurrent = null;
this.modelSettings.dflash_in_memory_cache = true;
this.modelSettings.dflash_in_memory_cache_max_entries = 4;
this.modelSettings.dflash_in_memory_cache_max_gib = 8;
@@ -2432,9 +2437,14 @@
},
benchGetSpeedup(batchResult) {
+ // Symmetric metric: batch tg_tps is wall-aggregate (total_gen /
+ // wall_time), so baseline must also be wall-aggregate. Using
+ // gen_tps (gen-only, excludes prefill) makes the ratio look
+ // sub-1x for any engine that doesn't parallelize prefill.
const baseline = this.benchSingleResults.find(r => r.pp === 1024);
- if (!baseline || !baseline.gen_tps || baseline.gen_tps <= 0) return 0;
- return batchResult.tg_tps / baseline.gen_tps;
+ const base = baseline && baseline.wall_tg_tps ? baseline.wall_tg_tps : (baseline ? baseline.gen_tps : 0);
+ if (!base || base <= 0) return 0;
+ return batchResult.tg_tps / base;
},
benchFormatMemory(bytes) {
@@ -2468,7 +2478,7 @@
pad(r.ttft_ms.toFixed(1), 10),
pad(r.tpot_ms.toFixed(2), 10),
pad(r.processing_tps.toFixed(1) + ' tok/s', 12),
- pad(r.gen_tps.toFixed(1) + ' tok/s', 12),
+ pad(((r.wall_tg_tps ?? r.gen_tps)).toFixed(1) + ' tok/s', 12),
pad(r.e2e_latency_s.toFixed(3), 10),
pad(r.total_throughput.toFixed(1) + ' tok/s', 12),
pad(this.benchFormatMemory(r.peak_memory_bytes), 10),
@@ -2487,10 +2497,13 @@
lines.push('-'.repeat(80));
const hdr = [rpad('Batch', 8), pad('tg TPS', 12), pad('Speedup', 8), pad('pp TPS', 12), pad('pp TPS/req', 12), pad('TTFT(ms)', 10), pad('E2E(s)', 10)];
lines.push(hdr.join(' '));
+ // 1x baseline uses wall_tg_tps (wall-aggregate) to stay
+ // symmetric with batch tg_tps below. See benchGetSpeedup.
+ const baseTg = baseline && baseline.wall_tg_tps ? baseline.wall_tg_tps : (baseline ? baseline.gen_tps : 0);
if (baseline) {
const row = [
rpad('1x', 8),
- pad(baseline.gen_tps.toFixed(1) + ' tok/s', 12),
+ pad(baseTg.toFixed(1) + ' tok/s', 12),
pad('1.00x', 8),
pad(baseline.processing_tps.toFixed(1) + ' tok/s', 12),
pad(baseline.processing_tps.toFixed(1) + ' tok/s', 12),
@@ -2500,7 +2513,7 @@
lines.push(row.join(' '));
}
for (const r of results) {
- const speedup = baseline && baseline.gen_tps > 0 ? (r.tg_tps / baseline.gen_tps).toFixed(2) + 'x' : '-';
+ const speedup = baseTg > 0 ? (r.tg_tps / baseTg).toFixed(2) + 'x' : '-';
const row = [
rpad(r.batch_size + 'x', 8),
pad(r.tg_tps.toFixed(1) + ' tok/s', 12),
diff --git a/omlx/admin/templates/dashboard/_bench.html b/omlx/admin/templates/dashboard/_bench.html
index e2a31432..49f846a0 100644
--- a/omlx/admin/templates/dashboard/_bench.html
+++ b/omlx/admin/templates/dashboard/_bench.html
@@ -188,7 +188,7 @@
{{ t('bench.headi
|
|
|
- |
+ |
|
|
|
@@ -225,7 +225,7 @@ {{ t('bench.headi
| {{ t('bench.results.batch.baseline') }} |
- |
+ |
1.00x |
|
|
diff --git a/omlx/admin/templates/dashboard/_modal_model_settings.html b/omlx/admin/templates/dashboard/_modal_model_settings.html
index e5c262e6..39e122f1 100644
--- a/omlx/admin/templates/dashboard/_modal_model_settings.html
+++ b/omlx/admin/templates/dashboard/_modal_model_settings.html
@@ -661,10 +661,10 @@ {{
-
+
-
-
{{ t('modal.model_settings.dflash_max_ctx') }}
+
+
{{ t('modal.model_settings.dflash_max_ctx_help') }}
+
+
+
+
-
Prompts at or above this token count switch to BatchedEngine. Leave empty for unlimited.
+
{{ t('modal.model_settings.dflash_max_concurrent_help') }}
-
In-memory cache
-
DFlash L1 prefix snapshot cache in RAM. Speeds up multi-turn chats with shared prefixes.
+
{{ t('modal.model_settings.dflash_l1_cache') }}
+
{{ t('modal.model_settings.dflash_l1_cache_hint') }}
-
+
-
Maximum number of prefix snapshots kept in L1 cache. Each entry stores KV + draft GDN state for one conversation prefix.
+
{{ t('modal.model_settings.dflash_l1_max_entries_help') }}
-
+
-
Byte budget for L1 snapshots; LRU evicts when exceeded.
+
{{ t('modal.model_settings.dflash_l1_max_gib_help') }}
-
SSD cache
-
L2 spill of evicted L1 entries to disk. Uses the oMLX paged SSD cache directory (dflash_l2/).
-
Enable oMLX paged SSD cache first (--paged-ssd-cache-dir).
-
Requires in-memory cache to be enabled.
+
{{ t('modal.model_settings.dflash_l2_cache') }}
+
{{ t('modal.model_settings.dflash_l2_cache_hint') | safe }}
+
{{ t('modal.model_settings.dflash_l2_unavailable') | safe }}
+
{{ t('modal.model_settings.dflash_l2_requires_l1') }}