diff --git a/omlx/admin/benchmark.py b/omlx/admin/benchmark.py index 7f0d620b..3788f703 100644 --- a/omlx/admin/benchmark.py +++ b/omlx/admin/benchmark.py @@ -152,6 +152,10 @@ def _compute_single_metrics( ttft_ms = ttft_s * 1000 tpot_ms = (gen_duration / max(completion_tokens - 1, 1)) * 1000 gen_tps = completion_tokens / max(gen_duration, 1e-9) + # wall_tg_tps: gen tokens over total wall (includes prefill). Used as the + # 1x baseline for Continuous Batching speedup so the ratio is symmetric + # with batch tg_tps (which is also total_gen / wall_time). + wall_tg_tps = completion_tokens / max(e2e_duration, 1e-9) processing_tps = prompt_tokens / max(ttft_s, 1e-9) total_throughput = (prompt_tokens + completion_tokens) / max(e2e_duration, 1e-9) @@ -159,6 +163,7 @@ def _compute_single_metrics( "ttft_ms": round(ttft_ms, 1), "tpot_ms": round(tpot_ms, 2), "gen_tps": round(gen_tps, 1), + "wall_tg_tps": round(wall_tg_tps, 1), "processing_tps": round(processing_tps, 1), "e2e_latency_s": round(e2e_duration, 3), "total_throughput": round(total_throughput, 1), @@ -238,6 +243,76 @@ async def _run_single_test( ) + +async def _run_batch_test_via_stream_generate( + engine: Any, + prompts: list[str], + prompt_tokens: int, + max_tokens: int, + batch_size: int, +) -> dict: + """Batch test path for engines that only expose high-level stream_generate + (no engine_core). Used for DFlashEngine where concurrent requests + serialize through _active_request; metrics will reflect serial behavior. 
+ """ + + async def _single_request(prompt: str) -> dict: + start = time.perf_counter() + first_token = None + completion_tokens = 0 + async for output in engine.stream_generate( + prompt=prompt, max_tokens=max_tokens, temperature=0.0 + ): + ct = getattr(output, "completion_tokens", None) + if ct is not None and ct > 0 and first_token is None: + first_token = time.perf_counter() + if ct is not None: + completion_tokens = ct + if getattr(output, "finished", False): + break + end = time.perf_counter() + if first_token is None: + first_token = end + return { + "ttft_s": first_token - start, + "first_token_abs": first_token, + "completion_tokens": completion_tokens, + "wall_s": end - start, + } + + wall_start = time.perf_counter() + results = await asyncio.gather( + *[_single_request(prompts[i]) for i in range(batch_size)] + ) + wall_end = time.perf_counter() + + total_gen_tokens = sum(r["completion_tokens"] for r in results) + total_prompt_tokens = prompt_tokens * batch_size + wall_time = wall_end - wall_start + avg_ttft_ms = (sum(r["ttft_s"] for r in results) / batch_size) * 1000 + + # pp TPS: total prompt tokens / time until ALL requests finish prefill + max_first_token = max(r["first_token_abs"] for r in results) + prefill_wall_time = max_first_token - wall_start + pp_tps = total_prompt_tokens / max(prefill_wall_time, 1e-9) + + # tg TPS: wall-aggregate (total_gen / wall_time). Same formula used in + # the engine_core batch path so DFlash ↔ BatchedEngine ratios are + # symmetric. The alternative (gen_wall_time = wall_end - max_first_token) + # is inflated for any engine that serializes prefill or decode, and + # makes cross-engine speedup columns meaningless. See _run_batch_test. 
+ tg_tps = total_gen_tokens / max(wall_time, 1e-9) + + return { + "pp_tps": round(pp_tps, 1), + "tg_tps": round(tg_tps, 1), + "avg_ttft_ms": round(avg_ttft_ms, 1), + "e2e_latency_s": round(wall_time, 3), + "total_gen_tokens": total_gen_tokens, + "batch_size": batch_size, + } + + async def _run_batch_test( engine: Any, prompts: list[str], @@ -247,7 +322,7 @@ async def _run_batch_test( ) -> dict: """Run a continuous batching benchmark test. - Submits batch_size concurrent requests via the engine core and measures + Submits batch_size concurrent requests via the engine and measures aggregate throughput including pp TPS and tg TPS. Args: @@ -256,6 +331,18 @@ async def _run_batch_test( has a unique UUID prefix. prompt_tokens: Number of prompt tokens per request (for pp TPS calc). """ + # Dispatch: engines exposing engine_core (BatchedEngine, VLMBatchedEngine) + # use add_request/stream_outputs; DFlashEngine uses stream_generate (high + # level) and serializes requests via its _active_request lock. + if not hasattr(engine, "_engine"): + return await _run_batch_test_via_stream_generate( + engine=engine, + prompts=prompts, + prompt_tokens=prompt_tokens, + max_tokens=max_tokens, + batch_size=batch_size, + ) + from ..request import SamplingParams engine_core = engine._engine @@ -314,10 +401,10 @@ async def _single_request(prompt: str) -> dict: prefill_wall_time = max_first_token - wall_start pp_tps = total_prompt_tokens / max(prefill_wall_time, 1e-9) - # tg TPS: total generated tokens / generation wall time - # Generation starts when the last request finishes prefill - gen_wall_time = wall_end - max_first_token - tg_tps = total_gen_tokens / max(gen_wall_time, 1e-9) + # tg TPS: wall-aggregate (total_gen / wall_time), same as the DFlash + # path. Honest cross-engine ratio with the Single Request wall_tg_tps + # baseline — see _run_batch_test_via_stream_generate. 
+ tg_tps = total_gen_tokens / max(wall_time, 1e-9) return { "pp_tps": round(pp_tps, 1), @@ -502,7 +589,11 @@ async def _upload_to_omlx_ai(run: BenchmarkRun, engine_pool: Any) -> None: (r for r in single_results if r.get("pp") == 1024), None ) if pp1024_single and batch_results: - baseline_tps = pp1024_single["gen_tps"] + # Use wall-aggregate baseline so 1x ↔ Nx ratios stay honest across + # engine types. Single Request table keeps showing gen_tps (peak + # decode rate) for context-length comparisons; Continuous Batching + # column needs the symmetric metric. + baseline_tps = pp1024_single["wall_tg_tps"] batching_results.append({ "batch_size": 1, "tg_tps": baseline_tps, @@ -536,6 +627,9 @@ async def _upload_to_omlx_ai(run: BenchmarkRun, engine_pool: Any) -> None: "quantization": quantization, "context_length": context_length, "pp_tps": result["processing_tps"], + # Community board metric: peak decode rate (gen_tps, gen-only), + # NOT the wall-aggregate used for in-UI Continuous Batching + # speedup. Do not "tidy" this to wall_tg_tps. "tg_tps": result["gen_tps"], "ttft_ms": result.get("ttft_ms"), "peak_memory_gb": peak_mem_gb, @@ -760,13 +854,12 @@ async def run_benchmark(run: BenchmarkRun, engine_pool: Any) -> None: batch_prompts = [_generate_prompt(tokenizer, 1024) for _ in range(max_batch)] # Skip batch tests for engines without scheduler core (e.g. DFlashEngine) - if request.batch_sizes and not hasattr(engine, "_engine"): - logger.info( - "Batch test skipped: engine does not support concurrent batching" - ) - current_test += len(request.batch_sizes) - - for batch_size in request.batch_sizes if hasattr(engine, "_engine") else []: + # NOTE: DFlashEngine doesn't expose engine_core (`_engine`) but does + # support concurrent requests by serializing them through its + # _active_request lock. Running batch test on DFlash yields valid + # metrics that show serialization behavior (4x wall time, no + # aggregate gain) — strictly more useful than silently skipping. 
+ for batch_size in request.batch_sizes: current_test += 1 await _send_event(run, { "type": "progress", diff --git a/omlx/admin/i18n/en.json b/omlx/admin/i18n/en.json index 46c349b3..99f1279d 100644 --- a/omlx/admin/i18n/en.json +++ b/omlx/admin/i18n/en.json @@ -688,5 +688,28 @@ "js.error.delete_model_failed": "Failed to delete model", "js.error.delete_model_connection": "Failed to delete model. Check server connection.", "js.success.download_started": "Download started: {repo_id}", - "js.success.settings_saved": "Settings saved successfully" -} \ No newline at end of file + "js.success.settings_saved": "Settings saved successfully", + "modal.model_settings.dflash_hint": "Block diffusion speculative decoding for 3-4x faster generation. Requires a DFlash draft model checkpoint.
Single-stream only: requests run one at a time.
* MLX impl by bstnxbt(GitHub)", + "modal.model_settings.dflash_draft_model": "Draft Model", + "modal.model_settings.dflash_draft_model_placeholder": "Select draft model...", + "modal.model_settings.dflash_draft_model_help": "DFlash draft checkpoint (e.g. z-lab/Qwen3-4B-DFlash-b16, z-lab/gemma-4-26B-A4B-it-DFlash)", + "modal.model_settings.dflash_draft_quant": "Draft Quantization", + "modal.model_settings.dflash_draft_quant_help": "Quantization of the draft model only — independent of target model quantization.", + "modal.model_settings.dflash_draft_quant_bf16": "bf16 (default)", + "modal.model_settings.dflash_max_ctx": "Max Context (fallback threshold)", + "modal.model_settings.dflash_max_ctx_placeholder": "unlimited", + "modal.model_settings.dflash_max_ctx_help": "Prompts at or above this token count switch to BatchedEngine. Leave empty for unlimited.", + "modal.model_settings.dflash_max_concurrent": "Max Concurrent", + "modal.model_settings.dflash_max_concurrent_placeholder": "4 (default)", + "modal.model_settings.dflash_max_concurrent_help": "Cap on simultaneous in-flight DFlash requests; excess requests queue at the gate. DFlash decode is single-stream by design — this value does NOT increase throughput. It is a resource admission gate that bounds memory under bursts (each in-flight request holds its own KV cache, hundreds of MB to several GB) and keeps tail latency predictable. Default 4. On tight memory (< 64 GB) set 1–2; on 128 GB or more set 8. Leave empty for unlimited (not recommended unless you trust the upstream load shape).", + "modal.model_settings.dflash_l1_cache": "In-memory cache", + "modal.model_settings.dflash_l1_cache_hint": "DFlash L1 prefix snapshot cache in RAM. Speeds up multi-turn chats with shared prefixes.", + "modal.model_settings.dflash_l1_max_entries": "In-memory cache max entries", + "modal.model_settings.dflash_l1_max_entries_help": "Maximum number of prefix snapshots kept in L1 cache. 
Each entry stores KV + draft GDN state for one conversation prefix.", + "modal.model_settings.dflash_l1_max_gib": "In-memory cache size (GiB)", + "modal.model_settings.dflash_l1_max_gib_help": "Byte budget for L1 snapshots; LRU evicts when exceeded.", + "modal.model_settings.dflash_l2_cache": "SSD cache", + "modal.model_settings.dflash_l2_cache_hint": "L2 spill of evicted L1 entries to disk. Uses the oMLX paged SSD cache directory (dflash_l2/).", + "modal.model_settings.dflash_l2_unavailable": "Enable oMLX paged SSD cache first (--paged-ssd-cache-dir).", + "modal.model_settings.dflash_l2_requires_l1": "Requires in-memory cache to be enabled." +} diff --git a/omlx/admin/i18n/zh-TW.json b/omlx/admin/i18n/zh-TW.json index 95181843..d314877d 100644 --- a/omlx/admin/i18n/zh-TW.json +++ b/omlx/admin/i18n/zh-TW.json @@ -688,5 +688,28 @@ "js.error.delete_model_failed": "刪除模型失敗", "js.error.delete_model_connection": "刪除模型失敗,請檢查伺服器連線。", "js.success.download_started": "已開始下載:{repo_id}", - "js.success.settings_saved": "設定儲存成功" + "js.success.settings_saved": "設定儲存成功", + "modal.model_settings.dflash_hint": "塊擴散投機解碼,單請求可加速 3-4 倍。需要 DFlash 草稿模型 checkpoint。
僅單流:請求按順序處理。
* MLX 實作:bstnxbt(GitHub)", + "modal.model_settings.dflash_draft_model": "草稿模型", + "modal.model_settings.dflash_draft_model_placeholder": "選擇草稿模型...", + "modal.model_settings.dflash_draft_model_help": "DFlash 草稿 checkpoint(例如 z-lab/Qwen3-4B-DFlash-b16、z-lab/gemma-4-26B-A4B-it-DFlash)", + "modal.model_settings.dflash_draft_quant": "草稿模型量化", + "modal.model_settings.dflash_draft_quant_help": "僅影響草稿模型量化,與主模型量化獨立。", + "modal.model_settings.dflash_draft_quant_bf16": "bf16(預設)", + "modal.model_settings.dflash_max_ctx": "最大上下文(fallback 閾值)", + "modal.model_settings.dflash_max_ctx_placeholder": "不限", + "modal.model_settings.dflash_max_ctx_help": "提示長度達到或超過此 token 數時切換到 BatchedEngine。留空為不限。", + "modal.model_settings.dflash_max_concurrent": "最大並發", + "modal.model_settings.dflash_max_concurrent_placeholder": "4(預設)", + "modal.model_settings.dflash_max_concurrent_help": "DFlash 同時處理中的請求數上限,超出的請求在入口排隊等空位。DFlash 解碼本身是嚴格單流設計——這個值不會提高吞吐速率,作用是資源 admission 閘門:突發流量下控制記憶體佔用(每個處理中的請求各自持有 KV cache,幾百 MB 到幾 GB),同時讓尾延遲可預測。預設 4。記憶體緊(< 64 GB)建議設 1-2;128 GB+ 可設 8。留空為不限(除非你確信上游流量形態可控,否則不推薦)。", + "modal.model_settings.dflash_l1_cache": "記憶體快取", + "modal.model_settings.dflash_l1_cache_hint": "DFlash L1 前綴快照記憶體快取。加速共享前綴的多輪對話。", + "modal.model_settings.dflash_l1_max_entries": "記憶體快取最大條目數", + "modal.model_settings.dflash_l1_max_entries_help": "L1 快取中保留的最大前綴快照數。每條條目儲存一個對話前綴的 KV + 草稿 GDN 狀態。", + "modal.model_settings.dflash_l1_max_gib": "記憶體快取大小(GiB)", + "modal.model_settings.dflash_l1_max_gib_help": "L1 快照位元組預算;超過時 LRU 淘汰。", + "modal.model_settings.dflash_l2_cache": "SSD 快取", + "modal.model_settings.dflash_l2_cache_hint": "L1 淘汰條目 spill 到磁碟的 L2 快取。使用 oMLX paged SSD cache 目錄(dflash_l2/)。", + "modal.model_settings.dflash_l2_unavailable": "請先啟用 oMLX paged SSD cache(--paged-ssd-cache-dir)。", + "modal.model_settings.dflash_l2_requires_l1": "需要先啟用記憶體快取。" } diff --git a/omlx/admin/i18n/zh.json b/omlx/admin/i18n/zh.json index 135aa192..6224ce6c 100644 --- a/omlx/admin/i18n/zh.json +++ b/omlx/admin/i18n/zh.json @@ 
-688,5 +688,28 @@ "js.error.delete_model_failed": "删除模型失败", "js.error.delete_model_connection": "删除模型失败,请检查服务器连接。", "js.success.download_started": "已开始下载:{repo_id}", - "js.success.settings_saved": "设置已成功保存" + "js.success.settings_saved": "设置已成功保存", + "modal.model_settings.dflash_hint": "块扩散投机解码,单请求可加速 3-4 倍。需要 DFlash 草稿模型 checkpoint。
仅单流:请求按顺序处理。
* MLX 实现:bstnxbt(GitHub)", + "modal.model_settings.dflash_draft_model": "草稿模型", + "modal.model_settings.dflash_draft_model_placeholder": "选择草稿模型...", + "modal.model_settings.dflash_draft_model_help": "DFlash 草稿 checkpoint(例如 z-lab/Qwen3-4B-DFlash-b16、z-lab/gemma-4-26B-A4B-it-DFlash)", + "modal.model_settings.dflash_draft_quant": "草稿模型量化", + "modal.model_settings.dflash_draft_quant_help": "仅影响草稿模型量化,与主模型量化独立。", + "modal.model_settings.dflash_draft_quant_bf16": "bf16(默认)", + "modal.model_settings.dflash_max_ctx": "最大上下文(fallback 阈值)", + "modal.model_settings.dflash_max_ctx_placeholder": "不限", + "modal.model_settings.dflash_max_ctx_help": "提示长度达到或超过此 token 数时切换到 BatchedEngine。留空为不限。", + "modal.model_settings.dflash_max_concurrent": "最大并发", + "modal.model_settings.dflash_max_concurrent_placeholder": "4(默认)", + "modal.model_settings.dflash_max_concurrent_help": "DFlash 同时处理中的请求数上限,超出的请求在入口排队等空位。DFlash 解码本身是严格单流设计——这个值不会提高吞吐速率,作用是资源 admission 闸门:突发流量下控制内存占用(每个处理中的请求各自持有 KV cache,几百 MB 到几 GB),同时让尾延迟可预测。默认 4。内存紧(< 64 GB)建议设 1-2;128 GB+ 可设 8。留空为不限(除非你确信上游流量形态可控,否则不推荐)。", + "modal.model_settings.dflash_l1_cache": "内存缓存", + "modal.model_settings.dflash_l1_cache_hint": "DFlash L1 前缀快照内存缓存。加速共享前缀的多轮对话。", + "modal.model_settings.dflash_l1_max_entries": "内存缓存最大条目数", + "modal.model_settings.dflash_l1_max_entries_help": "L1 缓存中保留的最大前缀快照数。每条条目存储一个对话前缀的 KV + 草稿 GDN 状态。", + "modal.model_settings.dflash_l1_max_gib": "内存缓存大小(GiB)", + "modal.model_settings.dflash_l1_max_gib_help": "L1 快照字节预算;超过时 LRU 淘汰。", + "modal.model_settings.dflash_l2_cache": "SSD 缓存", + "modal.model_settings.dflash_l2_cache_hint": "L1 淘汰条目 spill 到磁盘的 L2 缓存。使用 oMLX paged SSD cache 目录(dflash_l2/)。", + "modal.model_settings.dflash_l2_unavailable": "请先启用 oMLX paged SSD cache(--paged-ssd-cache-dir)。", + "modal.model_settings.dflash_l2_requires_l1": "需要先启用内存缓存。" } diff --git a/omlx/admin/routes.py b/omlx/admin/routes.py index 3c0851a8..276f632e 100644 --- a/omlx/admin/routes.py +++ b/omlx/admin/routes.py @@ -135,6 +135,7 
@@ class ModelSettingsRequest(BaseModel): dflash_draft_quant_activation_bits: Optional[int] = None dflash_draft_quant_group_size: Optional[int] = None dflash_max_ctx: Optional[int] = None + dflash_max_concurrent: Optional[int] = None dflash_in_memory_cache: Optional[bool] = None dflash_in_memory_cache_max_entries: Optional[int] = None dflash_in_memory_cache_max_bytes: Optional[int] = None @@ -1662,6 +1663,7 @@ async def list_models(is_admin: bool = Depends(require_admin)): "dflash_draft_quant_activation_bits": settings.dflash_draft_quant_activation_bits, "dflash_draft_quant_group_size": settings.dflash_draft_quant_group_size, "dflash_max_ctx": settings.dflash_max_ctx, + "dflash_max_concurrent": settings.dflash_max_concurrent, "dflash_in_memory_cache": settings.dflash_in_memory_cache, "dflash_in_memory_cache_max_entries": settings.dflash_in_memory_cache_max_entries, "dflash_in_memory_cache_max_bytes": settings.dflash_in_memory_cache_max_bytes, @@ -1946,6 +1948,10 @@ async def update_model_settings( # 0/None means "unlimited" — the engine treats None as no fallback threshold value = request.dflash_max_ctx current_settings.dflash_max_ctx = value if value and value > 0 else None + if "dflash_max_concurrent" in sent: + # 0/None means "unlimited" — the engine treats None as no concurrent cap + value = request.dflash_max_concurrent + current_settings.dflash_max_concurrent = value if value and value > 0 else None if "dflash_in_memory_cache" in sent: current_settings.dflash_in_memory_cache = bool(request.dflash_in_memory_cache) if "dflash_in_memory_cache_max_entries" in sent: diff --git a/omlx/admin/static/js/dashboard.js b/omlx/admin/static/js/dashboard.js index 94f1841f..e53c7331 100644 --- a/omlx/admin/static/js/dashboard.js +++ b/omlx/admin/static/js/dashboard.js @@ -1568,6 +1568,7 @@ dflash_draft_quant_activation_bits: settings.dflash_draft_quant_activation_bits || 16, dflash_draft_quant_group_size: settings.dflash_draft_quant_group_size || 64, dflash_max_ctx: 
settings.dflash_max_ctx ?? null, + dflash_max_concurrent: settings.dflash_max_concurrent ?? null, dflash_in_memory_cache: settings.dflash_in_memory_cache !== false, dflash_in_memory_cache_max_entries: settings.dflash_in_memory_cache_max_entries || 4, dflash_in_memory_cache_max_gib: settings.dflash_in_memory_cache_max_bytes @@ -1676,6 +1677,9 @@ dflash_max_ctx: this.modelSettings.dflash_enabled && this.modelSettings.dflash_max_ctx ? parseInt(this.modelSettings.dflash_max_ctx) : null, + dflash_max_concurrent: this.modelSettings.dflash_enabled && this.modelSettings.dflash_max_concurrent + ? parseInt(this.modelSettings.dflash_max_concurrent) + : null, dflash_in_memory_cache: this.modelSettings.dflash_enabled ? !!this.modelSettings.dflash_in_memory_cache : true, @@ -1771,6 +1775,7 @@ this.modelSettings.dflash_draft_quant_activation_bits = null; this.modelSettings.dflash_draft_quant_group_size = null; this.modelSettings.dflash_max_ctx = null; + this.modelSettings.dflash_max_concurrent = null; this.modelSettings.dflash_in_memory_cache = true; this.modelSettings.dflash_in_memory_cache_max_entries = 4; this.modelSettings.dflash_in_memory_cache_max_gib = 8; @@ -2432,9 +2437,14 @@ }, benchGetSpeedup(batchResult) { + // Symmetric metric: batch tg_tps is wall-aggregate (total_gen / + // wall_time), so baseline must also be wall-aggregate. Using + // gen_tps (gen-only, excludes prefill) makes the ratio look + // sub-1x for any engine that doesn't parallelize prefill. const baseline = this.benchSingleResults.find(r => r.pp === 1024); - if (!baseline || !baseline.gen_tps || baseline.gen_tps <= 0) return 0; - return batchResult.tg_tps / baseline.gen_tps; + const base = baseline && baseline.wall_tg_tps ? baseline.wall_tg_tps : (baseline ? 
baseline.gen_tps : 0); + if (!base || base <= 0) return 0; + return batchResult.tg_tps / base; }, benchFormatMemory(bytes) { @@ -2468,7 +2478,7 @@ pad(r.ttft_ms.toFixed(1), 10), pad(r.tpot_ms.toFixed(2), 10), pad(r.processing_tps.toFixed(1) + ' tok/s', 12), - pad(r.gen_tps.toFixed(1) + ' tok/s', 12), + pad(((r.wall_tg_tps ?? r.gen_tps)).toFixed(1) + ' tok/s', 12), pad(r.e2e_latency_s.toFixed(3), 10), pad(r.total_throughput.toFixed(1) + ' tok/s', 12), pad(this.benchFormatMemory(r.peak_memory_bytes), 10), @@ -2487,10 +2497,13 @@ lines.push('-'.repeat(80)); const hdr = [rpad('Batch', 8), pad('tg TPS', 12), pad('Speedup', 8), pad('pp TPS', 12), pad('pp TPS/req', 12), pad('TTFT(ms)', 10), pad('E2E(s)', 10)]; lines.push(hdr.join(' ')); + // 1x baseline uses wall_tg_tps (wall-aggregate) to stay + // symmetric with batch tg_tps below. See benchGetSpeedup. + const baseTg = baseline && baseline.wall_tg_tps ? baseline.wall_tg_tps : (baseline ? baseline.gen_tps : 0); if (baseline) { const row = [ rpad('1x', 8), - pad(baseline.gen_tps.toFixed(1) + ' tok/s', 12), + pad(baseTg.toFixed(1) + ' tok/s', 12), pad('1.00x', 8), pad(baseline.processing_tps.toFixed(1) + ' tok/s', 12), pad(baseline.processing_tps.toFixed(1) + ' tok/s', 12), @@ -2500,7 +2513,7 @@ lines.push(row.join(' ')); } for (const r of results) { - const speedup = baseline && baseline.gen_tps > 0 ? (r.tg_tps / baseline.gen_tps).toFixed(2) + 'x' : '-'; + const speedup = baseTg > 0 ? (r.tg_tps / baseTg).toFixed(2) + 'x' : '-'; const row = [ rpad(r.batch_size + 'x', 8), pad(r.tg_tps.toFixed(1) + ' tok/s', 12), diff --git a/omlx/admin/templates/dashboard/_bench.html b/omlx/admin/templates/dashboard/_bench.html index e2a31432..49f846a0 100644 --- a/omlx/admin/templates/dashboard/_bench.html +++ b/omlx/admin/templates/dashboard/_bench.html @@ -188,7 +188,7 @@

{{ t('bench.headi - + @@ -225,7 +225,7 @@

{{ t('bench.headi {{ t('bench.results.batch.baseline') }} - + 1.00x diff --git a/omlx/admin/templates/dashboard/_modal_model_settings.html b/omlx/admin/templates/dashboard/_modal_model_settings.html index e5c262e6..39e122f1 100644 --- a/omlx/admin/templates/dashboard/_modal_model_settings.html +++ b/omlx/admin/templates/dashboard/_modal_model_settings.html @@ -661,10 +661,10 @@

{{
- + {{ t('modal.model_settings.dflash_max_ctx') }} + +

{{ t('modal.model_settings.dflash_max_ctx_help') }}

+
+
+ + -

Prompts at or above this token count switch to BatchedEngine. Leave empty for unlimited.

+

{{ t('modal.model_settings.dflash_max_concurrent_help') }}

- In-memory cache -

DFlash L1 prefix snapshot cache in RAM. Speeds up multi-turn chats with shared prefixes.

+ {{ t('modal.model_settings.dflash_l1_cache') }} +

{{ t('modal.model_settings.dflash_l1_cache_hint') }}

- + -

Maximum number of prefix snapshots kept in L1 cache. Each entry stores KV + draft GDN state for one conversation prefix.

+

{{ t('modal.model_settings.dflash_l1_max_entries_help') }}

- + -

Byte budget for L1 snapshots; LRU evicts when exceeded.

+

{{ t('modal.model_settings.dflash_l1_max_gib_help') }}

- SSD cache -

L2 spill of evicted L1 entries to disk. Uses the oMLX paged SSD cache directory (dflash_l2/).

-

Enable oMLX paged SSD cache first (--paged-ssd-cache-dir).

-

Requires in-memory cache to be enabled.

+ {{ t('modal.model_settings.dflash_l2_cache') }} +

{{ t('modal.model_settings.dflash_l2_cache_hint') | safe }}

+

{{ t('modal.model_settings.dflash_l2_unavailable') | safe }}

+

{{ t('modal.model_settings.dflash_l2_requires_l1') }}