Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 106 additions & 13 deletions omlx/admin/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,18 @@ def _compute_single_metrics(
ttft_ms = ttft_s * 1000
tpot_ms = (gen_duration / max(completion_tokens - 1, 1)) * 1000
gen_tps = completion_tokens / max(gen_duration, 1e-9)
# wall_tg_tps: gen tokens over total wall (includes prefill). Used as the
# 1x baseline for Continuous Batching speedup so the ratio is symmetric
# with batch tg_tps (which is also total_gen / wall_time).
wall_tg_tps = completion_tokens / max(e2e_duration, 1e-9)
processing_tps = prompt_tokens / max(ttft_s, 1e-9)
total_throughput = (prompt_tokens + completion_tokens) / max(e2e_duration, 1e-9)

return {
"ttft_ms": round(ttft_ms, 1),
"tpot_ms": round(tpot_ms, 2),
"gen_tps": round(gen_tps, 1),
"wall_tg_tps": round(wall_tg_tps, 1),
"processing_tps": round(processing_tps, 1),
"e2e_latency_s": round(e2e_duration, 3),
"total_throughput": round(total_throughput, 1),
Expand Down Expand Up @@ -238,6 +243,76 @@ async def _run_single_test(
)



async def _run_batch_test_via_stream_generate(
    engine: Any,
    prompts: list[str],
    prompt_tokens: int,
    max_tokens: int,
    batch_size: int,
) -> dict:
    """Run a batch benchmark through the engine's high-level stream_generate.

    Batch test path for engines that only expose high-level stream_generate
    (no engine_core). Used for DFlashEngine where concurrent requests
    serialize through _active_request; metrics will reflect serial behavior.

    Args:
        engine: Object whose ``stream_generate(prompt=..., max_tokens=...,
            temperature=...)`` returns an async iterable of outputs. Each
            output is inspected for optional ``completion_tokens`` and
            ``finished`` attributes.
        prompts: Prompt strings; the first ``batch_size`` entries are used.
        prompt_tokens: Number of prompt tokens per request (for pp TPS calc).
        max_tokens: Generation limit forwarded to ``stream_generate``.
        batch_size: Number of requests submitted concurrently.

    Returns:
        Dict with ``pp_tps``, ``tg_tps``, ``avg_ttft_ms``, ``e2e_latency_s``,
        ``total_gen_tokens`` and ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        IndexError: If fewer than ``batch_size`` prompts were supplied.
    """
    # Validate up front so a bad batch fails before any request coroutine is
    # created, instead of surfacing later as a ZeroDivisionError / empty max().
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(prompts) < batch_size:
        raise IndexError(
            f"need {batch_size} prompts for the batch test, got {len(prompts)}"
        )

    async def _single_request(prompt: str) -> dict:
        start = time.perf_counter()
        first_token = None
        completion_tokens = 0
        stream = engine.stream_generate(
            prompt=prompt, max_tokens=max_tokens, temperature=0.0
        )
        try:
            async for output in stream:
                ct = getattr(output, "completion_tokens", None)
                if ct is not None:
                    # First output reporting a positive count marks TTFT.
                    if ct > 0 and first_token is None:
                        first_token = time.perf_counter()
                    completion_tokens = ct
                if getattr(output, "finished", False):
                    break
            # Stop the clock before cleanup so it is not billed to the request.
            end = time.perf_counter()
        finally:
            # Breaking out of ``async for`` leaves an async generator suspended
            # at its last yield; close it explicitly so its cleanup runs now
            # rather than whenever the generator is finalized.
            aclose = getattr(stream, "aclose", None)
            if aclose is not None:
                await aclose()

        if first_token is None:
            # No output ever reported tokens: treat the whole request as TTFT.
            first_token = end
        return {
            "ttft_s": first_token - start,
            "first_token_abs": first_token,
            "completion_tokens": completion_tokens,
            "wall_s": end - start,
        }

    wall_start = time.perf_counter()
    results = await asyncio.gather(
        *(_single_request(prompt) for prompt in prompts[:batch_size])
    )
    wall_end = time.perf_counter()

    total_gen_tokens = sum(r["completion_tokens"] for r in results)
    total_prompt_tokens = prompt_tokens * batch_size
    wall_time = wall_end - wall_start
    avg_ttft_ms = (sum(r["ttft_s"] for r in results) / batch_size) * 1000

    # pp TPS: total prompt tokens / time until ALL requests finish prefill
    max_first_token = max(r["first_token_abs"] for r in results)
    prefill_wall_time = max_first_token - wall_start
    pp_tps = total_prompt_tokens / max(prefill_wall_time, 1e-9)

    # tg TPS: wall-aggregate (total_gen / wall_time). Same formula used in
    # the engine_core batch path so DFlash <-> BatchedEngine ratios are
    # symmetric. The alternative (gen_wall_time = wall_end - max_first_token)
    # is inflated for any engine that serializes prefill or decode, and
    # makes cross-engine speedup columns meaningless. See _run_batch_test.
    tg_tps = total_gen_tokens / max(wall_time, 1e-9)

    return {
        "pp_tps": round(pp_tps, 1),
        "tg_tps": round(tg_tps, 1),
        "avg_ttft_ms": round(avg_ttft_ms, 1),
        "e2e_latency_s": round(wall_time, 3),
        "total_gen_tokens": total_gen_tokens,
        "batch_size": batch_size,
    }


async def _run_batch_test(
engine: Any,
prompts: list[str],
Expand All @@ -247,7 +322,7 @@ async def _run_batch_test(
) -> dict:
"""Run a continuous batching benchmark test.

Submits batch_size concurrent requests via the engine core and measures
Submits batch_size concurrent requests via the engine and measures
aggregate throughput including pp TPS and tg TPS.

Args:
Expand All @@ -256,6 +331,18 @@ async def _run_batch_test(
has a unique UUID prefix.
prompt_tokens: Number of prompt tokens per request (for pp TPS calc).
"""
# Dispatch: engines exposing engine_core (BatchedEngine, VLMBatchedEngine)
# use add_request/stream_outputs; DFlashEngine uses stream_generate (high
# level) and serializes requests via its _active_request lock.
if not hasattr(engine, "_engine"):
return await _run_batch_test_via_stream_generate(
engine=engine,
prompts=prompts,
prompt_tokens=prompt_tokens,
max_tokens=max_tokens,
batch_size=batch_size,
)

from ..request import SamplingParams

engine_core = engine._engine
Expand Down Expand Up @@ -314,10 +401,10 @@ async def _single_request(prompt: str) -> dict:
prefill_wall_time = max_first_token - wall_start
pp_tps = total_prompt_tokens / max(prefill_wall_time, 1e-9)

# tg TPS: total generated tokens / generation wall time
# Generation starts when the last request finishes prefill
gen_wall_time = wall_end - max_first_token
tg_tps = total_gen_tokens / max(gen_wall_time, 1e-9)
# tg TPS: wall-aggregate (total_gen / wall_time), same as the DFlash
# path. Honest cross-engine ratio with the Single Request wall_tg_tps
# baseline — see _run_batch_test_via_stream_generate.
tg_tps = total_gen_tokens / max(wall_time, 1e-9)

return {
"pp_tps": round(pp_tps, 1),
Expand Down Expand Up @@ -502,7 +589,11 @@ async def _upload_to_omlx_ai(run: BenchmarkRun, engine_pool: Any) -> None:
(r for r in single_results if r.get("pp") == 1024), None
)
if pp1024_single and batch_results:
baseline_tps = pp1024_single["gen_tps"]
# Use wall-aggregate baseline so 1x ↔ Nx ratios stay honest across
# engine types. Single Request table keeps showing gen_tps (peak
# decode rate) for context-length comparisons; Continuous Batching
# column needs the symmetric metric.
baseline_tps = pp1024_single["wall_tg_tps"]
batching_results.append({
"batch_size": 1,
"tg_tps": baseline_tps,
Expand Down Expand Up @@ -536,6 +627,9 @@ async def _upload_to_omlx_ai(run: BenchmarkRun, engine_pool: Any) -> None:
"quantization": quantization,
"context_length": context_length,
"pp_tps": result["processing_tps"],
# Community board metric: peak decode rate (gen_tps, gen-only),
# NOT the wall-aggregate used for in-UI Continuous Batching
# speedup. Do not "tidy" this to wall_tg_tps.
"tg_tps": result["gen_tps"],
"ttft_ms": result.get("ttft_ms"),
"peak_memory_gb": peak_mem_gb,
Expand Down Expand Up @@ -760,13 +854,12 @@ async def run_benchmark(run: BenchmarkRun, engine_pool: Any) -> None:
batch_prompts = [_generate_prompt(tokenizer, 1024) for _ in range(max_batch)]

# Skip batch tests for engines without scheduler core (e.g. DFlashEngine)
if request.batch_sizes and not hasattr(engine, "_engine"):
logger.info(
"Batch test skipped: engine does not support concurrent batching"
)
current_test += len(request.batch_sizes)

for batch_size in request.batch_sizes if hasattr(engine, "_engine") else []:
# NOTE: DFlashEngine doesn't expose engine_core (`_engine`) but does
# support concurrent requests by serializing them through its
# _active_request lock. Running batch test on DFlash yields valid
# metrics that show serialization behavior (4x wall time, no
# aggregate gain) — strictly more useful than silently skipping.
for batch_size in request.batch_sizes:
current_test += 1
await _send_event(run, {
"type": "progress",
Expand Down
27 changes: 25 additions & 2 deletions omlx/admin/i18n/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -688,5 +688,28 @@
"js.error.delete_model_failed": "Failed to delete model",
"js.error.delete_model_connection": "Failed to delete model. Check server connection.",
"js.success.download_started": "Download started: {repo_id}",
"js.success.settings_saved": "Settings saved successfully"
}
"js.success.settings_saved": "Settings saved successfully",
"modal.model_settings.dflash_hint": "Block diffusion speculative decoding for 3-4x faster generation. Requires a DFlash draft model checkpoint.<br><strong>Single-stream only: requests run one at a time.</strong><br>* MLX impl by bstnxbt(<a href=\"https://github.com/bstnxbt/dflash-mlx\" target=\"_blank\" rel=\"noopener\" class=\"text-blue-500 hover:text-blue-700 underline\">GitHub</a>)",
"modal.model_settings.dflash_draft_model": "Draft Model",
"modal.model_settings.dflash_draft_model_placeholder": "Select draft model...",
"modal.model_settings.dflash_draft_model_help": "DFlash draft checkpoint (e.g. z-lab/Qwen3-4B-DFlash-b16, z-lab/gemma-4-26B-A4B-it-DFlash)",
"modal.model_settings.dflash_draft_quant": "Draft Quantization",
"modal.model_settings.dflash_draft_quant_help": "Quantization of the draft model only — independent of target model quantization.",
"modal.model_settings.dflash_draft_quant_bf16": "bf16 (default)",
"modal.model_settings.dflash_max_ctx": "Max Context (fallback threshold)",
"modal.model_settings.dflash_max_ctx_placeholder": "unlimited",
"modal.model_settings.dflash_max_ctx_help": "Prompts at or above this token count switch to BatchedEngine. Leave empty for unlimited.",
"modal.model_settings.dflash_max_concurrent": "Max Concurrent",
"modal.model_settings.dflash_max_concurrent_placeholder": "4 (default)",
"modal.model_settings.dflash_max_concurrent_help": "Cap on simultaneous in-flight DFlash requests; excess requests queue at the gate. DFlash decode is single-stream by design — this value does NOT increase throughput. It is a resource admission gate that bounds memory under bursts (each in-flight request holds its own KV cache, hundreds of MB to several GB) and keeps tail latency predictable. Default 4. On tight memory (< 64 GB) set 1–2; on 128 GB or more set 8. Leave empty for unlimited (not recommended unless you trust the upstream load shape).",
"modal.model_settings.dflash_l1_cache": "In-memory cache",
"modal.model_settings.dflash_l1_cache_hint": "DFlash L1 prefix snapshot cache in RAM. Speeds up multi-turn chats with shared prefixes.",
"modal.model_settings.dflash_l1_max_entries": "In-memory cache max entries",
"modal.model_settings.dflash_l1_max_entries_help": "Maximum number of prefix snapshots kept in L1 cache. Each entry stores KV + draft GDN state for one conversation prefix.",
"modal.model_settings.dflash_l1_max_gib": "In-memory cache size (GiB)",
"modal.model_settings.dflash_l1_max_gib_help": "Byte budget for L1 snapshots; LRU evicts when exceeded.",
"modal.model_settings.dflash_l2_cache": "SSD cache",
"modal.model_settings.dflash_l2_cache_hint": "L2 spill of evicted L1 entries to disk. Uses the oMLX paged SSD cache directory (<code>dflash_l2/</code>).",
"modal.model_settings.dflash_l2_unavailable": "Enable oMLX paged SSD cache first (<code>--paged-ssd-cache-dir</code>).",
"modal.model_settings.dflash_l2_requires_l1": "Requires in-memory cache to be enabled."
}
25 changes: 24 additions & 1 deletion omlx/admin/i18n/zh-TW.json
Original file line number Diff line number Diff line change
Expand Up @@ -688,5 +688,28 @@
"js.error.delete_model_failed": "刪除模型失敗",
"js.error.delete_model_connection": "刪除模型失敗,請檢查伺服器連線。",
"js.success.download_started": "已開始下載:{repo_id}",
"js.success.settings_saved": "設定儲存成功"
"js.success.settings_saved": "設定儲存成功",
"modal.model_settings.dflash_hint": "區塊擴散投機解碼,單一請求可加速 3-4 倍。需要 DFlash 草稿模型 checkpoint。<br><strong>僅單流:請求依序處理。</strong><br>* MLX 實作:bstnxbt(<a href=\"https://github.com/bstnxbt/dflash-mlx\" target=\"_blank\" rel=\"noopener\" class=\"text-blue-500 hover:text-blue-700 underline\">GitHub</a>)",
"modal.model_settings.dflash_draft_model": "草稿模型",
"modal.model_settings.dflash_draft_model_placeholder": "選擇草稿模型...",
"modal.model_settings.dflash_draft_model_help": "DFlash 草稿 checkpoint(例如 z-lab/Qwen3-4B-DFlash-b16、z-lab/gemma-4-26B-A4B-it-DFlash)",
"modal.model_settings.dflash_draft_quant": "草稿模型量化",
"modal.model_settings.dflash_draft_quant_help": "僅影響草稿模型量化,與主模型量化獨立。",
"modal.model_settings.dflash_draft_quant_bf16": "bf16(預設)",
"modal.model_settings.dflash_max_ctx": "最大上下文(fallback 閾值)",
"modal.model_settings.dflash_max_ctx_placeholder": "不限",
"modal.model_settings.dflash_max_ctx_help": "提示長度達到或超過此 token 數時切換到 BatchedEngine。留空為不限。",
"modal.model_settings.dflash_max_concurrent": "最大並發",
"modal.model_settings.dflash_max_concurrent_placeholder": "4(預設)",
"modal.model_settings.dflash_max_concurrent_help": "DFlash 同時處理中的請求數上限,超出的請求在入口排隊等空位。DFlash 解碼本身是嚴格單流設計——這個值不會提高吞吐速率,作用是資源 admission 閘門:突發流量下控制記憶體佔用(每個處理中的請求各自持有 KV cache,幾百 MB 到幾 GB),同時讓尾延遲可預測。預設 4。記憶體緊(< 64 GB)建議設 1-2;128 GB+ 可設 8。留空為不限(除非你確信上游流量形態可控,否則不推薦)。",
"modal.model_settings.dflash_l1_cache": "記憶體快取",
"modal.model_settings.dflash_l1_cache_hint": "DFlash L1 前綴快照記憶體快取。加速共享前綴的多輪對話。",
"modal.model_settings.dflash_l1_max_entries": "記憶體快取最大項目數",
"modal.model_settings.dflash_l1_max_entries_help": "L1 快取中保留的最大前綴快照數。每個項目儲存一個對話前綴的 KV + 草稿 GDN 狀態。",
"modal.model_settings.dflash_l1_max_gib": "記憶體快取大小(GiB)",
"modal.model_settings.dflash_l1_max_gib_help": "L1 快照位元組預算;超過時 LRU 淘汰。",
"modal.model_settings.dflash_l2_cache": "SSD 快取",
"modal.model_settings.dflash_l2_cache_hint": "L1 淘汰項目 spill 到磁碟的 L2 快取。使用 oMLX paged SSD cache 目錄(<code>dflash_l2/</code>)。",
"modal.model_settings.dflash_l2_unavailable": "請先啟用 oMLX paged SSD cache(<code>--paged-ssd-cache-dir</code>)。",
"modal.model_settings.dflash_l2_requires_l1": "需要先啟用記憶體快取。"
}
25 changes: 24 additions & 1 deletion omlx/admin/i18n/zh.json
Original file line number Diff line number Diff line change
Expand Up @@ -688,5 +688,28 @@
"js.error.delete_model_failed": "删除模型失败",
"js.error.delete_model_connection": "删除模型失败,请检查服务器连接。",
"js.success.download_started": "已开始下载:{repo_id}",
"js.success.settings_saved": "设置已成功保存"
"js.success.settings_saved": "设置已成功保存",
"modal.model_settings.dflash_hint": "块扩散投机解码,单请求可加速 3-4 倍。需要 DFlash 草稿模型 checkpoint。<br><strong>仅单流:请求按顺序处理。</strong><br>* MLX 实现:bstnxbt(<a href=\"https://github.com/bstnxbt/dflash-mlx\" target=\"_blank\" rel=\"noopener\" class=\"text-blue-500 hover:text-blue-700 underline\">GitHub</a>)",
"modal.model_settings.dflash_draft_model": "草稿模型",
"modal.model_settings.dflash_draft_model_placeholder": "选择草稿模型...",
"modal.model_settings.dflash_draft_model_help": "DFlash 草稿 checkpoint(例如 z-lab/Qwen3-4B-DFlash-b16、z-lab/gemma-4-26B-A4B-it-DFlash)",
"modal.model_settings.dflash_draft_quant": "草稿模型量化",
"modal.model_settings.dflash_draft_quant_help": "仅影响草稿模型量化,与主模型量化独立。",
"modal.model_settings.dflash_draft_quant_bf16": "bf16(默认)",
"modal.model_settings.dflash_max_ctx": "最大上下文(fallback 阈值)",
"modal.model_settings.dflash_max_ctx_placeholder": "不限",
"modal.model_settings.dflash_max_ctx_help": "提示长度达到或超过此 token 数时切换到 BatchedEngine。留空为不限。",
"modal.model_settings.dflash_max_concurrent": "最大并发",
"modal.model_settings.dflash_max_concurrent_placeholder": "4(默认)",
"modal.model_settings.dflash_max_concurrent_help": "DFlash 同时处理中的请求数上限,超出的请求在入口排队等空位。DFlash 解码本身是严格单流设计——这个值不会提高吞吐速率,作用是资源 admission 闸门:突发流量下控制内存占用(每个处理中的请求各自持有 KV cache,几百 MB 到几 GB),同时让尾延迟可预测。默认 4。内存紧(< 64 GB)建议设 1-2;128 GB+ 可设 8。留空为不限(除非你确信上游流量形态可控,否则不推荐)。",
"modal.model_settings.dflash_l1_cache": "内存缓存",
"modal.model_settings.dflash_l1_cache_hint": "DFlash L1 前缀快照内存缓存。加速共享前缀的多轮对话。",
"modal.model_settings.dflash_l1_max_entries": "内存缓存最大条目数",
"modal.model_settings.dflash_l1_max_entries_help": "L1 缓存中保留的最大前缀快照数。每条条目存储一个对话前缀的 KV + 草稿 GDN 状态。",
"modal.model_settings.dflash_l1_max_gib": "内存缓存大小(GiB)",
"modal.model_settings.dflash_l1_max_gib_help": "L1 快照字节预算;超过时 LRU 淘汰。",
"modal.model_settings.dflash_l2_cache": "SSD 缓存",
"modal.model_settings.dflash_l2_cache_hint": "L1 淘汰条目 spill 到磁盘的 L2 缓存。使用 oMLX paged SSD cache 目录(<code>dflash_l2/</code>)。",
"modal.model_settings.dflash_l2_unavailable": "请先启用 oMLX paged SSD cache(<code>--paged-ssd-cache-dir</code>)。",
"modal.model_settings.dflash_l2_requires_l1": "需要先启用内存缓存。"
}
6 changes: 6 additions & 0 deletions omlx/admin/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ class ModelSettingsRequest(BaseModel):
dflash_draft_quant_activation_bits: Optional[int] = None
dflash_draft_quant_group_size: Optional[int] = None
dflash_max_ctx: Optional[int] = None
dflash_max_concurrent: Optional[int] = None
dflash_in_memory_cache: Optional[bool] = None
dflash_in_memory_cache_max_entries: Optional[int] = None
dflash_in_memory_cache_max_bytes: Optional[int] = None
Expand Down Expand Up @@ -1662,6 +1663,7 @@ async def list_models(is_admin: bool = Depends(require_admin)):
"dflash_draft_quant_activation_bits": settings.dflash_draft_quant_activation_bits,
"dflash_draft_quant_group_size": settings.dflash_draft_quant_group_size,
"dflash_max_ctx": settings.dflash_max_ctx,
"dflash_max_concurrent": settings.dflash_max_concurrent,
"dflash_in_memory_cache": settings.dflash_in_memory_cache,
"dflash_in_memory_cache_max_entries": settings.dflash_in_memory_cache_max_entries,
"dflash_in_memory_cache_max_bytes": settings.dflash_in_memory_cache_max_bytes,
Expand Down Expand Up @@ -1946,6 +1948,10 @@ async def update_model_settings(
# 0/None means "unlimited" — the engine treats None as no fallback threshold
value = request.dflash_max_ctx
current_settings.dflash_max_ctx = value if value and value > 0 else None
if "dflash_max_concurrent" in sent:
# 0/None means "unlimited" — the engine treats None as no concurrent cap
value = request.dflash_max_concurrent
current_settings.dflash_max_concurrent = value if value and value > 0 else None
if "dflash_in_memory_cache" in sent:
current_settings.dflash_in_memory_cache = bool(request.dflash_in_memory_cache)
if "dflash_in_memory_cache_max_entries" in sent:
Expand Down
Loading