Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions omlx/admin/accuracy_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import asyncio
import logging
import sys
import time
import uuid
from dataclasses import dataclass, field
Expand Down Expand Up @@ -47,6 +48,10 @@ class AccuracyBenchmarkRequest(BaseModel):
benchmarks: dict[str, int] # name -> sample_size (0 = full dataset)
batch_size: int = 1
enable_thinking: bool = False
# Ephemeral ModelSettings overrides applied for the duration of this run
# only. Persisted settings are untouched. None / empty means "use whatever
# is on disk". Unknown keys are dropped with a warning by the manager.
settings_override: Optional[dict[str, Any]] = None

@field_validator("batch_size")
@classmethod
Expand Down Expand Up @@ -283,7 +288,21 @@ async def run_accuracy_benchmark(
engine_pool._suppress_ttl = True
start_time = time.time()

# Apply per-run setting overrides (from the bench-tab settings panel) for
# the duration of this run only. Persisted model_settings.json is
# untouched. Engine-init flags are picked up by the model load below;
# sampling-class overrides flow through get_settings() into sampling_kwargs.
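# Entered inside `try:` so any exception raised by __enter__ is caught by
# the existing handlers. Note that override_ctx is assigned before
# __enter__() runs, so the finally block still calls __exit__ (guarded by
# its own try/except) even when entering the context failed.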
sm = getattr(engine_pool, "_settings_manager", None)
override_ctx = None

try:
if sm is not None and request.settings_override:
override_ctx = sm.ephemeral_overrides(
request.model_id, request.settings_override
)
override_ctx.__enter__()
# Phase 1: Unload all models
loaded_ids = engine_pool.get_loaded_model_ids()
if loaded_ids:
Expand Down Expand Up @@ -499,3 +518,13 @@ async def on_progress(current: int, total: int) -> None:
finally:
# Re-enable TTL auto-unload
engine_pool._suppress_ttl = False
if override_ctx is not None:
try:
# Pass through the live exception info (if any) so the context
# manager sees the same triple Python's `with` would supply.
override_ctx.__exit__(*sys.exc_info())
except Exception as e:
logger.warning(
f"Accuracy benchmark: failed to release ephemeral "
f"overrides for {request.model_id}: {e}"
)
37 changes: 36 additions & 1 deletion omlx/admin/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import json
import logging
import re
import sys
import time
import uuid
from dataclasses import dataclass, field
Expand Down Expand Up @@ -43,6 +44,11 @@ class BenchmarkRequest(BaseModel):
prompt_lengths: list[int]
generation_length: int = 128
batch_sizes: list[int] = []
# Ephemeral ModelSettings overrides applied for the duration of this run
# only. Persisted settings are untouched. None / empty means "use whatever
# is on disk". Unknown keys are dropped with a warning by the manager.
settings_override: Optional[dict[str, Any]] = None

@field_validator("prompt_lengths")
@classmethod
def validate_prompt_lengths(cls, v: list[int]) -> list[int]:
Expand Down Expand Up @@ -644,11 +650,28 @@ async def run_benchmark(run: BenchmarkRun, engine_pool: Any) -> None:
current_test = 0
overall_start = time.perf_counter()

# Apply per-run setting overrides (from the bench-tab settings panel) for
# the duration of this run only. Persisted model_settings.json is untouched.
# Engine-init flags (TurboQuant/DFlash/MTP/...) are picked up because
# Phase 2 reloads the model. We enter the context manually inside `try:`
# (not via `with`) so the existing try/except body below stays unchanged;
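# the matching `finally` at the bottom releases it. Entering inside the
# try means an exception during __enter__ is caught by the existing
# handlers — no leaked override token. Note that override_ctx is assigned
# before __enter__() runs, so the `finally` still calls __exit__ (inside
# its own try/except) even when entering the context failed.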
sm = getattr(engine_pool, "_settings_manager", None)
override_ctx = None

try:
if sm is not None and request.settings_override:
override_ctx = sm.ephemeral_overrides(
request.model_id, request.settings_override
)
override_ctx.__enter__()
# Snapshot experimental flags at run start. Settings can change mid-run
# (user toggling DFlash/SpecPrefill/TurboQuant), and the produced
# numbers are tied to whatever was active when generation actually ran.
sm = getattr(engine_pool, "_settings_manager", None)
# With an override active this reflects the merged view, so
# override-induced experimental flags also block omlx.ai upload.
if sm is not None:
try:
s = sm.get_settings(request.model_id)
Expand Down Expand Up @@ -860,3 +883,15 @@ async def run_benchmark(run: BenchmarkRun, engine_pool: Any) -> None:
await engine_pool._unload_engine(request.model_id)
except Exception:
pass

finally:
if override_ctx is not None:
try:
# Pass through the live exception info (if any) so the context
# manager sees the same triple Python's `with` would supply.
override_ctx.__exit__(*sys.exc_info())
except Exception as e:
logger.warning(
f"Benchmark: failed to release ephemeral overrides for "
f"{request.model_id}: {e}"
)
Loading