diff --git a/omlx/admin/accuracy_benchmark.py b/omlx/admin/accuracy_benchmark.py index 74eb7fcd2..5fd0b1ef3 100644 --- a/omlx/admin/accuracy_benchmark.py +++ b/omlx/admin/accuracy_benchmark.py @@ -10,6 +10,7 @@ import asyncio import logging +import sys import time import uuid from dataclasses import dataclass, field @@ -47,6 +48,10 @@ class AccuracyBenchmarkRequest(BaseModel): benchmarks: dict[str, int] # name -> sample_size (0 = full dataset) batch_size: int = 1 enable_thinking: bool = False + # Ephemeral ModelSettings overrides applied for the duration of this run + # only. Persisted settings are untouched. None / empty means "use whatever + # is on disk". Unknown keys are dropped with a warning by the manager. + settings_override: Optional[dict[str, Any]] = None @field_validator("batch_size") @classmethod @@ -283,7 +288,21 @@ async def run_accuracy_benchmark( engine_pool._suppress_ttl = True start_time = time.time() + # Apply per-run setting overrides (from the bench-tab settings panel) for + # the duration of this run only. Persisted model_settings.json is + # untouched. Engine-init flags are picked up by the model load below; + # sampling-class overrides flow through get_settings() into sampling_kwargs. + # Entered inside `try:` so any exception during __enter__ is caught by + # the existing handlers and the finally block releases cleanly. 
+ sm = getattr(engine_pool, "_settings_manager", None) + override_ctx = None + try: + if sm is not None and request.settings_override: + override_ctx = sm.ephemeral_overrides( + request.model_id, request.settings_override + ) + override_ctx.__enter__() # Phase 1: Unload all models loaded_ids = engine_pool.get_loaded_model_ids() if loaded_ids: @@ -499,3 +518,13 @@ async def on_progress(current: int, total: int) -> None: finally: # Re-enable TTL auto-unload engine_pool._suppress_ttl = False + if override_ctx is not None: + try: + # Pass through the live exception info (if any) so the context + # manager sees the same triple Python's `with` would supply. + override_ctx.__exit__(*sys.exc_info()) + except Exception as e: + logger.warning( + f"Accuracy benchmark: failed to release ephemeral " + f"overrides for {request.model_id}: {e}" + ) diff --git a/omlx/admin/benchmark.py b/omlx/admin/benchmark.py index 7f0d620b1..1c7b2afd1 100644 --- a/omlx/admin/benchmark.py +++ b/omlx/admin/benchmark.py @@ -9,6 +9,7 @@ import json import logging import re +import sys import time import uuid from dataclasses import dataclass, field @@ -43,6 +44,11 @@ class BenchmarkRequest(BaseModel): prompt_lengths: list[int] generation_length: int = 128 batch_sizes: list[int] = [] + # Ephemeral ModelSettings overrides applied for the duration of this run + # only. Persisted settings are untouched. None / empty means "use whatever + # is on disk". Unknown keys are dropped with a warning by the manager. + settings_override: Optional[dict[str, Any]] = None + @field_validator("prompt_lengths") @classmethod def validate_prompt_lengths(cls, v: list[int]) -> list[int]: @@ -644,11 +650,28 @@ async def run_benchmark(run: BenchmarkRun, engine_pool: Any) -> None: current_test = 0 overall_start = time.perf_counter() + # Apply per-run setting overrides (from the bench-tab settings panel) for + # the duration of this run only. Persisted model_settings.json is untouched. 
+ # Engine-init flags (TurboQuant/DFlash/MTP/...) are picked up because + # Phase 2 reloads the model. We enter the context manually inside `try:` + # (not via `with`) so the existing try/except body below stays unchanged; + # the matching `finally` at the bottom releases it. Entering inside the + # try means an exception during __enter__ is caught by the existing + # handlers — no leaked override token. + sm = getattr(engine_pool, "_settings_manager", None) + override_ctx = None + try: + if sm is not None and request.settings_override: + override_ctx = sm.ephemeral_overrides( + request.model_id, request.settings_override + ) + override_ctx.__enter__() # Snapshot experimental flags at run start. Settings can change mid-run # (user toggling DFlash/SpecPrefill/TurboQuant), and the produced # numbers are tied to whatever was active when generation actually ran. - sm = getattr(engine_pool, "_settings_manager", None) + # With an override active this reflects the merged view, so + # override-induced experimental flags also block omlx.ai upload. if sm is not None: try: s = sm.get_settings(request.model_id) @@ -860,3 +883,15 @@ async def run_benchmark(run: BenchmarkRun, engine_pool: Any) -> None: await engine_pool._unload_engine(request.model_id) except Exception: pass + + finally: + if override_ctx is not None: + try: + # Pass through the live exception info (if any) so the context + # manager sees the same triple Python's `with` would supply. 
+ override_ctx.__exit__(*sys.exc_info()) + except Exception as e: + logger.warning( + f"Benchmark: failed to release ephemeral overrides for " + f"{request.model_id}: {e}" + ) diff --git a/omlx/admin/static/js/dashboard.js b/omlx/admin/static/js/dashboard.js index c4831fb21..fc5895054 100644 --- a/omlx/admin/static/js/dashboard.js +++ b/omlx/admin/static/js/dashboard.js @@ -392,6 +392,21 @@ benchTab: 'throughput', benchDropdown: false, + // ---- Bench-tab inline Run-time Settings panel ---- + // Mirrors modelSettings shape; ephemeral by default, persists only + // when the user clicks Save (or Save as Profile). + benchSettingsOpen: false, + benchSettings: {}, // hydrated from selected model's persisted settings + benchSettingsBaseline: {}, // last-hydrated snapshot for dirty detection / reset + benchSettingsSaving: false, + benchSettingsRecentlySaved: false, // ~1.5s post-save flag → checkmark in Save button + benchSettingsSaveProfileOpen: false, + benchSettingsNewProfile: { display_name: '', description: '' }, + benchSettingsStatus: '', // transient status line (profile created / errors) + benchProfiles: [], // per-model profiles for benchModelId (loaded on hydrate) + benchProfileScope: 'model', // 'preset' | 'global' | 'model' + benchActiveProfileName: null, + // Accuracy benchmark state accModelId: '', accBenchmarks: { mmlu: true, mmlu_pro: false, kmmlu: false, cmmlu: false, jmmlu: false, hellaswag: false, truthfulqa: true, arc_challenge: false, winogrande: false, gsm8k: false, mathqa: false, humaneval: true, mbpp: false, livecodebench: false, bbq: false, safetybench: false }, @@ -452,6 +467,19 @@ accShowText: false, accCopied: false, + // ---- Accuracy-tab inline Run-time Settings panel ---- + accSettingsOpen: false, + accSettings: {}, + accSettingsBaseline: {}, + accSettingsSaving: false, + accSettingsRecentlySaved: false, + accSettingsSaveProfileOpen: false, + accSettingsNewProfile: { display_name: '', description: '' }, + accSettingsStatus: '', + 
accProfiles: [], + accProfileScope: 'model', + accActiveProfileName: null, + async init() { // Apply theme this.applyTheme(); @@ -475,6 +503,15 @@ this.handleMainTabChange(value); }); + // Bench-tab inline settings panels: re-hydrate from server + // baseline whenever the selected model changes. Initial hydrate + // runs once here so the panel state is well-formed even before + // a model is picked. + this._panelHydrate('bench'); + this._panelHydrate('acc'); + this.$watch('benchModelId', () => this._panelHydrate('bench')); + this.$watch('accModelId', () => this._panelHydrate('acc')); + this.$watch('hfMlxOnly', () => { this.hfRecommended = { trending: [], popular: [] }; this.hfRecommendedLoaded = false; @@ -1202,43 +1239,84 @@ }, _resetPresetApplicableFields() { - // Reset all fields a preset can touch so switching presets does not leave - // stale values. Intentionally does NOT touch model_alias / model_type_override - // / is_pinned / is_default / turboquant_* / dflash_* / specprefill_* / index_cache_*. - const ms = this.modelSettings; - ms.temperature = null; - ms.top_p = null; - ms.top_k = null; - ms.min_p = null; - ms.repetition_penalty = null; - ms.presence_penalty = null; - ms.force_sampling = false; - ms.max_context_window = null; - ms.max_tokens = null; - ms.reasoning_parser = null; - ms.ttl_seconds = null; - ms.enable_thinking = null; - ms.enableThinkingBudget = false; - ms.thinking_budget_tokens = null; - ms.enableToolResultLimit = false; - ms.max_tool_result_tokens = null; - ms.ctKwargEntries = []; + this._resetStatePresetFields(this.modelSettings); + }, + + // Pure version of _resetPresetApplicableFields — operates on any + // state object (modal's modelSettings, bench's benchSettings, + // accuracy's accSettings). Resets all fields a preset can touch + // so switching presets does not leave stale values. 
Intentionally + // does NOT touch model_alias / model_type_override / is_pinned / + // is_default / turboquant_* / dflash_* / specprefill_* / index_cache_*. + _resetStatePresetFields(state) { + state.temperature = null; + state.top_p = null; + state.top_k = null; + state.min_p = null; + state.repetition_penalty = null; + state.presence_penalty = null; + state.force_sampling = false; + state.max_context_window = null; + state.max_tokens = null; + state.reasoning_parser = null; + state.ttl_seconds = null; + state.enable_thinking = null; + state.enableThinkingBudget = false; + state.thinking_budget_tokens = null; + state.enableToolResultLimit = false; + state.max_tool_result_tokens = null; + state.ctKwargEntries = []; + }, + + // Pure profile/preset merge helpers — write a settings dict into + // a target state object (modal or panel) without touching + // selectedModel/activeProfileName or calling the backend. + // Modal- and panel-side wrappers add their own side effects + // (POSTs, status updates) around these. 
+ _mergeProfileSettingsIntoState(state, settings) { + const fields = this.profileFields.universal.concat( + this.profileFields.model_specific, + ); + for (const k of fields) { + if (!(k in settings)) continue; + if (k === 'thinking_budget_enabled') { + state.enableThinkingBudget = !!settings[k]; + } else if (k === 'index_cache_freq') { + state.enableIndexCache = !!settings[k]; + state.index_cache_freq = settings[k] || null; + } else if (k === 'max_tool_result_tokens') { + state.enableToolResultLimit = !!settings[k]; + state.max_tool_result_tokens = settings[k] || null; + } else if (k === 'chat_template_kwargs' || k === 'forced_ct_kwargs') { + const ctk = settings.chat_template_kwargs || {}; + const forced = new Set(settings.forced_ct_kwargs || []); + const entries = []; + for (const [key, value] of Object.entries(ctk)) { + if (key === 'enable_thinking') { + entries.push({type:'enable_thinking', value:String(value), force:forced.has('enable_thinking')}); + } else if (key === 'reasoning_effort') { + entries.push({type:'reasoning_effort', value:String(value), force:forced.has('reasoning_effort')}); + } else { + entries.push({type:'custom', key, value:String(value), force:forced.has(key)}); + } + } + state.ctKwargEntries = entries; + } else { + state[k] = settings[k]; + } + } }, - applyPresetToForm(preset) { - // Reset first so previous preset's fields (e.g. presence_penalty) do not stick. - this._resetPresetApplicableFields(); - const s = preset.settings || {}; - const ms = this.modelSettings; - for (const k of Object.keys(s)) { + _mergePresetSettingsIntoState(state, settings) { + for (const k of Object.keys(settings)) { if (k === 'thinking_budget_enabled') { - ms.enableThinkingBudget = !!s[k]; + state.enableThinkingBudget = !!settings[k]; } else if (k === 'max_tool_result_tokens') { - ms.enableToolResultLimit = s[k] != null; - ms.max_tool_result_tokens = s[k] ?? null; + state.enableToolResultLimit = settings[k] != null; + state.max_tool_result_tokens = settings[k] ?? 
null; } else if (k === 'chat_template_kwargs' || k === 'forced_ct_kwargs') { - const ctk = s.chat_template_kwargs || {}; - const forced = new Set(s.forced_ct_kwargs || []); + const ctk = settings.chat_template_kwargs || {}; + const forced = new Set(settings.forced_ct_kwargs || []); const entries = []; for (const [key, value] of Object.entries(ctk)) { if (key === 'enable_thinking') { @@ -1249,11 +1327,17 @@ entries.push({type:'custom', key, value:String(value), force:forced.has(key)}); } } - ms.ctKwargEntries = entries; + state.ctKwargEntries = entries; } else { - ms[k] = s[k]; + state[k] = settings[k]; } } + }, + + applyPresetToForm(preset) { + // Reset first so previous preset's fields (e.g. presence_penalty) do not stick. + this._resetStatePresetFields(this.modelSettings); + this._mergePresetSettingsIntoState(this.modelSettings, preset.settings || {}); this.activeProfileName = null; this.profilesDrift = false; }, @@ -1304,37 +1388,7 @@ }, async applyProfileToForm(profile) { // Merge all profile fields into the form (no server call — user clicks Save to persist). 
- const s = profile.settings || {}; - const ms = this.modelSettings; - for (const k of this.profileFields.universal.concat(this.profileFields.model_specific)) { - if (!(k in s)) continue; - if (k === 'thinking_budget_enabled') { - ms.enableThinkingBudget = !!s[k]; - } else if (k === 'index_cache_freq') { - ms.enableIndexCache = !!s[k]; - ms.index_cache_freq = s[k] || null; - } else if (k === 'max_tool_result_tokens') { - ms.enableToolResultLimit = !!s[k]; - ms.max_tool_result_tokens = s[k] || null; - } else if (k === 'chat_template_kwargs' || k === 'forced_ct_kwargs') { - // Rebuild ctKwargEntries - const ctk = s.chat_template_kwargs || {}; - const forced = new Set(s.forced_ct_kwargs || []); - const entries = []; - for (const [key, value] of Object.entries(ctk)) { - if (key === 'enable_thinking') { - entries.push({type:'enable_thinking', value:String(value), force:forced.has('enable_thinking')}); - } else if (key === 'reasoning_effort') { - entries.push({type:'reasoning_effort', value:String(value), force:forced.has('reasoning_effort')}); - } else { - entries.push({type:'custom', key, value:String(value), force:forced.has(key)}); - } - } - ms.ctKwargEntries = entries; - } else { - ms[k] = s[k]; - } - } + this._mergeProfileSettingsIntoState(this.modelSettings, profile.settings || {}); // Persist active_profile_name to backend before updating UI state const seq = ++this._applySeq; try { @@ -2481,6 +2535,12 @@ prompt_lengths: promptLengths, generation_length: 128, batch_sizes: batchSizes, + // Ship inline panel edits as a per-run override. + // null when nothing's been touched, so the run uses + // whatever's persisted in model_settings.json. + settings_override: this.benchSettingsDirty() + ? this._settingsStateToPayload(this.benchSettings) + : null, }), }); @@ -2787,6 +2847,12 @@ ), batch_size: this.accBatchSize, enable_thinking: this.accEnableThinking, + // Ship inline panel edits as a per-run override. 
+ // null when nothing's been touched, so the run uses + // whatever's persisted in model_settings.json. + settings_override: this.accSettingsDirty() + ? this._settingsStateToPayload(this.accSettings) + : null, }), }); if (!resp.ok) { @@ -4779,5 +4845,469 @@ this.msModelDetailLoading = false; } }, + + // ================================================================= + // Bench-tab inline Run-time Settings panel. + // + // Mirrors modelSettings shape and serialization, but binds to + // benchSettings / accSettings so edits don't pollute the modal's + // form. Edits are ephemeral by default — they ship as a + // settings_override on the bench request and revert on the next + // model selection. Save / Save-as-Profile reuse the existing + // /api/models/{id}/settings and /api/models/{id}/profiles + // endpoints used by the modal. + // ================================================================= + + // Empty/default panel state. Keep aligned with openModelSettings's + // assignment to this.modelSettings (dashboard.js:~1525). 
+        // Returned as-is by _modelToSettingsState when no model is selected.
+        // The key set below matches the object _modelToSettingsState builds
+        // for a real model; _panelDirty compares JSON.stringify output, so
+        // keep both lists in step when adding a field.
+        _emptySettingsState() {
+            return {
+                model_alias: '',
+                model_type_override: '',
+                max_context_window: null,
+                max_tokens: null,
+                temperature: null,
+                top_p: null,
+                top_k: null,
+                repetition_penalty: null,
+                min_p: null,
+                presence_penalty: null,
+                force_sampling: false,
+                enable_thinking: null,
+                // Read from the model entry (model.thinking_default), not from
+                // its settings; not emitted by _settingsStateToPayload.
+                thinking_default: null,
+                // camelCase enable* keys are panel toggles; the payload builder
+                // turns them into thinking_budget_enabled / a 0 value.
+                enableThinkingBudget: false,
+                thinking_budget_tokens: null,
+                enableToolResultLimit: false,
+                max_tool_result_tokens: null,
+                reasoning_parser: '',
+                ttl_seconds: null,
+                enableIndexCache: false,
+                index_cache_freq: null,
+                turboquant_kv_enabled: false,
+                turboquant_kv_bits: 4,
+                specprefill_enabled: false,
+                specprefill_draft_model: '',
+                // Held as a string; parseFloat'd in _settingsStateToPayload.
+                specprefill_keep_pct: '0.2',
+                specprefill_threshold: null,
+                dflash_enabled: false,
+                dflash_draft_model: '',
+                // Held as a string; parseInt'd in _settingsStateToPayload.
+                dflash_draft_quant_bits: '',
+                dflash_max_ctx: null,
+                dflash_in_memory_cache: true,
+                dflash_in_memory_cache_max_entries: 4,
+                // GiB in the panel; sent as dflash_in_memory_cache_max_bytes.
+                dflash_in_memory_cache_max_gib: 8,
+                dflash_ssd_cache: false,
+                // *_compatible / *_reason / *_available come from the model
+                // entry and are not sent back in the payload
+                // (dflash_ssd_cache_available only gates dflash_ssd_cache).
+                dflash_compatible: true,
+                dflash_compatibility_reason: '',
+                dflash_ssd_cache_available: false,
+                mtp_enabled: false,
+                mtp_compatible: false,
+                mtp_compatibility_reason: '',
+                vlm_mtp_enabled: false,
+                vlm_mtp_draft_model: '',
+                vlm_mtp_draft_block_size: null,
+                // Rows of {type, key?, value, force}; flattened into
+                // chat_template_kwargs / forced_ct_kwargs on the wire.
+                ctKwargEntries: [],
+                trust_remote_code: false,
+            };
+        },
+
+        // Build a settings state from a model entry. Mirrors the
+        // assignment in openModelSettings; kept separate to avoid
+        // disturbing the modal's existing flow.
+        // `model` is an entry from this.models (or null). Reads model.settings
+        // plus a few model-level fields (thinking_default, *_compatible, ...).
+        // Returns a fresh panel-state object; falls back to
+        // _emptySettingsState() when no model is given.
+        _modelToSettingsState(model) {
+            if (!model) return this._emptySettingsState();
+            const settings = model.settings || {};
+            // Expand chat_template_kwargs + forced_ct_kwargs into editable rows.
+            // Values are stringified for the form; _settingsStateToPayload
+            // converts them back.
+            const ctk = settings.chat_template_kwargs || {};
+            const forcedKeys = new Set(settings.forced_ct_kwargs || []);
+            const ctKwargEntries = [];
+            for (const [key, value] of Object.entries(ctk)) {
+                if (key === 'enable_thinking') {
+                    ctKwargEntries.push({type: 'enable_thinking', value: String(value), force: forcedKeys.has('enable_thinking')});
+                } else if (key === 'reasoning_effort') {
+                    ctKwargEntries.push({type: 'reasoning_effort', value: String(value), force: forcedKeys.has('reasoning_effort')});
+                } else {
+                    ctKwargEntries.push({type: 'custom', key, value: String(value), force: forcedKeys.has(key)});
+                }
+            }
+            // OCR_CONFIG_MODEL_TYPES is defined outside this hunk; models whose
+            // config_model_type is in it get temperature pinned to 0.0 below.
+            const isOcr = OCR_CONFIG_MODEL_TYPES.has(model.config_model_type || '');
+            // Note on operators: `??` keeps an explicit 0/false from the
+            // stored settings, while `||` collapses 0/''/false to the default.
+            return {
+                model_alias: settings.model_alias || '',
+                model_type_override: settings.model_type_override || '',
+                max_context_window: settings.max_context_window || null,
+                max_tokens: settings.max_tokens || null,
+                temperature: isOcr ? 0.0 : (settings.temperature ?? null),
+                top_p: settings.top_p ?? null,
+                top_k: settings.top_k ?? null,
+                repetition_penalty: settings.repetition_penalty ?? null,
+                min_p: settings.min_p ?? null,
+                presence_penalty: settings.presence_penalty ?? null,
+                force_sampling: settings.force_sampling || false,
+                enable_thinking: settings.enable_thinking ?? null,
+                thinking_default: model.thinking_default ?? null,
+                // Toggles are derived from the stored value being truthy.
+                enableThinkingBudget: !!(settings.thinking_budget_tokens),
+                thinking_budget_tokens: settings.thinking_budget_tokens || null,
+                enableToolResultLimit: !!(settings.max_tool_result_tokens),
+                max_tool_result_tokens: settings.max_tool_result_tokens || null,
+                reasoning_parser: settings.reasoning_parser || '',
+                ttl_seconds: settings.ttl_seconds ?? null,
+                enableIndexCache: !!(settings.index_cache_freq),
+                index_cache_freq: settings.index_cache_freq || null,
+                turboquant_kv_enabled: settings.turboquant_kv_enabled || false,
+                turboquant_kv_bits: settings.turboquant_kv_bits || 4,
+                specprefill_enabled: settings.specprefill_enabled || false,
+                specprefill_draft_model: settings.specprefill_draft_model || '',
+                specprefill_keep_pct: settings.specprefill_keep_pct ? String(settings.specprefill_keep_pct) : '0.2',
+                specprefill_threshold: settings.specprefill_threshold || null,
+                dflash_enabled: settings.dflash_enabled || false,
+                dflash_draft_model: settings.dflash_draft_model || '',
+                dflash_draft_quant_bits: settings.dflash_draft_quant_bits ? String(settings.dflash_draft_quant_bits) : '',
+                dflash_max_ctx: settings.dflash_max_ctx ?? null,
+                // True unless the stored value is explicitly false.
+                dflash_in_memory_cache: settings.dflash_in_memory_cache !== false,
+                dflash_in_memory_cache_max_entries: settings.dflash_in_memory_cache_max_entries || 4,
+                // Stored in bytes; shown in whole GiB (rounded), default 8.
+                dflash_in_memory_cache_max_gib: settings.dflash_in_memory_cache_max_bytes
+                    ? Math.round(settings.dflash_in_memory_cache_max_bytes / (1024 ** 3))
+                    : 8,
+                dflash_ssd_cache: settings.dflash_ssd_cache || false,
+                // True unless the model entry explicitly says false.
+                dflash_compatible: model.dflash_compatible !== false,
+                dflash_compatibility_reason: model.dflash_compatibility_reason || '',
+                dflash_ssd_cache_available: !!model.dflash_ssd_cache_available,
+                mtp_enabled: settings.mtp_enabled || false,
+                // False unless the model entry explicitly says true.
+                mtp_compatible: model.mtp_compatible === true,
+                mtp_compatibility_reason: model.mtp_compatibility_reason || '',
+                vlm_mtp_enabled: settings.vlm_mtp_enabled || false,
+                vlm_mtp_draft_model: settings.vlm_mtp_draft_model || '',
+                vlm_mtp_draft_block_size: settings.vlm_mtp_draft_block_size ?? null,
+                ctKwargEntries,
+                trust_remote_code: settings.trust_remote_code || false,
+            };
+        },
+
+        // Convert a settings state object back to the wire payload used
+        // by both PUT /api/models/{id}/settings (full save) and the bench
+        // request body's settings_override.
+        // Mirrors saveModelSettings's IIFE (dashboard.js:~1582). Kept
+        // separate so changes to the modal don't quietly affect bench
+        // overrides.
+        //
+        // `s` is a panel-state object (see _modelToSettingsState). Returns a
+        // plain object; panel-only keys (enable* toggles, ctKwargEntries,
+        // *_gib) are translated to their wire names and the model-level
+        // compatibility fields are dropped.
+        _settingsStateToPayload(s) {
+            // Flatten the editable kwarg rows back into a dict + forced list.
+            const chatTemplateKwargs = {};
+            const forcedCtKwargs = [];
+            for (const entry of (s.ctKwargEntries || [])) {
+                if (entry.type === 'enable_thinking') {
+                    chatTemplateKwargs.enable_thinking = entry.value === 'true';
+                    if (entry.force) forcedCtKwargs.push('enable_thinking');
+                } else if (entry.type === 'reasoning_effort') {
+                    chatTemplateKwargs.reasoning_effort = entry.value;
+                    if (entry.force) forcedCtKwargs.push('reasoning_effort');
+                } else if (entry.type === 'custom' && entry.key && entry.key.trim()) {
+                    // Custom values arrive as strings: coerce 'true'/'false'
+                    // and numeric-looking text, leave everything else as-is.
+                    let val = entry.value;
+                    if (val === 'true') val = true;
+                    else if (val === 'false') val = false;
+                    else if (typeof val === 'string' && val.trim() !== '' && !isNaN(Number(val))) val = Number(val);
+                    const key = entry.key.trim();
+                    chatTemplateKwargs[key] = val;
+                    if (entry.force) forcedCtKwargs.push(key);
+                }
+            }
+            return {
+                model_alias: s.model_alias?.trim() || null,
+                model_type_override: s.model_type_override || null,
+                max_context_window: s.max_context_window || null,
+                max_tokens: s.max_tokens || null,
+                // Number.isFinite keeps an explicit 0 and rejects null/''/NaN.
+                temperature: Number.isFinite(s.temperature) ? s.temperature : null,
+                top_p: Number.isFinite(s.top_p) ? s.top_p : null,
+                top_k: Number.isFinite(s.top_k) ? s.top_k : null,
+                repetition_penalty: Number.isFinite(s.repetition_penalty) ? s.repetition_penalty : null,
+                min_p: Number.isFinite(s.min_p) ? s.min_p : null,
+                presence_penalty: Number.isFinite(s.presence_penalty) ? s.presence_penalty : null,
+                force_sampling: s.force_sampling,
+                reasoning_parser: s.reasoning_parser || null,
+                // Preserve explicit 0 (means "no TTL"); only fall to null
+                // when the value is missing/non-finite.
+                ttl_seconds: Number.isFinite(s.ttl_seconds) ? s.ttl_seconds : null,
+                index_cache_freq: s.enableIndexCache ? (s.index_cache_freq || 4) : 0,
+                enable_thinking: s.enable_thinking,
+                thinking_budget_enabled: s.enableThinkingBudget,
+                thinking_budget_tokens: s.enableThinkingBudget ? (s.thinking_budget_tokens || null) : 0,
+                max_tool_result_tokens: s.enableToolResultLimit ? (s.max_tool_result_tokens || null) : 0,
+                chat_template_kwargs: Object.keys(chatTemplateKwargs).length > 0 ? chatTemplateKwargs : null,
+                forced_ct_kwargs: forcedCtKwargs.length > 0 ? forcedCtKwargs : null,
+                turboquant_kv_enabled: s.turboquant_kv_enabled,
+                turboquant_kv_bits: s.turboquant_kv_enabled ? (parseFloat(s.turboquant_kv_bits) || 4) : 4,
+                specprefill_enabled: s.specprefill_enabled,
+                specprefill_draft_model: s.specprefill_draft_model || null,
+                specprefill_keep_pct: s.specprefill_enabled ? (parseFloat(s.specprefill_keep_pct) || 0.2) : null,
+                specprefill_threshold: s.specprefill_enabled ? (s.specprefill_threshold || null) : null,
+                dflash_enabled: s.dflash_enabled,
+                dflash_draft_model: s.dflash_draft_model || null,
+                dflash_draft_quant_bits: s.dflash_enabled && s.dflash_draft_quant_bits ? parseInt(s.dflash_draft_quant_bits) : null,
+                dflash_max_ctx: s.dflash_enabled && s.dflash_max_ctx ? parseInt(s.dflash_max_ctx) : null,
+                dflash_in_memory_cache: s.dflash_enabled ? !!s.dflash_in_memory_cache : true,
+                dflash_in_memory_cache_max_entries: s.dflash_enabled
+                    ? (parseInt(s.dflash_in_memory_cache_max_entries) || 4)
+                    : 4,
+                // Panel value is GiB (minimum 1); the wire field is bytes.
+                dflash_in_memory_cache_max_bytes: s.dflash_enabled
+                    ? Math.max(1, parseInt(s.dflash_in_memory_cache_max_gib) || 8) * (1024 ** 3)
+                    : 8 * (1024 ** 3),
+                dflash_ssd_cache: s.dflash_enabled
+                    && !!s.dflash_in_memory_cache
+                    && !!s.dflash_ssd_cache_available
+                    && !!s.dflash_ssd_cache,
+                mtp_enabled: !!s.mtp_enabled,
+                vlm_mtp_enabled: !!s.vlm_mtp_enabled,
+                vlm_mtp_draft_model: s.vlm_mtp_enabled
+                    ? (s.vlm_mtp_draft_model || null)
+                    : null,
+                vlm_mtp_draft_block_size: s.vlm_mtp_enabled && s.vlm_mtp_draft_block_size
+                    ? parseInt(s.vlm_mtp_draft_block_size)
+                    : null,
+                trust_remote_code: !!s.trust_remote_code,
+            };
+        },
+
+        // Engine-init-class fields. Toggling these requires the engine
+        // to be reloaded. Both bench tabs reload the model on every run
+        // so it costs nothing here, but we surface a hint in the panel.
+        // Keys are panel-state names (e.g. enableIndexCache, *_max_gib),
+        // not wire names, because _panelRequiresReload indexes the state.
+        _ENGINE_INIT_KEYS: [
+            'turboquant_kv_enabled', 'turboquant_kv_bits',
+            'dflash_enabled', 'dflash_draft_model', 'dflash_draft_quant_bits',
+            'dflash_max_ctx', 'dflash_in_memory_cache',
+            'dflash_in_memory_cache_max_entries',
+            'dflash_in_memory_cache_max_gib', 'dflash_ssd_cache',
+            'specprefill_enabled', 'specprefill_draft_model',
+            'specprefill_keep_pct', 'specprefill_threshold',
+            'mtp_enabled',
+            'vlm_mtp_enabled', 'vlm_mtp_draft_model', 'vlm_mtp_draft_block_size',
+            'enableIndexCache', 'index_cache_freq',
+            'reasoning_parser', 'model_type_override', 'trust_remote_code',
+        ],
+
+        // Generic panel ops keyed by prefix ('bench' | 'acc'). Each tab
+        // exposes thin wrappers below for templates to call by name.
+
+        // Rebuild <prefix>Settings and its baseline snapshot from the model
+        // currently selected on that tab, clear transient status, then fetch
+        // that model's profile list in the background (not awaited).
+        _panelHydrate(prefix) {
+            const modelIdField = prefix === 'bench' ? 'benchModelId' : 'accModelId';
+            const modelId = this[modelIdField];
+            const model = (this.models || []).find(m => m.id === modelId) || null;
+            const state = this._modelToSettingsState(model);
+            this[prefix + 'Settings'] = state;
+            // Deep copy so later edits to the live state don't alter the baseline.
+            this[prefix + 'SettingsBaseline'] = JSON.parse(JSON.stringify(state));
+            this[prefix + 'SettingsStatus'] = '';
+            this[prefix + 'SettingsSaveProfileOpen'] = false;
+            // Echo modal's "active profile" tracking so the matching pill
+            // can render highlighted when the bench model has one set.
+            this[prefix + 'ActiveProfileName'] = (model && model.settings &&
+                model.settings.active_profile_name) || null;
+
+            // Pull the model's profile list so the pills row can render.
+            // Templates + presets are shared globals already loaded at init().
+            this[prefix + 'Profiles'] = [];
+            if (modelId) {
+                // The response may arrive after the user picked another model;
+                // every write below (data AND error status) is gated on the
+                // selection still matching, so a stale reply cannot leak into
+                // the newly selected model's panel.
+                const stillCurrent = () => this[modelIdField] === modelId;
+                fetch(`/admin/api/models/${encodeURIComponent(modelId)}/profiles`)
+                    .then(async r => {
+                        // Honor the session-expiry redirect contract used
+                        // elsewhere in the dashboard for /admin/api fetches.
+                        if (r.status === 401) {
+                            window.location.href = '/admin';
+                            return null;
+                        }
+                        if (!r.ok) {
+                            if (stillCurrent()) {
+                                this[prefix + 'SettingsStatus'] =
+                                    `Failed to load profiles (HTTP ${r.status}).`;
+                            }
+                            return null;
+                        }
+                        return r.json();
+                    })
+                    .then(data => {
+                        if (data && stillCurrent()) {
+                            this[prefix + 'Profiles'] = data.profiles || [];
+                        }
+                    })
+                    .catch(e => {
+                        console.error(`Failed to load ${prefix} profiles:`, e);
+                        if (stillCurrent()) {
+                            this[prefix + 'SettingsStatus'] =
+                                `Failed to load profiles: ${e}`;
+                        }
+                    });
+            }
+        },
+        // True when the live panel state differs from the hydrated baseline
+        // (compared via JSON.stringify, so key order matters).
+        _panelDirty(prefix) {
+            const cur = this[prefix + 'Settings'];
+            const base = this[prefix + 'SettingsBaseline'];
+            if (!cur || !base) return false;
+            return JSON.stringify(cur) !== JSON.stringify(base);
+        },
+        // True when any key listed in _ENGINE_INIT_KEYS differs from baseline.
+        _panelRequiresReload(prefix) {
+            const cur = this[prefix + 'Settings'];
+            const base = this[prefix + 'SettingsBaseline'];
+            if (!cur || !base) return false;
+            for (const k of this._ENGINE_INIT_KEYS) {
+                if (JSON.stringify(cur[k]) !== JSON.stringify(base[k])) return true;
+            }
+            return false;
+        },
+        // Discard panel edits by restoring a deep copy of the baseline.
+        _panelReset(prefix) {
+            this[prefix + 'Settings'] = JSON.parse(JSON.stringify(this[prefix + 'SettingsBaseline']));
+            this[prefix + 'SettingsStatus'] = '';
+        },
+        // Persist the panel state via PUT /admin/api/models/{id}/settings.
+        // On success the model list is reloaded and the panel re-hydrated so
+        // the dirty check resets; failures are reported in <prefix>SettingsStatus.
+        async _panelSave(prefix) {
+            const modelIdField = prefix === 'bench' ? 'benchModelId' : 'accModelId';
+            const modelId = this[modelIdField];
+            if (!modelId) return;
+            this[prefix + 'SettingsSaving'] = true;
+            this[prefix + 'SettingsStatus'] = '';
+            try {
+                const payload = this._settingsStateToPayload(this[prefix + 'Settings']);
+                const response = await fetch(
+                    `/admin/api/models/${encodeURIComponent(modelId)}/settings`,
+                    { method: 'PUT', headers: { 'Content-Type': 'application/json' },
+                      body: JSON.stringify(payload) }
+                );
+                if (response.ok) {
+                    await this.loadModels();
+                    // Re-hydrate baseline from the refreshed model entry so
+                    // dirty flips back to false after a successful save.
+                    // Skip when the selection changed while the request was
+                    // in flight: the model-id $watch already hydrated the new
+                    // model, and hydrating again would throw away any edits
+                    // made to it since (and flash "saved" on the wrong model).
+                    if (this[modelIdField] === modelId) {
+                        this._panelHydrate(prefix);
+                        this[prefix + 'SettingsRecentlySaved'] = true;
+                        setTimeout(() => {
+                            this[prefix + 'SettingsRecentlySaved'] = false;
+                        }, 1500);
+                    }
+                } else if (response.status === 401) {
+                    window.location.href = '/admin';
+                } else {
+                    const data = await response.json().catch(() => ({}));
+                    this[prefix + 'SettingsStatus'] = data.detail || 'Save failed.';
+                }
+            } catch (err) {
+                console.error(`Failed to save ${prefix} settings:`, err);
+                this[prefix + 'SettingsStatus'] = `Save failed: ${err.message}`;
+            } finally {
+                this[prefix + 'SettingsSaving'] = false;
+            }
+        },
+        // Open the "save as profile" form with blank fields (no-op when no
+        // model is selected on that tab).
+        _panelSaveAsProfile(prefix) {
+            const modelIdField = prefix === 'bench' ? 'benchModelId' : 'accModelId';
+            if (!this[modelIdField]) return;
+            this[prefix + 'SettingsNewProfile'] = { display_name: '', description: '' };
+            this[prefix + 'SettingsSaveProfileOpen'] = true;
+            this[prefix + 'SettingsStatus'] = '';
+        },
+        // Create a profile from the current panel state via
+        // POST /admin/api/models/{id}/profiles, then refresh the pill list.
+        async _panelSaveAsProfileConfirm(prefix) {
+            const modelIdField = prefix === 'bench' ? 'benchModelId' : 'accModelId';
+            const modelId = this[modelIdField];
+            const np = this[prefix + 'SettingsNewProfile'];
+            if (!modelId || !np?.display_name?.trim()) return;
+            // Snapshot the name now: the form object stays editable while the
+            // requests below are in flight.
+            const displayName = np.display_name.trim();
+            const autoId = 'p-' + Date.now().toString(36) + '-' +
+                Math.random().toString(36).slice(2, 6);
+            // formValuesForProfile() reads this.modelSettings; aim it at
+            // the panel state for one call, then restore. Avoids
+            // re-implementing the chat_template_kwargs flattening logic.
+            const savedMs = this.modelSettings;
+            this.modelSettings = this[prefix + 'Settings'];
+            let profileSettings;
+            try {
+                profileSettings = this.formValuesForProfile();
+            } finally {
+                this.modelSettings = savedMs;
+            }
+            const body = {
+                name: autoId,
+                display_name: displayName,
+                description: np.description?.trim() || null,
+                settings: profileSettings,
+                also_save_as_template: false,
+            };
+            try {
+                const r = await fetch(
+                    `/admin/api/models/${encodeURIComponent(modelId)}/profiles`,
+                    { method: 'POST', headers: { 'Content-Type': 'application/json' },
+                      body: JSON.stringify(body) }
+                );
+                if (r.ok) {
+                    this[prefix + 'SettingsSaveProfileOpen'] = false;
+                    this[prefix + 'SettingsStatus'] = `Profile "${displayName}" created.`;
+                    // Refresh the panel's profile list so the new pill
+                    // shows up immediately. Don't re-hydrate everything;
+                    // just refetch the profiles for this model.
+                    try {
+                        const pr = await fetch(`/admin/api/models/${encodeURIComponent(modelId)}/profiles`);
+                        // Same stale-response check as _panelHydrate: never
+                        // overwrite another model's list with this one's.
+                        if (pr.ok && this[modelIdField] === modelId) {
+                            const pd = await pr.json();
+                            this[prefix + 'Profiles'] = pd.profiles || [];
+                        }
+                    } catch (_) { /* non-fatal */ }
+                } else if (r.status === 401) {
+                    window.location.href = '/admin';
+                } else {
+                    const data = await r.json().catch(() => ({}));
+                    this[prefix + 'SettingsStatus'] = data.detail || 'Profile save failed.';
+                }
+            } catch (e) {
+                this[prefix + 'SettingsStatus'] = String(e);
+            }
+        },
+
+        // Apply a profile / template / preset to the panel state.
+        // These are intentionally ephemeral by default — they mutate
+        // only the panel's state object via pure merge helpers, and
+        // never call applyProfileToForm/applyTemplateToForm (which
+        // POST to /admin/api/models/{selectedModel.id}/...) — those
+        // would target the modal's selectedModel and persist server-
+        // side, breaking the ephemeral-override contract and possibly
+        // hitting the wrong model entirely.
+ _panelApplyProfile(prefix, profile) { + this._mergeProfileSettingsIntoState( + this[prefix + 'Settings'], + profile.settings || {}, + ); + this[prefix + 'ActiveProfileName'] = profile.name; + this[prefix + 'SettingsStatus'] = `Applied profile "${profile.display_name || profile.name}".`; + }, + _panelApplyTemplate(prefix, template) { + // Templates carry the same shape as profiles. We merge them + // directly into panel state — unlike applyTemplateToForm, we + // do NOT create a server-side profile from the template + // (that's a write-action the user hasn't asked for from + // the bench panels). + this._mergeProfileSettingsIntoState( + this[prefix + 'Settings'], + template.settings || {}, + ); + // Templates aren't profiles — clear the active-profile highlight. + this[prefix + 'ActiveProfileName'] = null; + this[prefix + 'SettingsStatus'] = `Applied template "${template.display_name || template.name}".`; + }, + _panelApplyPreset(prefix, preset) { + const state = this[prefix + 'Settings']; + // Reset first so previous preset's fields don't bleed through. 
+ this._resetStatePresetFields(state); + this._mergePresetSettingsIntoState(state, preset.settings || {}); + this[prefix + 'ActiveProfileName'] = null; + this[prefix + 'SettingsStatus'] = `Applied preset "${preset.display_name || preset.name}".`; + }, + + // ---- Bench (throughput) tab — template-facing API ---- + get benchSelectedModel() { + return (this.models || []).find(m => m.id === this.benchModelId) || null; + }, + benchSettingsDirty() { return this._panelDirty('bench'); }, + benchSettingsRequiresReload() { return this._panelRequiresReload('bench'); }, + benchSettingsReset() { this._panelReset('bench'); }, + benchSettingsSave() { return this._panelSave('bench'); }, + benchSettingsSaveAsProfile() { this._panelSaveAsProfile('bench'); }, + benchSettingsSaveAsProfileConfirm() { return this._panelSaveAsProfileConfirm('bench'); }, + benchApplyProfile(p) { return this._panelApplyProfile('bench', p); }, + benchApplyTemplate(t) { return this._panelApplyTemplate('bench', t); }, + benchApplyPreset(p) { this._panelApplyPreset('bench', p); }, + + // ---- Accuracy tab — template-facing API ---- + get accSelectedModel() { + return (this.models || []).find(m => m.id === this.accModelId) || null; + }, + accSettingsDirty() { return this._panelDirty('acc'); }, + accSettingsRequiresReload() { return this._panelRequiresReload('acc'); }, + accSettingsReset() { this._panelReset('acc'); }, + accSettingsSave() { return this._panelSave('acc'); }, + accSettingsSaveAsProfile() { this._panelSaveAsProfile('acc'); }, + accSettingsSaveAsProfileConfirm() { return this._panelSaveAsProfileConfirm('acc'); }, + accApplyProfile(p) { return this._panelApplyProfile('acc', p); }, + accApplyTemplate(t) { return this._panelApplyTemplate('acc', t); }, + accApplyPreset(p) { this._panelApplyPreset('acc', p); }, } } diff --git a/omlx/admin/templates/base.html b/omlx/admin/templates/base.html index d8c3a2c9a..ac4493505 100644 --- a/omlx/admin/templates/base.html +++ b/omlx/admin/templates/base.html @@ 
-129,33 +129,80 @@ } function replaceIcon(el) { - var name = el.getAttribute('data-lucide'); - if (!name) return; - var def = lucide.icons[toPascal(name)]; - if (!def) return; - var svg = lucide.createElement(def); - Array.from(el.attributes).forEach(function(a) { - if (a.name !== 'data-lucide') svg.setAttribute(a.name, a.value); - }); - svg.classList.add('lucide', 'lucide-' + name); - if (el.parentNode) el.parentNode.replaceChild(svg, el); + // Per-icon try/catch: a single failure must NOT abort the rest + // of the pass (forEach does not isolate iteration exceptions, + // so one throw here previously left every later + // unprocessed — a second-tab regression). + try { + var name = el.getAttribute('data-lucide'); + if (!name) return; + var def = lucide.icons[toPascal(name)]; + if (!def) return; + var svg = lucide.createElement(def); + Array.from(el.attributes).forEach(function(a) { + if (a.name !== 'data-lucide') svg.setAttribute(a.name, a.value); + }); + svg.classList.add('lucide', 'lucide-' + name); + if (el.parentNode) el.parentNode.replaceChild(svg, el); + } catch (e) { + if (window.console && console.warn) { + console.warn('lucide: failed to render icon', el, e); + } + } } function processAll() { document.querySelectorAll('i[data-lucide]').forEach(replaceIcon); } - // Initial pass + // Initial pass. if (document.readyState === 'loading') { document.addEventListener('DOMContentLoaded', processAll); } else { processAll(); } - // Poll for dynamically added icons (Alpine x-for, x-if, modals). - setInterval(function() { - if (document.querySelector('i[data-lucide]')) processAll(); - }, 300); + // MutationObserver replaces the previous 300ms poll. It reacts + // immediately to: + // - newly inserted nodes (Alpine x-for / x-if, + // modal open, dynamically rendered partials) + // - attribute mutations on existing elements (Alpine + // :data-lucide bindings resolving once the component mounts). 
+ // No idle CPU cost, and zero latency between DOM availability and + // icon replacement. + function startObserving() { + var observer = new MutationObserver(function(mutations) { + for (var i = 0; i < mutations.length; i++) { + var m = mutations[i]; + if (m.type === 'childList') { + m.addedNodes.forEach(function(node) { + if (node.nodeType !== 1) return; + if (node.matches && node.matches('i[data-lucide]')) { + replaceIcon(node); + } + if (node.querySelectorAll) { + node.querySelectorAll('i[data-lucide]').forEach(replaceIcon); + } + }); + } else if (m.type === 'attributes' && + m.attributeName === 'data-lucide' && + m.target.tagName === 'I') { + replaceIcon(m.target); + } + } + }); + observer.observe(document.body, { + childList: true, + subtree: true, + attributes: true, + attributeFilter: ['data-lucide'] + }); + } + if (document.readyState === 'loading') { + document.addEventListener('DOMContentLoaded', startObserving); + } else { + startObserving(); + } })(); diff --git a/omlx/admin/templates/dashboard.html b/omlx/admin/templates/dashboard.html index 23be93ab4..8847e2a62 100644 --- a/omlx/admin/templates/dashboard.html +++ b/omlx/admin/templates/dashboard.html @@ -39,6 +39,21 @@ {% include "dashboard/_modal_model_settings.html" %} + + {% from "dashboard/_modal_bench_settings.html" import bench_settings_modal %} + {{ bench_settings_modal( + 'bench', + run_fn='startBenchmark()', + run_disabled_expr='!benchModelId || Object.values(benchPromptLengths).every(v => !v) || benchRunning', + placeholder='e.g. low-temp-deterministic' + ) }} + {{ bench_settings_modal( + 'acc', + run_fn='addToAccQueue()', + run_disabled_expr='!accModelId || !Object.values(accBenchmarks).some(v => v) || accRunning', + placeholder='e.g. high-recall-mmlu' + ) }} +
{{ t('bench.headi {{ t('bench.config.section_label') }}
+
diff --git a/omlx/admin/templates/dashboard/_bench_accuracy.html b/omlx/admin/templates/dashboard/_bench_accuracy.html index 4dac894dd..b60091b31 100644 --- a/omlx/admin/templates/dashboard/_bench_accuracy.html +++ b/omlx/admin/templates/dashboard/_bench_accuracy.html @@ -18,6 +18,15 @@

{{ t('acc_bench.h {{ t('acc_bench.config.section_label') }}

+
diff --git a/omlx/admin/templates/dashboard/_modal_bench_settings.html b/omlx/admin/templates/dashboard/_modal_bench_settings.html new file mode 100644 index 000000000..204713f44 --- /dev/null +++ b/omlx/admin/templates/dashboard/_modal_bench_settings.html @@ -0,0 +1,228 @@ +{# Run-time settings modal for the benchmark tabs. + + Renders the same field set as the model-settings modal, but bound to a + per-tab state object (benchSettings or accSettings). Changes are + ephemeral by default — applied to the next bench run only — with + explicit Save / Save-as-Profile / Run / Reset actions. + + Parameters: + prefix 'bench' or 'acc' — used to derive every Alpine binding + (e.g. benchSettingsOpen, accSettingsReset(), …). + run_fn JS expression invoked by the Run benchmark button + (e.g. 'startBenchmark()'). + run_disabled_expr JS expression that evaluates true when the Run + benchmark button must be disabled. + placeholder Suggested profile name shown in the inline create form. +#} +{% from "dashboard/_settings_fields.html" import settings_fields %} + +{% macro bench_settings_modal(prefix, run_fn, run_disabled_expr, placeholder='') -%} +{%- set state = prefix ~ 'Settings' -%} +{%- set selected_model = prefix ~ 'SelectedModel' -%} +{%- set running = prefix ~ 'Running' -%} +{%- set open_flag = prefix ~ 'SettingsOpen' -%} +{%- set saving = prefix ~ 'SettingsSaving' -%} +{%- set recently_saved = prefix ~ 'SettingsRecentlySaved' -%} +{%- set status = prefix ~ 'SettingsStatus' -%} +{%- set save_open = prefix ~ 'SettingsSaveProfileOpen' -%} +{%- set new_profile = prefix ~ 'SettingsNewProfile' -%} +{%- set save_confirm = prefix ~ 'SettingsSaveAsProfileConfirm()' -%} +{%- set save_cancel = prefix ~ 'SettingsSaveProfileOpen = false' -%} +{%- set dirty_fn = prefix ~ 'SettingsDirty()' -%} +{%- set reload_fn = prefix ~ 'SettingsRequiresReload()' -%} +{%- set reset_fn = prefix ~ 'SettingsReset()' -%} +{%- set save_fn = prefix ~ 'SettingsSave()' -%} +{%- set save_as_fn = prefix ~ 
'SettingsSaveAsProfile()' -%} +{%- set profiles = prefix ~ 'Profiles' -%} +{%- set scope = prefix ~ 'ProfileScope' -%} +{%- set active_profile = prefix ~ 'ActiveProfileName' -%} +{%- set apply_preset = prefix ~ 'ApplyPreset' -%} +{%- set apply_template = prefix ~ 'ApplyTemplate' -%} +{%- set apply_profile = prefix ~ 'ApplyProfile' -%} + +
+
+ +
+
+ + +
+
+
+

Run-time Settings

+ modified +
+

+

+ Changes apply to the next run only. Use Save to persist, or Save as Profile to bookmark. +

+
+ + +
+ + +
+
+ +

+ + Engine-init flags changed (TurboQuant / DFlash / MTP / IndexCache). The bench reloads the model on each run, so no extra cost. +

+ + +
+
+

Apply Profile / Template

+
+ + + +
+
+ + +
+ + no presets available +
+ + +
+ + no templates yet — create them in the model settings modal +
+ + +
+ + no profiles for this model — use Save as Profile below to create one +
+
+ + + {{ settings_fields(state, selected_model, mode='panel') }} + + +
+ + + +
+ + +
+
+ + +
+ + + + + + + + + +
+
+
+
+{%- endmacro %} diff --git a/omlx/admin/templates/dashboard/_modal_model_settings.html b/omlx/admin/templates/dashboard/_modal_model_settings.html index 31cb4d6f8..cdf4d7527 100644 --- a/omlx/admin/templates/dashboard/_modal_model_settings.html +++ b/omlx/admin/templates/dashboard/_modal_model_settings.html @@ -251,682 +251,9 @@

{{ t('m class="fixed z-50 px-2.5 py-1 text-xs font-medium text-white bg-neutral-800 rounded-md whitespace-nowrap pointer-events-none -translate-x-1/2" :style="`top: ${tip.y}px; left: ${tip.x}px`">

- -
- -
-

{{ t('modal.model_settings.basic_label') }}

-
- -
-
- - -
-
- - -
-
- - -
-
- - - - - -
-
- - -
-
- -
-
- -

{{ t('modal.model_settings.empty_hint') }}

-
-
- - -
-

{{ t('modal.model_settings.advanced_label') }}

-
- -
-
-
- {{ t('modal.model_settings.enable_thinking') }} -

{{ t('modal.model_settings.enable_thinking_hint') }}

-
- -
-

-

-
- - -
-
-
- {{ t('modal.model_settings.thinking_budget') }} -

{{ t('modal.model_settings.thinking_budget_hint') }}

-
- -
-
- -
-
- - -
-
-
- {{ t('modal.model_settings.limit_tool_result') }} -

{{ t('modal.model_settings.limit_tool_result_hint') }}

-
- -
-
- -
-
- - -
-
-
- {{ t('modal.model_settings.force_sampling') }} -

{{ t('modal.model_settings.force_sampling_hint') }}

-
- -
-
- - -
-
-
- {{ t('modal.model_settings.trust_remote_code') }} -

{{ t('modal.model_settings.trust_remote_code_hint') }}

-
- -
-
- - -
-
-
- {{ t('modal.model_settings.chat_template_kwargs') }} -

{{ t('modal.model_settings.chat_template_kwargs_hint') }}

-
- -
- -
- - - -
-
-
- - - - - -

{{ t('modal.model_settings.no_kwargs') }}

-
- - -

{{ t('modal.model_settings.experimental_label') }}

- - -
-
-
- {{ t('modal.model_settings.turboquant_kv') }} -

{{ t('modal.model_settings.turboquant_kv_hint') }}

-
- -
-
- - -
-
- - - - - -
-
-
- SpecPrefill -

Attention-based sparse prefill for MoE/hybrid models. (Paper) (HuggingFace)

-
- -
-
-
- - -

Small model sharing tokenizer with target (e.g. Qwen3.5-0.8B for 35B)

-
-
- - -
-
- - -

Min tokens to trigger (shorter prompts use full prefill)

-
-
-
- - -
-
-
- DFlash -

Block diffusion speculative decoding for 3-4x faster generation. Supports Qwen (3, 3.5, 3.6) and Gemma4 model families. Requires a DFlash draft model checkpoint.
Single-stream only: requests run one at a time.
* MLX impl by bstnxbt(GitHub)

-

-
- -
-
-
- - -

DFlash draft checkpoint (e.g. z-lab/Qwen3-4B-DFlash-b16, z-lab/gemma-4-26B-A4B-it-DFlash). Note: -DFlash suffix only; -assistant variants are for MTP.

-
-
-
-
- Quantization -

Enable quantization for the draft model (weight, activation bits & group size).

-
- -
-
-
-
- - -
-
- - -
-
- - -
-
-
- - -

Prompts at or above this token count switch to BatchedEngine. Leave empty for unlimited.

-
-
-

Long-context tuning

-
-
- - -

Draft model sliding-attention window. Helps stabilise acceptance on long contexts. Leave empty for dflash default (1024).

-
-
- - -

Attention-sink tokens always kept regardless of window. Leave empty for dflash default (64).

-
-
-
- - -

Verifier algorithm. "adaptive" shrinks block size when acceptance drops; "off" disables speculative verify.

-
-
-
-
- In-memory cache -

DFlash L1 prefix snapshot cache in RAM. Speeds up multi-turn chats with shared prefixes.

-
- -
-
- - -

Maximum number of prefix snapshots kept in L1 cache. Each entry stores KV + draft GDN state for one conversation prefix.

-
-
- - -

Byte budget for L1 snapshots; LRU evicts when exceeded.

-
-
-
- SSD cache -

L2 spill of evicted L1 entries to disk. Uses the oMLX paged SSD cache directory (dflash_l2/).

-

Enable oMLX paged SSD cache first (--paged-ssd-cache-dir).

-

Requires in-memory cache to be enabled.

-
- -
-
-
- - -
-
-
- {{ t('modal.model_settings.mtp') }} -

{{ t('modal.model_settings.mtp_hint') | safe }}

-

-

{{ t('modal.model_settings.mtp_conflict') }}

-
- -
-
- - -
-
-
- {{ t('modal.model_settings.vlm_mtp') }} -

{{ t('modal.model_settings.vlm_mtp_hint') | safe }}

-

{{ t('modal.model_settings.vlm_mtp_conflict') }}

-
- -
-
- - - - -
-
-
-
-
+ + {% from "dashboard/_settings_fields.html" import settings_fields %} + {{ settings_fields('modelSettings') }}
diff --git a/omlx/admin/templates/dashboard/_settings_fields.html b/omlx/admin/templates/dashboard/_settings_fields.html new file mode 100644 index 000000000..5f55e5e97 --- /dev/null +++ b/omlx/admin/templates/dashboard/_settings_fields.html @@ -0,0 +1,695 @@ +{# + Shared settings-fields macro for the model-settings modal and the bench-tab + inline panels. Renders the Basic + Advanced columns. Profile/template + management, modal chrome, and the Save/Cancel actions stay in the parent + template. + + Args: + state — Alpine state object holding the form values + (e.g. "modelSettings", "benchSettings", "accSettings"). + selected_model — Alpine expression for the model currently being edited + (e.g. "selectedModel", "benchSelectedModel"). + mode — "modal" or "panel". Modal-only chrome (the + "Load Defaults" button) is hidden in panel mode so the + macro doesn't depend on modal-scope helpers. +#} +{% macro settings_fields(state, selected_model='selectedModel', mode='modal') %} +
+ +
+

{{ t('modal.model_settings.basic_label') }}

+
+ +
+
+ + +
+
+ + +
+
+ + +
+
+ + + + + +
+
+ + +
+ {% if mode == 'modal' %} +
+ +
+ {% endif %} +
+ +

{{ t('modal.model_settings.empty_hint') }}

+
+
+ + +
+

{{ t('modal.model_settings.advanced_label') }}

+
+ +
+
+
+ {{ t('modal.model_settings.enable_thinking') }} +

{{ t('modal.model_settings.enable_thinking_hint') }}

+
+ +
+

+

+
+ + +
+
+
+ {{ t('modal.model_settings.thinking_budget') }} +

{{ t('modal.model_settings.thinking_budget_hint') }}

+
+ +
+
+ +
+
+ + +
+
+
+ {{ t('modal.model_settings.limit_tool_result') }} +

{{ t('modal.model_settings.limit_tool_result_hint') }}

+
+ +
+
+ +
+
+ + +
+
+
+ {{ t('modal.model_settings.force_sampling') }} +

{{ t('modal.model_settings.force_sampling_hint') }}

+
+ +
+
+ + +
+
+
+ {{ t('modal.model_settings.trust_remote_code') }} +

{{ t('modal.model_settings.trust_remote_code_hint') }}

+
+ +
+
+ + +
+
+
+ {{ t('modal.model_settings.chat_template_kwargs') }} +

{{ t('modal.model_settings.chat_template_kwargs_hint') }}

+
+ +
+ +
+ + + +
+
+
+ + + + + +

{{ t('modal.model_settings.no_kwargs') }}

+
+ + +

{{ t('modal.model_settings.experimental_label') }}

+ + +
+
+
+ {{ t('modal.model_settings.turboquant_kv') }} +

{{ t('modal.model_settings.turboquant_kv_hint') }}

+
+ +
+
+ + +
+
+ + + + + +
+
+
+ SpecPrefill +

Attention-based sparse prefill for MoE/hybrid models. (Paper) (HuggingFace)

+
+ +
+
+
+ + +

Small model sharing tokenizer with target (e.g. Qwen3.5-0.8B for 35B)

+
+
+ + +
+
+ + +

Min tokens to trigger (shorter prompts use full prefill)

+
+
+
+ + +
+
+
+ DFlash +

Block diffusion speculative decoding for 3-4x faster generation. Supports Qwen (3, 3.5, 3.6) and Gemma4 model families. Requires a DFlash draft model checkpoint.
Single-stream only: requests run one at a time.
* MLX impl by bstnxbt(GitHub)

+

+
+ +
+
+
+ + +

DFlash draft checkpoint (e.g. z-lab/Qwen3-4B-DFlash-b16, z-lab/gemma-4-26B-A4B-it-DFlash). Note: -DFlash suffix only; -assistant variants are for MTP.

+
+
+
+
+ Quantization +

Enable quantization for the draft model (weight, activation bits & group size).

+
+ +
+
+
+
+ + +
+
+ + +
+
+ + +
+
+
+ + +

Prompts at or above this token count switch to BatchedEngine. Leave empty for unlimited.

+
+
+

Long-context tuning

+
+
+ + +

Draft model sliding-attention window. Helps stabilise acceptance on long contexts. Leave empty for dflash default (1024).

+
+
+ + +

Attention-sink tokens always kept regardless of window. Leave empty for dflash default (64).

+
+
+
+ + +

Verifier algorithm. "adaptive" shrinks block size when acceptance drops; "off" disables speculative verify.

+
+
+
+
+ In-memory cache +

DFlash L1 prefix snapshot cache in RAM. Speeds up multi-turn chats with shared prefixes.

+
+ +
+
+ + +

Maximum number of prefix snapshots kept in L1 cache. Each entry stores KV + draft GDN state for one conversation prefix.

+
+
+ + +

Byte budget for L1 snapshots; LRU evicts when exceeded.

+
+
+
+ SSD cache +

L2 spill of evicted L1 entries to disk. Uses the oMLX paged SSD cache directory (dflash_l2/).

+

Enable oMLX paged SSD cache first (--paged-ssd-cache-dir).

+

Requires in-memory cache to be enabled.

+
+ +
+
+
+ + +
+
+
+ {{ t('modal.model_settings.mtp') }} +

{{ t('modal.model_settings.mtp_hint') | safe }}

+

+

{{ t('modal.model_settings.mtp_conflict') }}

+
+ +
+
+ + +
+
+
+ {{ t('modal.model_settings.vlm_mtp') }} +

{{ t('modal.model_settings.vlm_mtp_hint') | safe }}

+

{{ t('modal.model_settings.vlm_mtp_conflict') }}

+
+ +
+
+ + + + +
+
+
+
+
+ +{% endmacro %} diff --git a/omlx/model_settings.py b/omlx/model_settings.py index 4140fcbc8..d83d97169 100644 --- a/omlx/model_settings.py +++ b/omlx/model_settings.py @@ -5,13 +5,14 @@ flags, and metadata. """ +import contextlib import copy import json import logging import threading from dataclasses import dataclass, field, fields from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Dict, Iterator, Optional from .model_profiles import ( filter_profile_fields, @@ -266,6 +267,11 @@ def __init__(self, base_path: Path): self._settings: Dict[str, ModelSettings] = {} self._profiles: Dict[str, Dict[str, Dict[str, Any]]] = {} self._templates: Dict[str, Dict[str, Any]] = {} + # Ephemeral override layers keyed by model_id. Each value is a list of + # (token, override_dict) tuples; later entries win during merge. + # Tokens identify which entry to pop on exit so out-of-order context + # managers don't corrupt each other's state. + self._overrides: Dict[str, list[tuple[object, Dict[str, Any]]]] = {} # Ensure base directory exists self.base_path.mkdir(parents=True, exist_ok=True) @@ -346,6 +352,11 @@ def _save(self) -> None: def get_settings(self, model_id: str) -> ModelSettings: """Get settings for a specific model. + Returns persisted settings merged with any active ephemeral overrides + from ``ephemeral_overrides``. Override layers stack (later wins), and + a key set to ``None`` in an override is treated as "use the layer + beneath this one" so callers can express "don't touch this field". + Args: model_id: The model identifier. 
@@ -354,11 +365,67 @@ def get_settings(self, model_id: str) -> ModelSettings: """ with self._lock: if model_id in self._settings: - # Return a copy to prevent external modification - settings = self._settings[model_id] - return ModelSettings.from_dict(settings.to_dict()) + base = self._settings[model_id].to_dict() + else: + base = ModelSettings().to_dict() + + for _token, layer in self._overrides.get(model_id, ()): + for key, value in layer.items(): + if value is None: + continue + base[key] = value + + return ModelSettings.from_dict(base) + + @contextlib.contextmanager + def ephemeral_overrides( + self, model_id: str, overrides: Optional[Dict[str, Any]] + ) -> Iterator[None]: + """Apply overrides on top of persisted settings for ``model_id``. + + Inside the ``with`` block, ``get_settings(model_id)`` returns the + persisted settings shallow-merged with ``overrides`` (override values + win, except ``None`` which defers to the layer beneath). On exit — + normal or via exception — the overrides are removed. + + Unknown keys (i.e. not fields of :class:`ModelSettings`) are dropped + with a warning. Stacking is supported; exits are matched by token, so + out-of-order exits are safe. + + Yields ``None`` for use as a context manager. When ``overrides`` is + ``None`` or empty the context is a no-op. 
+ """ + if not overrides: + yield + return - return ModelSettings() + valid_keys = {f.name for f in fields(ModelSettings)} + unknown = [k for k in overrides if k not in valid_keys] + if unknown: + logger.warning( + f"ephemeral_overrides: dropping unknown keys for " + f"'{model_id}': {unknown}" + ) + cleaned = {k: v for k, v in overrides.items() if k in valid_keys} + if not cleaned: + yield + return + + token = object() + with self._lock: + self._overrides.setdefault(model_id, []).append((token, cleaned)) + + try: + yield + finally: + with self._lock: + stack = self._overrides.get(model_id) + if stack is not None: + self._overrides[model_id] = [ + entry for entry in stack if entry[0] is not token + ] + if not self._overrides[model_id]: + del self._overrides[model_id] def set_settings(self, model_id: str, settings: ModelSettings) -> None: """Set settings for a specific model. diff --git a/tests/test_accuracy_benchmark.py b/tests/test_accuracy_benchmark.py index 8159b0121..9ff26f2b4 100644 --- a/tests/test_accuracy_benchmark.py +++ b/tests/test_accuracy_benchmark.py @@ -75,6 +75,21 @@ def test_enable_thinking_true(self): ) assert req.enable_thinking is True + def test_settings_override_defaults_to_none(self): + req = AccuracyBenchmarkRequest( + model_id="test-model", + benchmarks={"mmlu": 100}, + ) + assert req.settings_override is None + + def test_settings_override_accepts_dict(self): + req = AccuracyBenchmarkRequest( + model_id="test-model", + benchmarks={"mmlu": 100}, + settings_override={"temperature": 0.0, "top_p": 1.0}, + ) + assert req.settings_override == {"temperature": 0.0, "top_p": 1.0} + class TestQueueAndResults: def setup_method(self): diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index bb519e395..8506c8821 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -84,6 +84,29 @@ def test_default_generation_length(self): ) assert req.generation_length == 128 + def test_settings_override_defaults_to_none(self): + req = 
BenchmarkRequest( + model_id="test-model", + prompt_lengths=[1024], + ) + assert req.settings_override is None + + def test_settings_override_accepts_dict(self): + req = BenchmarkRequest( + model_id="test-model", + prompt_lengths=[1024], + settings_override={ + "temperature": 0.1, + "turboquant_kv_enabled": True, + "turboquant_kv_bits": 4, + }, + ) + assert req.settings_override == { + "temperature": 0.1, + "turboquant_kv_enabled": True, + "turboquant_kv_bits": 4, + } + # ============================================================================= # Prompt generation tests diff --git a/tests/test_model_settings.py b/tests/test_model_settings.py index 661b4483e..f9c0e0713 100644 --- a/tests/test_model_settings.py +++ b/tests/test_model_settings.py @@ -469,3 +469,227 @@ def worker(model_id): t.join() assert len(errors) == 0 + + +class TestEphemeralOverrides: + """Tests for ModelSettingsManager.ephemeral_overrides context manager. + + The override layer is the foundation of the bench-tab inline settings + panel: it lets a benchmark run apply per-run overrides without writing + to model_settings.json. + """ + + def test_noop_when_overrides_empty(self): + with tempfile.TemporaryDirectory() as tmpdir: + manager = ModelSettingsManager(Path(tmpdir)) + manager.set_settings("m", ModelSettings(temperature=0.5)) + + with manager.ephemeral_overrides("m", None): + assert manager.get_settings("m").temperature == 0.5 + with manager.ephemeral_overrides("m", {}): + assert manager.get_settings("m").temperature == 0.5 + + def test_overrides_apply_inside_and_revert_after(self): + with tempfile.TemporaryDirectory() as tmpdir: + manager = ModelSettingsManager(Path(tmpdir)) + manager.set_settings("m", ModelSettings(temperature=0.5, top_p=0.9)) + + with manager.ephemeral_overrides("m", {"temperature": 0.1}): + s = manager.get_settings("m") + assert s.temperature == 0.1 + # Untouched fields keep persisted values. 
+ assert s.top_p == 0.9 + + after = manager.get_settings("m") + assert after.temperature == 0.5 + assert after.top_p == 0.9 + + def test_persisted_file_unchanged(self): + with tempfile.TemporaryDirectory() as tmpdir: + manager = ModelSettingsManager(Path(tmpdir)) + manager.set_settings("m", ModelSettings(temperature=0.5)) + settings_path = Path(tmpdir) / "model_settings.json" + before = settings_path.read_text() + + with manager.ephemeral_overrides( + "m", {"temperature": 0.1, "top_p": 0.7} + ): + pass + + assert settings_path.read_text() == before + + def test_overrides_apply_to_model_with_no_persisted_settings(self): + """Engine-init flag overrides should work even for fresh models.""" + with tempfile.TemporaryDirectory() as tmpdir: + manager = ModelSettingsManager(Path(tmpdir)) + + with manager.ephemeral_overrides( + "fresh", {"turboquant_kv_enabled": True, "turboquant_kv_bits": 4} + ): + s = manager.get_settings("fresh") + assert s.turboquant_kv_enabled is True + assert s.turboquant_kv_bits == 4 + + # And the manager has no persisted state for this model. 
+ assert manager.get_settings("fresh").turboquant_kv_enabled is False + + def test_unknown_keys_dropped(self, caplog): + with tempfile.TemporaryDirectory() as tmpdir: + manager = ModelSettingsManager(Path(tmpdir)) + manager.set_settings("m", ModelSettings(temperature=0.5)) + + with caplog.at_level("WARNING"): + with manager.ephemeral_overrides( + "m", {"temperature": 0.1, "totally_made_up_key": 42} + ): + s = manager.get_settings("m") + assert s.temperature == 0.1 + assert not hasattr(s, "totally_made_up_key") + + assert any("totally_made_up_key" in r.message for r in caplog.records) + + def test_none_value_defers_to_lower_layer(self): + with tempfile.TemporaryDirectory() as tmpdir: + manager = ModelSettingsManager(Path(tmpdir)) + manager.set_settings("m", ModelSettings(temperature=0.5)) + + with manager.ephemeral_overrides("m", {"temperature": None}): + # None means "don't override this field" — persisted wins. + assert manager.get_settings("m").temperature == 0.5 + + def test_nested_overrides_inner_wins_then_outer_restored(self): + with tempfile.TemporaryDirectory() as tmpdir: + manager = ModelSettingsManager(Path(tmpdir)) + manager.set_settings("m", ModelSettings(temperature=0.5)) + + with manager.ephemeral_overrides("m", {"temperature": 0.2}): + assert manager.get_settings("m").temperature == 0.2 + with manager.ephemeral_overrides("m", {"temperature": 0.1}): + assert manager.get_settings("m").temperature == 0.1 + # After inner exits, outer override is back in effect. + assert manager.get_settings("m").temperature == 0.2 + # After outer exits, persisted wins. 
+ assert manager.get_settings("m").temperature == 0.5 + + def test_out_of_order_exit_uses_token(self): + """Two overlapping overrides exited out of LIFO order shouldn't corrupt state.""" + with tempfile.TemporaryDirectory() as tmpdir: + manager = ModelSettingsManager(Path(tmpdir)) + manager.set_settings("m", ModelSettings(temperature=0.5)) + + outer = manager.ephemeral_overrides("m", {"temperature": 0.2}) + inner = manager.ephemeral_overrides("m", {"temperature": 0.1}) + outer.__enter__() + inner.__enter__() + assert manager.get_settings("m").temperature == 0.1 + + # Exit outer first (out of LIFO order). + outer.__exit__(None, None, None) + # Inner is still active, so its temperature wins. + assert manager.get_settings("m").temperature == 0.1 + + inner.__exit__(None, None, None) + assert manager.get_settings("m").temperature == 0.5 + + def test_overrides_released_on_exception(self): + with tempfile.TemporaryDirectory() as tmpdir: + manager = ModelSettingsManager(Path(tmpdir)) + manager.set_settings("m", ModelSettings(temperature=0.5)) + + with pytest.raises(RuntimeError): + with manager.ephemeral_overrides("m", {"temperature": 0.1}): + assert manager.get_settings("m").temperature == 0.1 + raise RuntimeError("boom") + + assert manager.get_settings("m").temperature == 0.5 + # Internal stack is empty. + assert "m" not in manager._overrides + + def test_overrides_isolated_per_model(self): + with tempfile.TemporaryDirectory() as tmpdir: + manager = ModelSettingsManager(Path(tmpdir)) + manager.set_settings("a", ModelSettings(temperature=0.5)) + manager.set_settings("b", ModelSettings(temperature=0.7)) + + with manager.ephemeral_overrides("a", {"temperature": 0.1}): + assert manager.get_settings("a").temperature == 0.1 + # Overrides for "a" don't leak into "b". 
+ assert manager.get_settings("b").temperature == 0.7 + + def test_engine_init_flag_overrides(self): + """TurboQuant/DFlash/MTP overrides should compose just like sampling.""" + with tempfile.TemporaryDirectory() as tmpdir: + manager = ModelSettingsManager(Path(tmpdir)) + manager.set_settings("m", ModelSettings(turboquant_kv_enabled=False)) + + with manager.ephemeral_overrides( + "m", + { + "turboquant_kv_enabled": True, + "turboquant_kv_bits": 3, + "dflash_enabled": True, + }, + ): + s = manager.get_settings("m") + assert s.turboquant_kv_enabled is True + assert s.turboquant_kv_bits == 3 + assert s.dflash_enabled is True + + after = manager.get_settings("m") + assert after.turboquant_kv_enabled is False + assert after.dflash_enabled is False + + def test_override_respects_mutual_exclusion_constraints(self): + """ModelSettings rejects mtp_enabled=True with dflash_enabled=True. + + get_settings runs the merged dict through ModelSettings.from_dict, + which triggers __post_init__ validation. An override that creates + an invalid combination should surface as an exception from + get_settings — not silently succeed with a corrupted state. + """ + with tempfile.TemporaryDirectory() as tmpdir: + manager = ModelSettingsManager(Path(tmpdir)) + manager.set_settings("m", ModelSettings(mtp_enabled=True)) + + # Override turns on dflash_enabled while persisted has + # mtp_enabled=True — invalid combo per ModelSettings.__post_init__. + with manager.ephemeral_overrides("m", {"dflash_enabled": True}): + with pytest.raises(Exception): + manager.get_settings("m") + + # Override is still released after the exception path. 
+ assert "m" not in manager._overrides + + def test_thread_safe_concurrent_overrides(self): + """Concurrent overrides on different models don't corrupt each other.""" + import threading + + with tempfile.TemporaryDirectory() as tmpdir: + manager = ModelSettingsManager(Path(tmpdir)) + errors: list[Exception] = [] + + def worker(model_id: str, target_temp: float) -> None: + try: + for _ in range(20): + with manager.ephemeral_overrides( + model_id, {"temperature": target_temp} + ): + assert ( + manager.get_settings(model_id).temperature + == target_temp + ) + except Exception as e: + errors.append(e) + + threads = [ + threading.Thread(target=worker, args=(f"m{i}", i / 10)) + for i in range(8) + ] + for t in threads: + t.start() + for t in threads: + t.join() + + assert errors == [] + # All override stacks released. + assert manager._overrides == {}