jundot · blightbow · May 11, 2026 · May 13, 2026 · May 8, 2026 · May 8, 2026
diff --git a/omlx/admin/routes.py b/omlx/admin/routes.py
@@ -3483,6 +3483,13 @@ def _build_runtime_cache_observability(
         }
 
     cache_dir = global_settings.cache.get_ssd_cache_dir(global_settings.base_path)
+    cache_cfg = global_settings.cache
+    try:
+        cfg_disk_max = cache_cfg.get_ssd_cache_max_size_bytes(global_settings.base_path)
+    except (ValueError, OSError, TypeError) as exc:
+        logger.warning("Could not read SSD cache max size from config: %s", exc)
+        cfg_disk_max = 0
+
     payload = {
         "base_path": str(global_settings.base_path),
         "ssd_cache_dir": str(cache_dir),
@@ -3491,6 +3498,16 @@ def _build_runtime_cache_observability(
         "total_num_files": 0,
         "total_size_bytes": 0,
         "effective_block_sizes": [],
+        "disk_max_bytes": cfg_disk_max,
+        "hot_cache_max_bytes": 0,
+        "hot_cache_size_bytes": 0,
+        "hot_cache_entries": 0,
+        # MRU partial cache feature gate.  Per-model occupancy lives on each
+        # models[] entry; there is no payload-level entries aggregate
+        # because MRU tail slots are per-model, not a shared budget — only
+        # this max-entries sum is kept, purely so the dashboard can tell
+        # whether the feature is configured for any loaded model.
+        "mru_partial_max_entries": 0,
     }
 
     engine_pool = _get_engine_pool()
@@ -3602,11 +3619,28 @@ def _build_runtime_cache_observability(
             "last_tokens_to_next_block": last_tokens_to_next_block,
             "num_files": int(ssd_stats.get("num_files", 0) or 0),
             "total_size_bytes": int(ssd_stats.get("total_size_bytes", 0) or 0),
+            "max_size_bytes": int(ssd_stats.get("max_size_bytes", 0) or 0),
             "hot_cache_max_bytes": int(ssd_stats.get("hot_cache_max_bytes", 0) or 0),
             "hot_cache_size_bytes": int(ssd_stats.get("hot_cache_size_bytes", 0) or 0),
             "hot_cache_entries": int(ssd_stats.get("hot_cache_entries", 0) or 0),
+            "mru_partial_entries": int(
+                prefix_stats.get("mru_partial_entries", 0) or 0
+            ),
+            "mru_partial_max_entries": int(
+                prefix_stats.get("mru_partial_max_entries", 0) or 0
+            ),
+            # Tri-state: None (unknown / no inference yet), True (eligible),
+            # False (model uses non-sliceable cache layers — every stash
+            # refused at the safety gate; dashboard renders 'N/A (see log)').
+            "mru_partial_supported": prefix_stats.get(
+                "mru_partial_supported", None
+            ),
         }
 
+        cache_rates = runtime_stats.get("cache_rates")
+        if cache_rates:
+            model_payload["cache_rates"] = cache_rates
+
         payload["models"].append(model_payload)
         payload["total_num_files"] += model_payload["num_files"]
         payload["total_size_bytes"] += model_payload["total_size_bytes"]
@@ -3616,6 +3650,33 @@ def _build_runtime_cache_observability(
 
     payload["effective_block_sizes"] = sorted(block_sizes)
 
+    # Aggregate hot-cache and disk-max across models.
+    # hot_cache_max sums across models (each model reserves its own slice of
+    # the same process-wide hot cache budget) so the gauge denominator matches
+    # the summed numerator.  disk_max keeps the config fallback via max()
+    # because a single SSD cache directory is shared — the effective cap is
+    # the largest configured limit, not a per-model sum.
+    hot_cache_max = 0
+    disk_max = payload["disk_max_bytes"]
+    hot_cache_size_total = 0
+    hot_cache_entries_total = 0
+    mru_max_entries_total = 0
+    for m in payload["models"]:
+        hot_cache_size_total += m.get("hot_cache_size_bytes", 0)
+        hot_cache_entries_total += m.get("hot_cache_entries", 0)
+        hot_cache_max += m.get("hot_cache_max_bytes", 0)
+        disk_max = max(disk_max, m.get("max_size_bytes", 0))
+        # MRU: only the max-entries sum is kept, and only as a feature-on
+        # gate for the dashboard.  Per-model occupancy is on each models[]
+        # entry; an aggregate live count would be meaningless because the
+        # slots are per-model, not a shared budget.
+        mru_max_entries_total += m.get("mru_partial_max_entries", 0)
+    payload["hot_cache_max_bytes"] = hot_cache_max
+    payload["hot_cache_size_bytes"] = hot_cache_size_total
+    payload["hot_cache_entries"] = hot_cache_entries_total
+    payload["disk_max_bytes"] = disk_max
+    payload["mru_partial_max_entries"] = mru_max_entries_total
+
     # Fallback: if no loaded models contributed stats, scan the cache
     # directory directly so the dashboard still shows real disk usage.
     if payload["total_num_files"] == 0 and cache_dir.exists():
@@ -3870,6 +3931,30 @@ async def clear_alltime_stats(is_admin: bool = Depends(require_admin)):
     return {"status": "ok"}
 
 
+def _iter_loaded_schedulers():
+    """Yield ``(model_id, scheduler)`` for each loaded model in the engine pool.
+
+    Walks pool entry → ``entry.engine._engine`` → ``.engine`` → ``.scheduler``;
+    a model missing any link is skipped, and nothing is yielded when there is
+    no engine pool.  Shared by ``clear_ssd_cache`` and ``clear_hot_cache``.
+    """
+    engine_pool = _get_engine_pool()
+    if engine_pool is None:
+        return
+    for model_info in engine_pool.get_status().get("models", []):
+        model_id = model_info.get("id")
+        if not model_id or not model_info.get("loaded"):
+            continue
+        entry = engine_pool._entries.get(model_id)
+        if entry is None or entry.engine is None:
+            continue
+        async_core = getattr(entry.engine, "_engine", None)
+        core = getattr(async_core, "engine", None) if async_core is not None else None
+        scheduler = getattr(core, "scheduler", None) if core is not None else None
+        if scheduler is not None:
+            yield model_id, scheduler
+
+
 @router.post("/api/ssd-cache/clear")
 async def clear_ssd_cache(is_admin: bool = Depends(require_admin)):
     """Clear all SSD cache files for all loaded models.
@@ -3880,38 +3965,33 @@ async def clear_ssd_cache(is_admin: bool = Depends(require_admin)):
     """
     total_deleted = 0
 
-    # Phase 1: clear via loaded models' cache managers (updates in-memory index)
-    engine_pool = _get_engine_pool()
-    if engine_pool is not None:
-        for model_info in engine_pool.get_status().get("models", []):
-            model_id = model_info.get("id")
-            if not model_id or not model_info.get("loaded"):
-                continue
-
-            entry = engine_pool._entries.get(model_id)
-            if entry is None or entry.engine is None:
-                continue
-
-            async_core = getattr(entry.engine, "_engine", None)
-            core = (
-                getattr(async_core, "engine", None) if async_core is not None else None
-            )
-            scheduler = (
-                getattr(core, "scheduler", None) if core is not None else None
-            )
+    for model_id, scheduler in _iter_loaded_schedulers():
+        ssd_manager = getattr(scheduler, "paged_ssd_cache_manager", None)
+        if ssd_manager is not None:
+            try:
+                total_deleted += ssd_manager.clear()
+            except Exception as exc:
+                logger.warning(
+                    "Failed to clear SSD cache for model '%s': %s",
+                    model_id,
+                    exc,
+                )
 
-            if scheduler is not None:
-                ssd_manager = getattr(scheduler, "paged_ssd_cache_manager", None)
-                if ssd_manager is not None:
-                    try:
-                        deleted = ssd_manager.clear()
-                        total_deleted += deleted
-                    except Exception as exc:
-                        logger.warning(
-                            "Failed to clear SSD cache for model '%s': %s",
-                            model_id,
-                            exc,
-                        )
+        # MRU partials chain from paged-block hashes whose KV bytes are
+        # gone after the ssd_manager.clear() above.  Drop them too so the
+        # admin "clear all warm caches" intent is honoured symmetrically;
+        # clearing only the SSD tier would leave stashes that outlive the
+        # blocks they were built on.
+        block_aware_cache = getattr(scheduler, "block_aware_cache", None)
+        if block_aware_cache is not None:
+            try:
+                block_aware_cache.clear_mru_partials()
+            except Exception as exc:
+                logger.warning(
+                    "Failed to clear MRU partials for model '%s': %s",
+                    model_id,
+                    exc,
+                )
 
     # Phase 2: remove any remaining files on disk (covers unloaded models)
     global_settings = _get_global_settings()
@@ -3937,6 +4017,45 @@ async def clear_ssd_cache(is_admin: bool = Depends(require_admin)):
     return {"status": "ok", "total_deleted": total_deleted}
 
 
+@router.post("/api/hot-cache/clear")
+async def clear_hot_cache(is_admin: bool = Depends(require_admin)):
+    """Clear the in-memory (hot) cache for all loaded models.
+
+    Each step is best-effort per model: a failure is logged as a warning and
+    the remaining models are still processed.  No filesystem fallback is
+    needed — hot cache is in-memory only and does not survive process restart.
+
+    Returns:
+        ``{"status": "ok", "total_cleared": n}`` where ``n`` sums the values
+        returned by each manager's ``clear_hot_cache()``.
+    """
+    total_cleared = 0
+    for model_id, scheduler in _iter_loaded_schedulers():
+        ssd_manager = getattr(scheduler, "paged_ssd_cache_manager", None)
+        if ssd_manager is not None and hasattr(ssd_manager, "clear_hot_cache"):
+            try:
+                total_cleared += ssd_manager.clear_hot_cache()
+            except Exception as exc:
+                logger.warning(
+                    "Failed to clear hot cache for model '%s': %s",
+                    model_id,
+                    exc,
+                )
+        rate_tracker = getattr(scheduler, "_cache_rate_tracker", None)
+        if rate_tracker is not None:
+            # Guarded like the hot-cache clear above so one model's tracker
+            # cannot abort the loop and turn a partial clear into a 500.
+            try:
+                rate_tracker.clear()
+            except Exception as exc:
+                logger.warning(
+                    "Failed to reset cache rate tracker for model '%s': %s",
+                    model_id,
+                    exc,
+                )
+    return {"status": "ok", "total_cleared": total_cleared}
+
+
 @router.post("/api/cache/probe")
 async def probe_cache(
     request: CacheProbeRequest,

diff --git a/omlx/admin/static/css/dashboard.css b/omlx/admin/static/css/dashboard.css
@@ -63,6 +63,10 @@
     [data-theme="dark"] .hover\:text-neutral-700:hover { color: var(--text-primary) !important; }
     [data-theme="dark"] .hover\:text-neutral-600:hover { color: var(--text-secondary) !important; }
 
+    /* === Gauge track (visible in both themes) === */
+    .gauge-track { background-color: #e5e5e5; }
+    [data-theme="dark"] .gauge-track { background-color: #3f3f46 !important; }
+
     /* === Active nav tab (bg-white with shadow inside dark nav) === */
     [data-theme="dark"] .shadow-sm { box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.3) !important; }
 

diff --git a/omlx/admin/static/js/dashboard.js b/omlx/admin/static/js/dashboard.js
@@ -160,6 +160,10 @@
                     total_num_files: 0,
                     total_size_bytes: 0,
                     effective_block_sizes: [],
+                    hot_cache_size_bytes: 0,
+                    hot_cache_entries: 0,
+                    hot_cache_max_bytes: 0,
+                    disk_max_bytes: 0,
                 },
             },
             alltimeStats: {
@@ -190,6 +194,7 @@
             showClearStatsConfirm: false,
             showClearAlltimeConfirm: false,
             showClearSsdCacheConfirm: false,
+            showClearHotCacheConfirm: false,
             _statsRefreshTimer: null,
 
             // Log viewer state
@@ -2149,7 +2154,8 @@
 
             async clearSsdCache() {
                 try {
-                    await fetch('/admin/api/ssd-cache/clear', { method: 'POST' });
+                    const resp = await fetch('/admin/api/ssd-cache/clear', { method: 'POST' });
+                    if (!resp.ok) console.error('SSD cache clear failed:', resp.status);
                     this.showClearSsdCacheConfirm = false;
                     await this.loadStats();
                 } catch (err) {
@@ -2158,6 +2164,18 @@
                 }
             },
 
+            async clearHotCache() {  // POST the hot-cache clear endpoint, then reload stats
+                try {
+                    const resp = await fetch('/admin/api/hot-cache/clear', { method: 'POST' });
+                    if (!resp.ok) console.error('Hot cache clear failed:', resp.status);  // non-2xx is logged only; the flow continues
+                    this.showClearHotCacheConfirm = false;  // reset the confirm flag before the reload
+                    await this.loadStats();
+                } catch (err) {  // reached when fetch() or loadStats() rejects
+                    console.error('Failed to clear hot cache:', err);
+                    this.showClearHotCacheConfirm = false;  // reset the confirm flag on failure as well
+                }
+            },
+
             startStatsRefresh() {
                 this.stopStatsRefresh();
                 this._statsRefreshTimer = setInterval(() => {
@@ -2178,6 +2196,39 @@
                 return num.toLocaleString();
             },
 
+            cacheObsCumulative(stats, selectedModel) {
+                const entries = stats.runtime_cache?.models || [];
+                if (entries.length === 0) return {};
+                // A selected model returns its reported cumulative object unchanged ({} if absent).
+                if (selectedModel) {
+                    const entry = entries.find(m => m.id === selectedModel);
+                    return entry?.cache_rates?.cumulative || {};
+                }
+                // No selection: sum the counters listed in sumKeys across all model entries.
+                const sumKeys = ['prefix_hits', 'prefix_misses', 'evictions', 'ssd_hot_hits', 'ssd_disk_loads', 'ssd_saves', 'hot_cache_evictions', 'hot_cache_promotions', 'mru_partial_stashes', 'mru_partial_hits', 'mru_partial_evictions', 'mru_partial_tokens_saved'];
+                let agg = {};
+
+                for (const m of entries) {
+                    const c = m.cache_rates?.cumulative;
+                    if (!c || Object.keys(c).length === 0) continue;  // entry has no counters yet
+                    for (const k of sumKeys) {
+                        agg[k] = (agg[k] || 0) + (c[k] || 0);
+                    }
+                }
+                // Rates are recomputed from the summed counters; each is 0 when its denominator is 0.
+                const ph = agg.prefix_hits || 0;
+                const pm = agg.prefix_misses || 0;
+                const sh = agg.ssd_hot_hits || 0;
+                const sd = agg.ssd_disk_loads || 0;
+                const ms = agg.mru_partial_stashes || 0;
+                const mh = agg.mru_partial_hits || 0;
+                agg.prefix_hit_rate = (ph + pm) > 0 ? ph / (ph + pm) : 0;
+                agg.ssd_hot_rate = (sh + sd) > 0 ? sh / (sh + sd) : 0;
+                agg.mru_partial_hit_rate = ms > 0 ? mh / ms : 0;  // hits per stash, not hits per lookup
+
+                return agg;
+            },
+
             getStatFontClass(value) {
                 if (value >= 1000000000) return 'text-2xl';
                 if (value >= 1000000) return 'text-3xl';
@@ -2239,6 +2290,38 @@
                 return 'bg-red-400';
             },
 
+            get runtimeHotCachePercent() {  // hot-cache size as a percentage of its max, capped at 100
+                const rc = this.stats.runtime_cache;
+                if (!rc || !rc.hot_cache_max_bytes) return 0;  // no max reported: avoid dividing by zero
+                return Math.min(100, (rc.hot_cache_size_bytes / rc.hot_cache_max_bytes) * 100);
+            },
+
+            // mruEnabled is a feature-on gate (drives the rate strip and the
+            // per-model MRU Tails column).  It reads the payload-level
+            // mru_partial_max_entries purely as "configured for any loaded
+            // model" — there is deliberately no aggregate MRU-tails gauge,
+            // since the slots are per-model, not a shared budget.
+            get mruEnabled() {
+                return (this.stats.runtime_cache?.mru_partial_max_entries || 0) > 0;
+            },
+
+            get hotCacheEnabled() {  // true when the payload reports a non-zero hot-cache max
+                return (this.stats.runtime_cache?.hot_cache_max_bytes || 0) > 0;
+            },
+
+            get cacheRatesGridCols() {  // grid column classes chosen from which of the two gates are on
+                const both = this.hotCacheEnabled && this.mruEnabled;
+                if (both) return 'grid-cols-2 sm:grid-cols-6';
+                if (this.hotCacheEnabled || this.mruEnabled) return 'grid-cols-2 sm:grid-cols-4';
+                return 'grid-cols-2';
+            },
+
+            get runtimeSsdCachePercent() {  // total cache size as a percentage of disk_max_bytes, capped at 100
+                const rc = this.stats.runtime_cache;
+                if (!rc || !rc.disk_max_bytes) return 0;  // no max reported: avoid dividing by zero
+                return Math.min(100, (rc.total_size_bytes / rc.disk_max_bytes) * 100);
+            },
+
             get activeModelsMemoryPercent() {
                 const am = this.stats.active_models;
                 if (!am || !am.model_memory_max) return 0;