diff --git a/omlx/admin/routes.py b/omlx/admin/routes.py
index 06ceb808b..16a51a4cb 100644
--- a/omlx/admin/routes.py
+++ b/omlx/admin/routes.py
@@ -3483,6 +3483,13 @@ def _build_runtime_cache_observability(
}
cache_dir = global_settings.cache.get_ssd_cache_dir(global_settings.base_path)
+ cache_cfg = global_settings.cache
+ try:
+ cfg_disk_max = cache_cfg.get_ssd_cache_max_size_bytes(global_settings.base_path)
+ except (ValueError, OSError, TypeError) as exc:
+ logger.warning("Could not read SSD cache max size from config: %s", exc)
+ cfg_disk_max = 0
+
payload = {
"base_path": str(global_settings.base_path),
"ssd_cache_dir": str(cache_dir),
@@ -3491,6 +3498,10 @@ def _build_runtime_cache_observability(
"total_num_files": 0,
"total_size_bytes": 0,
"effective_block_sizes": [],
+ "disk_max_bytes": cfg_disk_max,
+ "hot_cache_max_bytes": 0,
+ "hot_cache_size_bytes": 0,
+ "hot_cache_entries": 0,
}
engine_pool = _get_engine_pool()
@@ -3602,11 +3613,16 @@ def _build_runtime_cache_observability(
"last_tokens_to_next_block": last_tokens_to_next_block,
"num_files": int(ssd_stats.get("num_files", 0) or 0),
"total_size_bytes": int(ssd_stats.get("total_size_bytes", 0) or 0),
+ "max_size_bytes": int(ssd_stats.get("max_size_bytes", 0) or 0),
"hot_cache_max_bytes": int(ssd_stats.get("hot_cache_max_bytes", 0) or 0),
"hot_cache_size_bytes": int(ssd_stats.get("hot_cache_size_bytes", 0) or 0),
"hot_cache_entries": int(ssd_stats.get("hot_cache_entries", 0) or 0),
}
+ cache_rates = runtime_stats.get("cache_rates")
+ if cache_rates:
+ model_payload["cache_rates"] = cache_rates
+
payload["models"].append(model_payload)
payload["total_num_files"] += model_payload["num_files"]
payload["total_size_bytes"] += model_payload["total_size_bytes"]
@@ -3616,6 +3632,26 @@ def _build_runtime_cache_observability(
payload["effective_block_sizes"] = sorted(block_sizes)
+ # Aggregate hot-cache and disk-max across models.
+ # hot_cache_max sums across models (each model reserves its own slice of
+ # the same process-wide hot cache budget) so the gauge denominator matches
+ # the summed numerator. disk_max keeps the config fallback via max()
+ # because a single SSD cache directory is shared — the effective cap is
+ # the largest configured limit, not a per-model sum.
+ hot_cache_max = 0
+ disk_max = payload["disk_max_bytes"]
+ hot_cache_size_total = 0
+ hot_cache_entries_total = 0
+ for m in payload["models"]:
+ hot_cache_size_total += m.get("hot_cache_size_bytes", 0)
+ hot_cache_entries_total += m.get("hot_cache_entries", 0)
+ hot_cache_max += m.get("hot_cache_max_bytes", 0)
+ disk_max = max(disk_max, m.get("max_size_bytes", 0))
+ payload["hot_cache_max_bytes"] = hot_cache_max
+ payload["hot_cache_size_bytes"] = hot_cache_size_total
+ payload["hot_cache_entries"] = hot_cache_entries_total
+ payload["disk_max_bytes"] = disk_max
+
# Fallback: if no loaded models contributed stats, scan the cache
# directory directly so the dashboard still shows real disk usage.
if payload["total_num_files"] == 0 and cache_dir.exists():
@@ -3870,6 +3906,30 @@ async def clear_alltime_stats(is_admin: bool = Depends(require_admin)):
return {"status": "ok"}
+def _iter_loaded_schedulers():
+    """Yield ``(model_id, scheduler)`` pairs for every loaded model.
+
+    Walks pool entry → async engine (``_engine``) → core engine
+    (``engine``) → ``scheduler``. A model is skipped silently when it is
+    not reported as loaded, has no pool entry or engine, or is missing any
+    link in that chain. Nothing is yielded when there is no engine pool.
+    """
+    pool = _get_engine_pool()
+    if pool is None:
+        return
+    for info in pool.get_status().get("models", []):
+        model_id = info.get("id")
+        if not (model_id and info.get("loaded")):
+            continue
+        entry = pool._entries.get(model_id)
+        if entry is None or entry.engine is None:
+            continue
+        # Follow the attribute chain one hop at a time; a missing or None
+        # link anywhere means this model has no reachable scheduler.
+        node = entry.engine
+        for attr in ("_engine", "engine", "scheduler"):
+            node = getattr(node, attr, None)
+            if node is None:
+                break
+        if node is not None:
+            yield model_id, node
+
+
@router.post("/api/ssd-cache/clear")
async def clear_ssd_cache(is_admin: bool = Depends(require_admin)):
"""Clear all SSD cache files for all loaded models.
@@ -3880,38 +3940,17 @@ async def clear_ssd_cache(is_admin: bool = Depends(require_admin)):
"""
total_deleted = 0
- # Phase 1: clear via loaded models' cache managers (updates in-memory index)
- engine_pool = _get_engine_pool()
- if engine_pool is not None:
- for model_info in engine_pool.get_status().get("models", []):
- model_id = model_info.get("id")
- if not model_id or not model_info.get("loaded"):
- continue
-
- entry = engine_pool._entries.get(model_id)
- if entry is None or entry.engine is None:
- continue
-
- async_core = getattr(entry.engine, "_engine", None)
- core = (
- getattr(async_core, "engine", None) if async_core is not None else None
- )
- scheduler = (
- getattr(core, "scheduler", None) if core is not None else None
- )
-
- if scheduler is not None:
- ssd_manager = getattr(scheduler, "paged_ssd_cache_manager", None)
- if ssd_manager is not None:
- try:
- deleted = ssd_manager.clear()
- total_deleted += deleted
- except Exception as exc:
- logger.warning(
- "Failed to clear SSD cache for model '%s': %s",
- model_id,
- exc,
- )
+ for model_id, scheduler in _iter_loaded_schedulers():
+ ssd_manager = getattr(scheduler, "paged_ssd_cache_manager", None)
+ if ssd_manager is not None:
+ try:
+ total_deleted += ssd_manager.clear()
+ except Exception as exc:
+ logger.warning(
+ "Failed to clear SSD cache for model '%s': %s",
+ model_id,
+ exc,
+ )
# Phase 2: remove any remaining files on disk (covers unloaded models)
global_settings = _get_global_settings()
@@ -3937,6 +3976,31 @@ async def clear_ssd_cache(is_admin: bool = Depends(require_admin)):
return {"status": "ok", "total_deleted": total_deleted}
+@router.post("/api/hot-cache/clear")
+async def clear_hot_cache(is_admin: bool = Depends(require_admin)):
+ """Clear the in-memory (hot) cache for all loaded models.
+
+ No filesystem fallback needed — hot cache is in-memory only and does
+ not survive process restart.
+
+ A failure while clearing one model's hot cache is logged as a warning
+ and does not abort the request. A scheduler's ``_cache_rate_tracker``,
+ when present, is cleared as well.
+
+ Returns:
+ A dict with ``status`` set to ``"ok"`` and ``total_cleared`` set to
+ the sum of the values returned by each manager's
+ ``clear_hot_cache()``.
+ """
+ total_cleared = 0
+ for model_id, scheduler in _iter_loaded_schedulers():
+ ssd_manager = getattr(scheduler, "paged_ssd_cache_manager", None)
+ # hasattr guard: managers without a clear_hot_cache method are skipped.
+ if ssd_manager is not None and hasattr(ssd_manager, "clear_hot_cache"):
+ try:
+ total_cleared += ssd_manager.clear_hot_cache()
+ except Exception as exc:
+ # Best effort: log and keep going with the remaining models.
+ logger.warning(
+ "Failed to clear hot cache for model '%s': %s",
+ model_id,
+ exc,
+ )
+ # Drop the rate tracker's stored snapshots for this scheduler.
+ rate_tracker = getattr(scheduler, "_cache_rate_tracker", None)
+ if rate_tracker is not None:
+ rate_tracker.clear()
+ return {"status": "ok", "total_cleared": total_cleared}
+
+
@router.post("/api/cache/probe")
async def probe_cache(
request: CacheProbeRequest,
diff --git a/omlx/admin/static/css/dashboard.css b/omlx/admin/static/css/dashboard.css
index a95382df8..6af38022b 100644
--- a/omlx/admin/static/css/dashboard.css
+++ b/omlx/admin/static/css/dashboard.css
@@ -63,6 +63,10 @@
[data-theme="dark"] .hover\:text-neutral-700:hover { color: var(--text-primary) !important; }
[data-theme="dark"] .hover\:text-neutral-600:hover { color: var(--text-secondary) !important; }
+ /* === Gauge track (visible in both themes) ===
+    The base rule sets the light-theme colour; the dark override carries
+    !important like the other [data-theme="dark"] rules in this file. */
+ .gauge-track { background-color: #e5e5e5; }
+ [data-theme="dark"] .gauge-track { background-color: #3f3f46 !important; }
+
/* === Active nav tab (bg-white with shadow inside dark nav) === */
[data-theme="dark"] .shadow-sm { box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.3) !important; }
diff --git a/omlx/admin/static/js/dashboard.js b/omlx/admin/static/js/dashboard.js
index ec8ddbf90..d5eca0eba 100644
--- a/omlx/admin/static/js/dashboard.js
+++ b/omlx/admin/static/js/dashboard.js
@@ -160,6 +160,10 @@
total_num_files: 0,
total_size_bytes: 0,
effective_block_sizes: [],
+ hot_cache_size_bytes: 0,
+ hot_cache_entries: 0,
+ hot_cache_max_bytes: 0,
+ disk_max_bytes: 0,
},
},
alltimeStats: {
@@ -190,6 +194,7 @@
showClearStatsConfirm: false,
showClearAlltimeConfirm: false,
showClearSsdCacheConfirm: false,
+ showClearHotCacheConfirm: false,
_statsRefreshTimer: null,
// Log viewer state
@@ -2149,7 +2154,8 @@
async clearSsdCache() {
try {
- await fetch('/admin/api/ssd-cache/clear', { method: 'POST' });
+ const resp = await fetch('/admin/api/ssd-cache/clear', { method: 'POST' });
+ if (!resp.ok) console.error('SSD cache clear failed:', resp.status);
this.showClearSsdCacheConfirm = false;
await this.loadStats();
} catch (err) {
@@ -2158,6 +2164,18 @@
}
},
+        /**
+         * POST to the hot-cache clear endpoint, close the confirmation
+         * dialog, then reload stats. A non-OK response is only logged; a
+         * thrown error is logged and the dialog is still closed.
+         */
+        async clearHotCache() {
+            try {
+                const response = await fetch('/admin/api/hot-cache/clear', { method: 'POST' });
+                if (!response.ok) {
+                    console.error('Hot cache clear failed:', response.status);
+                }
+                this.showClearHotCacheConfirm = false;
+                await this.loadStats();
+            } catch (err) {
+                console.error('Failed to clear hot cache:', err);
+                this.showClearHotCacheConfirm = false;
+            }
+        },
+
startStatsRefresh() {
this.stopStatsRefresh();
this._statsRefreshTimer = setInterval(() => {
@@ -2178,6 +2196,36 @@
return num.toLocaleString();
},
+        /**
+         * Return cumulative cache counters for the observability panel.
+         *
+         * With `selectedModel` set, returns that model's
+         * `cache_rates.cumulative` object (or `{}` when absent). Otherwise
+         * sums the listed counters over every model that reports a
+         * non-empty cumulative object and recomputes the two hit rates
+         * from the summed counts.
+         */
+        cacheObsCumulative(stats, selectedModel) {
+            const models = stats.runtime_cache?.models || [];
+            if (models.length === 0) return {};
+
+            if (selectedModel) {
+                const match = models.find(m => m.id === selectedModel);
+                return match?.cache_rates?.cumulative || {};
+            }
+
+            const summedKeys = ['prefix_hits', 'prefix_misses', 'evictions', 'ssd_hot_hits', 'ssd_disk_loads', 'ssd_saves', 'hot_cache_evictions', 'hot_cache_promotions'];
+            const totals = {};
+            const reported = models
+                .map(m => m.cache_rates?.cumulative)
+                .filter(c => c && Object.keys(c).length > 0);
+
+            for (const counters of reported) {
+                for (const key of summedKeys) {
+                    totals[key] = (totals[key] || 0) + (counters[key] || 0);
+                }
+            }
+
+            // Rates come from the summed counts, not from averaging per-model rates.
+            const share = (part, other) => ((part + other) > 0 ? part / (part + other) : 0);
+            totals.prefix_hit_rate = share(totals.prefix_hits || 0, totals.prefix_misses || 0);
+            totals.ssd_hot_rate = share(totals.ssd_hot_hits || 0, totals.ssd_disk_loads || 0);
+
+            return totals;
+        },
+
getStatFontClass(value) {
if (value >= 1000000000) return 'text-2xl';
if (value >= 1000000) return 'text-3xl';
@@ -2239,6 +2287,18 @@
return 'bg-red-400';
},
+        /** Hot-cache fill level in percent, capped at 100; 0 when no limit is reported. */
+        get runtimeHotCachePercent() {
+            const cache = this.stats.runtime_cache;
+            if (!cache || !cache.hot_cache_max_bytes) {
+                return 0;
+            }
+            const used = cache.hot_cache_size_bytes / cache.hot_cache_max_bytes;
+            return Math.min(100, used * 100);
+        },
+
+        /** SSD cache fill level in percent, capped at 100; 0 when no disk limit is reported. */
+        get runtimeSsdCachePercent() {
+            const cache = this.stats.runtime_cache;
+            if (!cache || !cache.disk_max_bytes) {
+                return 0;
+            }
+            const used = cache.total_size_bytes / cache.disk_max_bytes;
+            return Math.min(100, used * 100);
+        },
+
get activeModelsMemoryPercent() {
const am = this.stats.active_models;
if (!am || !am.model_memory_max) return 0;
diff --git a/omlx/admin/templates/dashboard/_status.html b/omlx/admin/templates/dashboard/_status.html
index 1d6e04f40..865ba6c25 100644
--- a/omlx/admin/templates/dashboard/_status.html
+++ b/omlx/admin/templates/dashboard/_status.html
@@ -282,8 +282,47 @@
{{ t('status.head
Runtime Cache Observability
-
+
+
+
Memory
+
+
+
+
+
+ Clear memory cache?
+
+
+
+
+
+
+
|
+
+
+
+
+
@@ -336,8 +402,10 @@ {{ t('status.head
| Block Size |
Indexed Blocks |
Sub-block Cache |
- Cache Files |
- Cache Size |
+ SSD Files |
+ SSD Size |
+ Memory Entries |
+ Memory Size |
@@ -358,6 +426,8 @@ {{ t('status.head
|
|
+ |
+ |
diff --git a/omlx/cache/observability.py b/omlx/cache/observability.py
new file mode 100644
index 000000000..72a0e370c
--- /dev/null
+++ b/omlx/cache/observability.py
@@ -0,0 +1,149 @@
+# SPDX-License-Identifier: Apache-2.0
+import threading
+import time
+from collections import deque
+from typing import Any
+
+
+_DEFAULT_WINDOWS = (60, 300, 900)
+_MAX_SNAPSHOTS = 90
+_MIN_INTERVAL = 10.0
+
+
+class CacheRateTracker:
+    """Keep a bounded history of cache counter snapshots and derive rates.
+
+    Each snapshot is a ``(time.monotonic() timestamp, counters copy)`` pair
+    stored in a ``deque`` capped at ``max_snapshots`` entries. Every access
+    to the history happens while holding an internal lock.
+    """
+
+    def __init__(
+        self,
+        max_snapshots: int = _MAX_SNAPSHOTS,
+        min_interval: float = _MIN_INTERVAL,
+    ):
+        """Create an empty tracker.
+
+        Args:
+            max_snapshots: Maximum number of snapshots retained.
+            min_interval: Minimum spacing, in seconds, between two stored
+                snapshots.
+        """
+        self._lock = threading.Lock()
+        self._min_interval = min_interval
+        self._snapshots: deque[tuple[float, dict[str, int]]] = deque(
+            maxlen=max_snapshots
+        )
+
+    def maybe_snapshot(self, counters: dict[str, int]) -> bool:
+        """Store a copy of ``counters`` unless the last snapshot is too recent.
+
+        Returns:
+            True when a snapshot was stored, False when it was skipped
+            because less than ``min_interval`` seconds have passed since
+            the previous one.
+        """
+        with self._lock:
+            now = time.monotonic()
+            if self._snapshots:
+                last_ts = self._snapshots[-1][0]
+                if now - last_ts < self._min_interval:
+                    return False
+            self._snapshots.append((now, dict(counters)))
+            return True
+
+    def get_rates(
+        self, windows: tuple[int, ...] = _DEFAULT_WINDOWS
+    ) -> dict[str, Any]:
+        """Compute per-window and cumulative figures from stored snapshots.
+
+        Ages are measured from the newest snapshot, not from the current
+        time. For each window the baseline is the oldest stored snapshot
+        whose age does not exceed the window; a window whose baseline is
+        less than one second older than the newest snapshot maps to an
+        empty dict.
+
+        Returns:
+            ``{"windows": {label: figures}, "cumulative": figures}``; both
+            values are empty dicts when no snapshot has been stored.
+        """
+        with self._lock:
+            if not self._snapshots:
+                return {"windows": {}, "cumulative": {}}
+
+            latest_ts, latest = self._snapshots[-1]
+            oldest = self._snapshots[0]
+
+            per_window = {}
+            for span in windows:
+                # First match scanning oldest → newest; fall back to the
+                # oldest snapshot when nothing is inside the window.
+                base_ts, base = next(
+                    (
+                        snap
+                        for snap in self._snapshots
+                        if latest_ts - snap[0] <= span
+                    ),
+                    oldest,
+                )
+                elapsed = latest_ts - base_ts
+                per_window[_window_label(span)] = (
+                    _compute_window(base, latest, elapsed)
+                    if elapsed >= 1.0
+                    else {}
+                )
+
+            return {
+                "windows": per_window,
+                "cumulative": _compute_cumulative(latest),
+            }
+
+    def snapshot_and_get_rates(
+        self,
+        counters: dict[str, int],
+        windows: tuple[int, ...] = _DEFAULT_WINDOWS,
+    ) -> dict[str, Any]:
+        """Offer ``counters`` as a new snapshot, then return ``get_rates``."""
+        self.maybe_snapshot(counters)
+        return self.get_rates(windows)
+
+    def clear(self) -> None:
+        """Discard every stored snapshot."""
+        with self._lock:
+            self._snapshots.clear()
+
+
+def _window_label(seconds: int) -> str:
+    """Return the short display label for a window length in seconds.
+
+    Whole-minute windows are labelled in minutes (``60 -> "1m"``). Any
+    other length is labelled in seconds (``90 -> "90s"``), so two distinct
+    windows never share a label and overwrite each other in the
+    per-window rates mapping.
+    """
+    minutes, leftover = divmod(seconds, 60)
+    if minutes < 1 or leftover:
+        return f"{seconds}s"
+    return f"{minutes}m"
+
+
+def _safe_ratio(numerator: int, denominator: int) -> float:
+    """Return ``numerator / denominator``, or ``0.0`` for a zero denominator."""
+    return numerator / denominator if denominator != 0 else 0.0
+
+
+def _compute_window(
+    old: dict[str, int], new: dict[str, int], elapsed: float
+) -> dict[str, Any]:
+    """Summarise counter growth between two snapshots.
+
+    Args:
+        old: Baseline counters.
+        new: Newest counters.
+        elapsed: Seconds between the two snapshots.
+
+    Returns:
+        Counter deltas plus derived ratios. A counter missing from either
+        snapshot counts as 0, and a counter that went down yields a delta
+        of 0 rather than a negative value.
+    """
+    tracked = (
+        "prefix_hits",
+        "prefix_misses",
+        "evictions",
+        "ssd_hot_hits",
+        "ssd_disk_loads",
+        "prefix_tokens_matched",
+        "prefix_tokens_requested",
+    )
+    grew = {
+        name: max(0, new.get(name, 0) - old.get(name, 0)) for name in tracked
+    }
+
+    hits = grew["prefix_hits"]
+    misses = grew["prefix_misses"]
+    evicted = grew["evictions"]
+    hot_hits = grew["ssd_hot_hits"]
+    disk_loads = grew["ssd_disk_loads"]
+
+    span_minutes = elapsed / 60.0
+    if span_minutes > 0:
+        evictions_per_min = round(evicted / span_minutes, 2)
+    else:
+        evictions_per_min = 0.0
+
+    return {
+        "prefix_hit_rate": round(_safe_ratio(hits, hits + misses), 4),
+        "prefix_hits": hits,
+        "prefix_misses": misses,
+        "prefix_match_efficiency": round(
+            _safe_ratio(
+                grew["prefix_tokens_matched"], grew["prefix_tokens_requested"]
+            ),
+            4,
+        ),
+        "evictions": evicted,
+        "eviction_rate_per_min": evictions_per_min,
+        "ssd_hot_hits": hot_hits,
+        "ssd_disk_loads": disk_loads,
+        "ssd_hot_rate": round(_safe_ratio(hot_hits, hot_hits + disk_loads), 4),
+    }
+
+
+def _compute_cumulative(counters: dict[str, int]) -> dict[str, Any]:
+    """Build the cumulative summary from one counters snapshot.
+
+    Missing counters are read as 0. The hit rate, match efficiency and
+    hot rate are rounded to four decimal places.
+    """
+
+    def total(name: str) -> int:
+        return counters.get(name, 0)
+
+    hits, misses = total("prefix_hits"), total("prefix_misses")
+    hot_hits, disk_loads = total("ssd_hot_hits"), total("ssd_disk_loads")
+    match_efficiency = _safe_ratio(
+        total("prefix_tokens_matched"), total("prefix_tokens_requested")
+    )
+
+    return {
+        "prefix_hits": hits,
+        "prefix_misses": misses,
+        "prefix_hit_rate": round(_safe_ratio(hits, hits + misses), 4),
+        "prefix_tokens_saved": total("prefix_tokens_saved"),
+        "prefix_match_efficiency": round(match_efficiency, 4),
+        "evictions": total("evictions"),
+        "ssd_hot_hits": hot_hits,
+        "ssd_disk_loads": disk_loads,
+        "ssd_saves": total("ssd_saves"),
+        "hot_cache_evictions": total("hot_cache_evictions"),
+        "hot_cache_promotions": total("hot_cache_promotions"),
+        "ssd_hot_rate": round(_safe_ratio(hot_hits, hot_hits + disk_loads), 4),
+    }
diff --git a/omlx/cache/paged_ssd_cache.py b/omlx/cache/paged_ssd_cache.py
index 7d5c0d6c7..be52bc8e5 100644
--- a/omlx/cache/paged_ssd_cache.py
+++ b/omlx/cache/paged_ssd_cache.py
@@ -2035,6 +2035,20 @@ def enforce_size_limit(self) -> int:
)
return freed
+    def clear_hot_cache(self) -> int:
+        """Drop every entry from the in-memory (hot) cache.
+
+        The entry map is emptied and the tracked byte total is reset to
+        zero while holding the hot-cache lock.
+
+        Returns:
+            Number of entries that were removed.
+        """
+        # NOTE(review): confirm no entry lives only in the hot cache while
+        # awaiting a write to SSD — not visible from this method.
+        with self._hot_cache_lock:
+            removed = len(self._hot_cache)
+            self._hot_cache.clear()
+            self._hot_cache_total_bytes = 0
+        if removed > 0:
+            logger.info("Cleared %d hot cache entries", removed)
+        return removed
+
def clear(self) -> int:
"""
Clear all SSD cache files.
diff --git a/omlx/cache/prefix_cache.py b/omlx/cache/prefix_cache.py
index 4f2bd1d32..c9efaf349 100644
--- a/omlx/cache/prefix_cache.py
+++ b/omlx/cache/prefix_cache.py
@@ -117,6 +117,8 @@ def __init__(
self._tokens_saved = 0
self._partial_block_skips = 0
self._partial_tokens_skipped = 0
+ self._tokens_matched_total = 0
+ self._tokens_requested_total = 0
self._last_partial_tokens_skipped = 0
self._last_tokens_to_next_block = 0
@@ -285,6 +287,8 @@ def fetch_cache(
num_prefix_tokens = len(tokens) - len(remaining)
self._hits += 1
self._tokens_saved += num_prefix_tokens
+ self._tokens_matched_total += num_prefix_tokens
+ self._tokens_requested_total += len(tokens)
logger.debug(
f"Cache hit for {request_id}: "
@@ -310,6 +314,8 @@ def fetch_cache(
remaining = tokens[prefix_len:]
self._hits += 1
self._tokens_saved += prefix_len
+ self._tokens_matched_total += prefix_len
+ self._tokens_requested_total += len(tokens)
logger.debug(
f"Prefix index hit for {request_id}: " f"{prefix_len} tokens matched"
@@ -319,6 +325,7 @@ def fetch_cache(
# No cache hit
self._misses += 1
+ self._tokens_requested_total += len(tokens)
logger.debug(f"Cache miss for {request_id}")
return None, tokens
@@ -2367,6 +2374,8 @@ def get_stats(self) -> PrefixCacheStats:
block_size=self.block_size,
last_partial_tokens_skipped=self._last_partial_tokens_skipped,
last_tokens_to_next_block=self._last_tokens_to_next_block,
+ tokens_matched_total=self._tokens_matched_total,
+ tokens_requested_total=self._tokens_requested_total,
)
def get_stats_dict(self) -> dict[str, Any]:
@@ -2393,6 +2402,8 @@ def get_stats_dict(self) -> dict[str, Any]:
"block_size": self.block_size,
"last_partial_tokens_skipped": self._last_partial_tokens_skipped,
"last_tokens_to_next_block": self._last_tokens_to_next_block,
+ "tokens_matched_total": self._tokens_matched_total,
+ "tokens_requested_total": self._tokens_requested_total,
"active_requests": len(self._request_tables),
**paged_stats,
}
@@ -2404,6 +2415,8 @@ def reset_stats(self) -> None:
self._tokens_saved = 0
self._partial_block_skips = 0
self._partial_tokens_skipped = 0
+ self._tokens_matched_total = 0
+ self._tokens_requested_total = 0
self._last_partial_tokens_skipped = 0
self._last_tokens_to_next_block = 0
self.paged_cache.reset_stats()
diff --git a/omlx/cache/stats.py b/omlx/cache/stats.py
index 412074fc7..01a78c531 100644
--- a/omlx/cache/stats.py
+++ b/omlx/cache/stats.py
@@ -88,6 +88,8 @@ class PrefixCacheStats(BaseCacheStats):
block_size: int = 0
last_partial_tokens_skipped: int = 0
last_tokens_to_next_block: int = 0
+ tokens_matched_total: int = 0
+ tokens_requested_total: int = 0
_total_queries: int = field(default=0, repr=False)
@property
@@ -111,6 +113,8 @@ def reset(self) -> None:
self.partial_tokens_skipped = 0
self.last_partial_tokens_skipped = 0
self.last_tokens_to_next_block = 0
+ self.tokens_matched_total = 0
+ self.tokens_requested_total = 0
self._total_queries = 0
diff --git a/omlx/scheduler.py b/omlx/scheduler.py
index fab20ed7b..ef17f295f 100644
--- a/omlx/scheduler.py
+++ b/omlx/scheduler.py
@@ -37,6 +37,7 @@
from mlx_lm.models.cache import make_prompt_cache
from mlx_lm.sample_utils import make_logits_processors
+from .cache.observability import CacheRateTracker
from .cache.paged_cache import PagedCacheManager
from .cache.prefix_cache import BlockAwarePrefixCache
from .exceptions import is_cache_corruption_error
@@ -781,6 +782,7 @@ def __init__(
self.paged_cache_manager: PagedCacheManager | None = None
self.block_aware_cache: BlockAwarePrefixCache | None = None
self.paged_ssd_cache_manager: PagedSSDCacheManager | None = None
+ self._cache_rate_tracker = CacheRateTracker()
self.memory_monitor: MemoryMonitor | None = None
# Initialize paged SSD cache if paged_ssd_cache_dir is specified
@@ -5322,6 +5324,7 @@ def _recover_from_cache_error(self) -> None:
# Clear caches
if self.block_aware_cache is not None:
self.block_aware_cache.clear()
+ self._cache_rate_tracker.clear()
# Clear UID mappings
self.request_id_to_uid.clear()
@@ -5651,6 +5654,7 @@ def reset(self) -> None:
# Clear caches
if self.block_aware_cache is not None:
self.block_aware_cache.clear()
+ self._cache_rate_tracker.clear()
# Clear detokenizers
self._request_detokenizers.clear()
@@ -6083,6 +6087,35 @@ def restore_cold_blocks_for_request(self, request_id: str) -> int:
return verified
+    def _collect_cache_counters(self) -> dict[str, int] | None:
+        """Gather the raw counters fed to the cache rate tracker.
+
+        Returns:
+            None when there is no block-aware prefix cache. Otherwise a
+            dict of prefix-cache counters, extended with SSD / hot-cache
+            counters when a paged SSD cache manager is configured.
+        """
+        prefix_cache = self.block_aware_cache
+        if prefix_cache is None:
+            return None
+
+        prefix = prefix_cache.get_stats()
+        counters = {
+            "prefix_hits": prefix.hits,
+            "prefix_misses": prefix.misses,
+            "prefix_tokens_matched": prefix.tokens_matched_total,
+            "prefix_tokens_requested": prefix.tokens_requested_total,
+            "prefix_tokens_saved": prefix.tokens_saved,
+            "evictions": prefix.evictions,
+        }
+
+        ssd_manager = self.paged_ssd_cache_manager
+        if ssd_manager is None:
+            return counters
+
+        # NOTE(review): fields are read as attributes here — confirm that
+        # get_stats() returns an object, not a dict.
+        ssd = ssd_manager.get_stats()
+        served_hot = ssd.hot_cache_hits
+        # Presumably ``loads`` includes hot-cache hits, so the remainder is
+        # what came from disk; clamped at 0 — TODO confirm.
+        from_disk = max(0, ssd.loads - served_hot)
+        counters["ssd_hot_hits"] = served_hot
+        counters["ssd_disk_loads"] = from_disk
+        counters["ssd_saves"] = ssd.saves
+        counters["ssd_errors"] = ssd.errors
+        counters["hot_cache_evictions"] = ssd.hot_cache_evictions
+        counters["hot_cache_promotions"] = ssd.hot_cache_promotions
+        return counters
+
def get_ssd_cache_stats(self) -> dict[str, Any] | None:
"""Get paged SSD + prefix cache observability statistics."""
stats = {}
@@ -6091,15 +6124,18 @@ def get_ssd_cache_stats(self) -> dict[str, Any] | None:
stats["ssd_cache"] = self.paged_ssd_cache_manager.get_stats()
if self.paged_cache_manager is not None:
- # In paged SSD-only mode, all cache data is on paged SSD
stats["indexed_blocks"] = self.paged_cache_manager.cold_block_count
stats["block_size"] = self.config.paged_cache_block_size
if self.block_aware_cache is not None:
- # Expose prefix-cache observability so UI can distinguish
- # "0 indexed blocks" from "sub-block cached (