diff --git a/omlx/admin/routes.py b/omlx/admin/routes.py
index 06ceb808b..16a51a4cb 100644
--- a/omlx/admin/routes.py
+++ b/omlx/admin/routes.py
@@ -3483,6 +3483,13 @@ def _build_runtime_cache_observability(
         }
 
     cache_dir = global_settings.cache.get_ssd_cache_dir(global_settings.base_path)
+    cache_cfg = global_settings.cache
+    try:
+        cfg_disk_max = cache_cfg.get_ssd_cache_max_size_bytes(global_settings.base_path)
+    except (ValueError, OSError, TypeError) as exc:
+        logger.warning("Could not read SSD cache max size from config: %s", exc)
+        cfg_disk_max = 0
+
     payload = {
         "base_path": str(global_settings.base_path),
         "ssd_cache_dir": str(cache_dir),
@@ -3491,6 +3498,10 @@ def _build_runtime_cache_observability(
         "total_num_files": 0,
         "total_size_bytes": 0,
         "effective_block_sizes": [],
+        "disk_max_bytes": cfg_disk_max,
+        "hot_cache_max_bytes": 0,
+        "hot_cache_size_bytes": 0,
+        "hot_cache_entries": 0,
     }
 
     engine_pool = _get_engine_pool()
@@ -3602,11 +3613,16 @@ def _build_runtime_cache_observability(
             "last_tokens_to_next_block": last_tokens_to_next_block,
             "num_files": int(ssd_stats.get("num_files", 0) or 0),
             "total_size_bytes": int(ssd_stats.get("total_size_bytes", 0) or 0),
+            "max_size_bytes": int(ssd_stats.get("max_size_bytes", 0) or 0),
             "hot_cache_max_bytes": int(ssd_stats.get("hot_cache_max_bytes", 0) or 0),
             "hot_cache_size_bytes": int(ssd_stats.get("hot_cache_size_bytes", 0) or 0),
             "hot_cache_entries": int(ssd_stats.get("hot_cache_entries", 0) or 0),
         }
 
+        cache_rates = runtime_stats.get("cache_rates")
+        if cache_rates:
+            model_payload["cache_rates"] = cache_rates
+
         payload["models"].append(model_payload)
         payload["total_num_files"] += model_payload["num_files"]
         payload["total_size_bytes"] += model_payload["total_size_bytes"]
@@ -3616,6 +3632,26 @@ def _build_runtime_cache_observability(
 
     payload["effective_block_sizes"] = sorted(block_sizes)
 
+    # Aggregate hot-cache and disk-max across models.
+    # hot_cache_max sums across models (each model reserves its own slice of
+    # the same process-wide hot cache budget) so the gauge denominator matches
+    # the summed numerator.  disk_max keeps the config fallback via max()
+    # because a single SSD cache directory is shared — the effective cap is
+    # the largest configured limit, not a per-model sum.
+    hot_cache_max = 0
+    disk_max = payload["disk_max_bytes"]
+    hot_cache_size_total = 0
+    hot_cache_entries_total = 0
+    for m in payload["models"]:
+        hot_cache_size_total += m.get("hot_cache_size_bytes", 0)
+        hot_cache_entries_total += m.get("hot_cache_entries", 0)
+        hot_cache_max += m.get("hot_cache_max_bytes", 0)
+        disk_max = max(disk_max, m.get("max_size_bytes", 0))
+    payload["hot_cache_max_bytes"] = hot_cache_max
+    payload["hot_cache_size_bytes"] = hot_cache_size_total
+    payload["hot_cache_entries"] = hot_cache_entries_total
+    payload["disk_max_bytes"] = disk_max
+
     # Fallback: if no loaded models contributed stats, scan the cache
     # directory directly so the dashboard still shows real disk usage.
     if payload["total_num_files"] == 0 and cache_dir.exists():
@@ -3870,6 +3906,30 @@ async def clear_alltime_stats(is_admin: bool = Depends(require_admin)):
     return {"status": "ok"}
 
 
+def _iter_loaded_schedulers():
+    """Generate ``(model_id, scheduler)`` pairs for every loaded model.
+
+    For each model the pool status reports as loaded, follows
+    ``entry.engine._engine.engine.scheduler``; the model is skipped when its
+    pool entry, its engine, or any link in that chain is missing or None.
+    """
+    pool = _get_engine_pool()
+    if pool is None:
+        return
+    for status in pool.get_status().get("models", []):
+        model_id = status.get("id")
+        if not (model_id and status.get("loaded")):
+            continue
+        entry = pool._entries.get(model_id)
+        node = entry.engine if entry is not None else None
+        for attr in ("_engine", "engine", "scheduler"):
+            if node is None:
+                break
+            node = getattr(node, attr, None)
+        if node is not None:
+            yield model_id, node
+
+
 @router.post("/api/ssd-cache/clear")
 async def clear_ssd_cache(is_admin: bool = Depends(require_admin)):
     """Clear all SSD cache files for all loaded models.
@@ -3880,38 +3940,17 @@ async def clear_ssd_cache(is_admin: bool = Depends(require_admin)):
     """
     total_deleted = 0
 
-    # Phase 1: clear via loaded models' cache managers (updates in-memory index)
-    engine_pool = _get_engine_pool()
-    if engine_pool is not None:
-        for model_info in engine_pool.get_status().get("models", []):
-            model_id = model_info.get("id")
-            if not model_id or not model_info.get("loaded"):
-                continue
-
-            entry = engine_pool._entries.get(model_id)
-            if entry is None or entry.engine is None:
-                continue
-
-            async_core = getattr(entry.engine, "_engine", None)
-            core = (
-                getattr(async_core, "engine", None) if async_core is not None else None
-            )
-            scheduler = (
-                getattr(core, "scheduler", None) if core is not None else None
-            )
-
-            if scheduler is not None:
-                ssd_manager = getattr(scheduler, "paged_ssd_cache_manager", None)
-                if ssd_manager is not None:
-                    try:
-                        deleted = ssd_manager.clear()
-                        total_deleted += deleted
-                    except Exception as exc:
-                        logger.warning(
-                            "Failed to clear SSD cache for model '%s': %s",
-                            model_id,
-                            exc,
-                        )
+    for model_id, scheduler in _iter_loaded_schedulers():
+        ssd_manager = getattr(scheduler, "paged_ssd_cache_manager", None)
+        if ssd_manager is not None:
+            try:
+                total_deleted += ssd_manager.clear()
+            except Exception as exc:
+                logger.warning(
+                    "Failed to clear SSD cache for model '%s': %s",
+                    model_id,
+                    exc,
+                )
 
     # Phase 2: remove any remaining files on disk (covers unloaded models)
     global_settings = _get_global_settings()
@@ -3937,6 +3976,31 @@ async def clear_ssd_cache(is_admin: bool = Depends(require_admin)):
     return {"status": "ok", "total_deleted": total_deleted}
 
 
+@router.post("/api/hot-cache/clear")
+async def clear_hot_cache(is_admin: bool = Depends(require_admin)):
+    """Drop hot (in-memory) cache entries for every loaded model.
+
+    Each scheduler's rate-tracker snapshot history is cleared as well.
+    Returns ``{"status": "ok", "total_cleared": <sum of cleared counts>}``;
+    a manager that raises is logged and contributes nothing to the sum.
+    """
+    cleared = 0
+    for model_id, scheduler in _iter_loaded_schedulers():
+        manager = getattr(scheduler, "paged_ssd_cache_manager", None)
+        # hasattr() is False for None, so absent managers are skipped too.
+        if hasattr(manager, "clear_hot_cache"):
+            try:
+                cleared += manager.clear_hot_cache()
+            except Exception as exc:
+                logger.warning(
+                    "Failed to clear hot cache for model '%s': %s", model_id, exc
+                )
+        tracker = getattr(scheduler, "_cache_rate_tracker", None)
+        if tracker is not None:
+            tracker.clear()
+    return {"status": "ok", "total_cleared": cleared}
+
+
 @router.post("/api/cache/probe")
 async def probe_cache(
     request: CacheProbeRequest,
diff --git a/omlx/admin/static/css/dashboard.css b/omlx/admin/static/css/dashboard.css
index a95382df8..6af38022b 100644
--- a/omlx/admin/static/css/dashboard.css
+++ b/omlx/admin/static/css/dashboard.css
@@ -63,6 +63,10 @@
     [data-theme="dark"] .hover\:text-neutral-700:hover { color: var(--text-primary) !important; }
     [data-theme="dark"] .hover\:text-neutral-600:hover { color: var(--text-secondary) !important; }
 
+    /* === Gauge track (visible in both themes) === */
+    .gauge-track { background-color: #e5e5e5; }
+    [data-theme="dark"] .gauge-track { background-color: #3f3f46 !important; }
+
     /* === Active nav tab (bg-white with shadow inside dark nav) === */
     [data-theme="dark"] .shadow-sm { box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.3) !important; }
 
diff --git a/omlx/admin/static/js/dashboard.js b/omlx/admin/static/js/dashboard.js
index ec8ddbf90..d5eca0eba 100644
--- a/omlx/admin/static/js/dashboard.js
+++ b/omlx/admin/static/js/dashboard.js
@@ -160,6 +160,10 @@
                     total_num_files: 0,
                     total_size_bytes: 0,
                     effective_block_sizes: [],
+                    hot_cache_size_bytes: 0,
+                    hot_cache_entries: 0,
+                    hot_cache_max_bytes: 0,
+                    disk_max_bytes: 0,
                 },
             },
             alltimeStats: {
@@ -190,6 +194,7 @@
             showClearStatsConfirm: false,
             showClearAlltimeConfirm: false,
             showClearSsdCacheConfirm: false,
+            showClearHotCacheConfirm: false,
             _statsRefreshTimer: null,
 
             // Log viewer state
@@ -2149,7 +2154,8 @@
 
             async clearSsdCache() {
                 try {
-                    await fetch('/admin/api/ssd-cache/clear', { method: 'POST' });
+                    const resp = await fetch('/admin/api/ssd-cache/clear', { method: 'POST' });
+                    if (!resp.ok) console.error('SSD cache clear failed:', resp.status);
                     this.showClearSsdCacheConfirm = false;
                     await this.loadStats();
                 } catch (err) {
@@ -2158,6 +2164,18 @@
                 }
             },
 
+            async clearHotCache() {  // POST the clear request, close the prompt, refresh stats
+                try {
+                    const { ok, status } = await fetch('/admin/api/hot-cache/clear', { method: 'POST' });
+                    if (!ok) console.error('Hot cache clear failed:', status);  // logged only
+                    this.showClearHotCacheConfirm = false;
+                    await this.loadStats();
+                } catch (error) {  // reached if fetch() or loadStats() rejects
+                    console.error('Failed to clear hot cache:', error);
+                    this.showClearHotCacheConfirm = false;
+                }
+            },
+
             startStatsRefresh() {
                 this.stopStatsRefresh();
                 this._statsRefreshTimer = setInterval(() => {
@@ -2178,6 +2196,36 @@
                 return num.toLocaleString();
             },
 
+            // Cumulative cache counters for the rates panel: the selected model's own
+            // block when one is chosen, otherwise totals summed over every model with
+            // both hit rates recomputed from the summed counts.
+            cacheObsCumulative(stats, selectedModel) {
+                const models = stats.runtime_cache?.models || [];
+                if (models.length === 0) return {};
+
+                if (selectedModel) {
+                    const match = models.find((model) => model.id === selectedModel);
+                    return match?.cache_rates?.cumulative || {};
+                }
+
+                const counterNames = ['prefix_hits', 'prefix_misses', 'evictions', 'ssd_hot_hits', 'ssd_disk_loads', 'ssd_saves', 'hot_cache_evictions', 'hot_cache_promotions'];
+                const totals = {};
+                models
+                    .map((model) => model.cache_rates?.cumulative)
+                    .filter((cum) => cum && Object.keys(cum).length > 0)
+                    .forEach((cum) => {
+                        for (const name of counterNames) {
+                            totals[name] = (totals[name] || 0) + (cum[name] || 0);
+                        }
+                    });
+
+                // Rates come from the summed counts, not an average of per-model rates.
+                const ratio = (part, rest) => (part + rest > 0 ? part / (part + rest) : 0);
+                totals.prefix_hit_rate = ratio(totals.prefix_hits || 0, totals.prefix_misses || 0);
+                totals.ssd_hot_rate = ratio(totals.ssd_hot_hits || 0, totals.ssd_disk_loads || 0);
+                return totals;
+            },
+
             getStatFontClass(value) {
                 if (value >= 1000000000) return 'text-2xl';
                 if (value >= 1000000) return 'text-3xl';
@@ -2239,6 +2287,18 @@
                 return 'bg-red-400';
             },
 
+            get runtimeHotCachePercent() {  // hot-cache fill as a percentage, capped at 100
+                const { hot_cache_size_bytes: used, hot_cache_max_bytes: limit } = this.stats.runtime_cache || {};
+                if (!limit) return 0;  // no data or no configured budget: empty gauge
+                return Math.min(100, (used / limit) * 100);
+            },
+
+            get runtimeSsdCachePercent() {  // SSD cache fill as a percentage, capped at 100
+                const { total_size_bytes: used, disk_max_bytes: limit } = this.stats.runtime_cache || {};
+                if (!limit) return 0;  // no data or no size limit known: empty gauge
+                return Math.min(100, (used / limit) * 100);
+            },
+
             get activeModelsMemoryPercent() {
                 const am = this.stats.active_models;
                 if (!am || !am.model_memory_max) return 0;
diff --git a/omlx/admin/templates/dashboard/_status.html b/omlx/admin/templates/dashboard/_status.html
index 1d6e04f40..865ba6c25 100644
--- a/omlx/admin/templates/dashboard/_status.html
+++ b/omlx/admin/templates/dashboard/_status.html
@@ -282,8 +282,47 @@ <h3 class="text-2xl font-bold tracking-tight text-neutral-900">{{ t('status.head
                                 <span class="text-xs font-bold uppercase tracking-wider text-neutral-600">Runtime Cache Observability</span>
                             </div>
                             <div class="flex items-center gap-2">
-                                <span class="text-xs text-neutral-500"
-                                      x-text="formatNumber((stats.runtime_cache?.total_num_files || 0)) + ' files · ' + formatSizeBytes((stats.runtime_cache?.total_size_bytes || 0))"></span>
+                                <!-- Memory gauge (only when hot cache is enabled AND models are loaded) -->
+                                <div x-show="stats.runtime_cache?.hot_cache_max_bytes > 0 && stats.runtime_cache?.models?.length > 0" class="flex items-center gap-2">
+                                    <span class="text-xs text-neutral-400">Memory</span>
+                                    <div class="w-20 h-2 rounded-full overflow-hidden gauge-track">
+                                        <div class="h-full bg-amber-400 rounded-full transition-all duration-500"
+                                             :style="'width: ' + runtimeHotCachePercent + '%'"></div>
+                                    </div>
+                                    <span class="text-xs font-medium text-neutral-500 whitespace-nowrap"
+                                          x-text="formatSizeBytes(stats.runtime_cache.hot_cache_size_bytes) + ' / ' + formatSizeBytes(stats.runtime_cache.hot_cache_max_bytes) + ' · ' + (stats.runtime_cache.hot_cache_entries || 0) + ' entries'"></span>
+                                    <div class="flex items-center gap-1.5">
+                                        <button x-show="!showClearHotCacheConfirm && (stats.runtime_cache?.hot_cache_entries || 0) > 0"
+                                                @click="showClearHotCacheConfirm = true"
+                                                class="p-1 text-neutral-400 hover:text-red-500 hover:bg-neutral-200 rounded-lg transition-all"
+                                                title="Clear memory cache">
+                                            <i data-lucide="trash-2" class="w-3.5 h-3.5"></i>
+                                        </button>
+                                        <div x-show="showClearHotCacheConfirm" x-cloak class="flex items-center gap-1.5">
+                                            <span class="text-xs text-red-600">Clear memory cache?</span>
+                                            <button @click="clearHotCache()"
+                                                    class="px-2.5 py-1 text-xs font-medium text-white bg-red-500 hover:bg-red-600 rounded-lg transition-all">
+                                                {{ t('status.clear_yes') }}
+                                            </button>
+                                            <button @click="showClearHotCacheConfirm = false"
+                                                    class="px-2.5 py-1 text-xs font-medium text-neutral-600 hover:bg-neutral-100 rounded-lg transition-all">
+                                                {{ t('status.clear_cancel') }}
+                                            </button>
+                                        </div>
+                                    </div>
+                                </div>
+                                <!-- Separator (only when both tiers visible) -->
+                                <span x-show="stats.runtime_cache?.hot_cache_max_bytes > 0 && stats.runtime_cache?.models?.length > 0" class="text-neutral-300 text-xs select-none">|</span>
+                                <!-- SSD gauge (always visible) -->
+                                <div class="flex items-center gap-2">
+                                    <span class="text-xs text-neutral-400">SSD</span>
+                                    <div class="w-20 h-2 rounded-full overflow-hidden gauge-track">
+                                        <div class="h-full bg-amber-400 rounded-full transition-all duration-500"
+                                             :style="'width: ' + runtimeSsdCachePercent + '%'"></div>
+                                    </div>
+                                    <span class="text-xs font-medium text-neutral-500 whitespace-nowrap"
+                                          x-text="formatSizeBytes(stats.runtime_cache?.total_size_bytes || 0) + ' / ' + formatSizeBytes(stats.runtime_cache?.disk_max_bytes || 0) + ' · ' + formatNumber(stats.runtime_cache?.total_num_files || 0) + ' files'"></span>
+                                </div>
                                 <div class="flex items-center gap-1.5">
                                     <button x-show="!showClearSsdCacheConfirm && (stats.runtime_cache?.total_num_files || 0) > 0"
                                             @click="showClearSsdCacheConfirm = true"
@@ -327,6 +366,33 @@ <h3 class="text-2xl font-bold tracking-tight text-neutral-900">{{ t('status.head
                                 </div>
                             </div>
 
+                            <!-- Cache rates (visible when models are loaded, session scope only) -->
+                            <div x-show="statsScope !== 'alltime' && stats.runtime_cache?.models && stats.runtime_cache.models.length > 0" x-cloak
+                                 x-data="{ get c() { return cacheObsCumulative(stats, selectedStatsModel) } }"
+                                 :class="stats.runtime_cache?.hot_cache_max_bytes > 0 ? 'grid-cols-2 sm:grid-cols-4' : 'grid-cols-2'"
+                                 class="grid gap-3 border border-neutral-100 rounded-lg p-3">
+                                <div>
+                                    <p class="text-[11px] text-neutral-400 mb-0.5">Prefix Hit Rate</p>
+                                    <p class="text-lg font-bold text-neutral-900"
+                                       x-text="((c.prefix_hit_rate || 0) * 100).toFixed(1) + '%'"></p>
+                                </div>
+                                <div x-show="stats.runtime_cache?.hot_cache_max_bytes > 0">
+                                    <p class="text-[11px] text-neutral-400 mb-0.5">Memory Hit Rate</p>
+                                    <p class="text-lg font-bold text-neutral-900"
+                                       x-text="((c.ssd_hot_rate || 0) * 100).toFixed(1) + '%'"></p>
+                                </div>
+                                <div>
+                                    <p class="text-[11px] text-neutral-400 mb-0.5">Prefix Evictions</p>
+                                    <p class="text-lg font-bold text-neutral-900"
+                                       x-text="(c.evictions || 0).toLocaleString()"></p>
+                                </div>
+                                <div x-show="stats.runtime_cache?.hot_cache_max_bytes > 0">
+                                    <p class="text-[11px] text-neutral-400 mb-0.5">Memory Evictions</p>
+                                    <p class="text-lg font-bold text-neutral-900"
+                                       x-text="(c.hot_cache_evictions || 0).toLocaleString()"></p>
+                                </div>
+                            </div>
+
                             <div x-show="stats.runtime_cache?.models && stats.runtime_cache.models.length > 0" x-cloak
                                  class="overflow-x-auto border border-neutral-100 rounded-lg">
                                 <table class="min-w-full text-xs">
@@ -336,8 +402,10 @@ <h3 class="text-2xl font-bold tracking-tight text-neutral-900">{{ t('status.head
                                             <th class="px-3 py-2 text-right">Block Size</th>
                                             <th class="px-3 py-2 text-right">Indexed Blocks</th>
                                             <th class="px-3 py-2 text-right">Sub-block Cache</th>
-                                            <th class="px-3 py-2 text-right">Cache Files</th>
-                                            <th class="px-3 py-2 text-right">Cache Size</th>
+                                            <th class="px-3 py-2 text-right">SSD Files</th>
+                                            <th class="px-3 py-2 text-right">SSD Size</th>
+                                            <th x-show="stats.runtime_cache?.hot_cache_max_bytes > 0" class="px-3 py-2 text-right">Memory Entries</th>
+                                            <th x-show="stats.runtime_cache?.hot_cache_max_bytes > 0" class="px-3 py-2 text-right">Memory Size</th>
                                         </tr>
                                     </thead>
                                     <tbody class="divide-y divide-neutral-100 text-neutral-700">
@@ -358,6 +426,8 @@ <h3 class="text-2xl font-bold tracking-tight text-neutral-900">{{ t('status.head
                                                 </td>
                                                 <td class="px-3 py-2 text-right" x-text="formatNumber(m.num_files || 0)"></td>
                                                 <td class="px-3 py-2 text-right" x-text="formatSizeBytes(m.total_size_bytes || 0)"></td>
+                                                <td x-show="stats.runtime_cache?.hot_cache_max_bytes > 0" class="px-3 py-2 text-right" x-text="m.hot_cache_entries || 0"></td>
+                                                <td x-show="stats.runtime_cache?.hot_cache_max_bytes > 0" class="px-3 py-2 text-right" x-text="formatSizeBytes(m.hot_cache_size_bytes || 0)"></td>
                                             </tr>
                                         </template>
                                     </tbody>
diff --git a/omlx/cache/observability.py b/omlx/cache/observability.py
new file mode 100644
index 000000000..72a0e370c
--- /dev/null
+++ b/omlx/cache/observability.py
@@ -0,0 +1,149 @@
+# SPDX-License-Identifier: Apache-2.0
+import threading
+import time
+from collections import deque
+from typing import Any
+
+
+_DEFAULT_WINDOWS = (60, 300, 900)
+_MAX_SNAPSHOTS = 90
+_MIN_INTERVAL = 10.0
+
+
+class CacheRateTracker:
+    """Bounded, lock-guarded history of counter snapshots with rate queries."""
+
+    def __init__(
+        self,
+        max_snapshots: int = _MAX_SNAPSHOTS,
+        min_interval: float = _MIN_INTERVAL,
+    ):
+        self._min_interval = min_interval
+        self._lock = threading.Lock()
+        # (monotonic timestamp, counters copy) pairs; oldest drops off when full.
+        self._snapshots: deque[tuple[float, dict[str, int]]] = deque(
+            maxlen=max_snapshots
+        )
+
+    def maybe_snapshot(self, counters: dict[str, int]) -> bool:
+        """Store a copy of ``counters``; return False if throttled."""
+        with self._lock:
+            stamp = time.monotonic()
+            last = self._snapshots[-1][0] if self._snapshots else None
+            if last is not None and (stamp - last) < self._min_interval:
+                return False
+            self._snapshots.append((stamp, dict(counters)))
+            return True
+
+    def get_rates(
+        self, windows: tuple[int, ...] = _DEFAULT_WINDOWS
+    ) -> dict[str, Any]:
+        """Return ``{"windows": {label: rates}, "cumulative": totals}``.
+
+        Each window is measured from the oldest snapshot lying within ``w``
+        seconds of the newest one; a span under one second yields ``{}``.
+        """
+        with self._lock:
+            if not self._snapshots:
+                return {"windows": {}, "cumulative": {}}
+
+            latest_ts, latest = self._snapshots[-1]
+            per_window = {}
+            for w in windows:
+                label = _window_label(w)
+                base_ts, base = next(
+                    (snap for snap in self._snapshots if (latest_ts - snap[0]) <= w),
+                    self._snapshots[0],
+                )
+                span = latest_ts - base_ts
+                per_window[label] = (
+                    _compute_window(base, latest, span) if span >= 1.0 else {}
+                )
+            return {"windows": per_window, "cumulative": _compute_cumulative(latest)}
+
+    def snapshot_and_get_rates(
+        self,
+        counters: dict[str, int],
+        windows: tuple[int, ...] = _DEFAULT_WINDOWS,
+    ) -> dict[str, Any]:
+        """Attempt a snapshot (it may be throttled), then report the rates."""
+        self.maybe_snapshot(counters)
+        return self.get_rates(windows)
+
+    def clear(self) -> None:
+        """Forget every stored snapshot."""
+        with self._lock:
+            self._snapshots.clear()
+
+
+def _window_label(seconds: int) -> str:
+    """Label a window as ``"<n>m"`` for whole minutes, otherwise ``"<n>s"``."""
+    whole_minutes = seconds >= 60 and seconds % 60 == 0  # 90 -> "90s", not "1m"
+    return f"{seconds // 60}m" if whole_minutes else f"{seconds}s"
+
+
+def _safe_ratio(numerator: int, denominator: int) -> float:
+    """Return ``numerator / denominator``, or ``0.0`` for a zero denominator."""
+    # Conditional expression: the division is skipped when the divisor is 0.
+    return numerator / denominator if denominator != 0 else 0.0
+
+
+def _compute_window(
+    old: dict[str, int], new: dict[str, int], elapsed: float
+) -> dict[str, Any]:
+    """Summarise counter growth from ``old`` to ``new`` over ``elapsed`` seconds.
+
+    Deltas are clamped at zero, so a counter that went backwards reads as
+    "no change" rather than a negative count.
+    """
+    grown = {
+        key: max(0, new.get(key, 0) - old.get(key, 0))
+        for key in (
+            "prefix_hits", "prefix_misses", "evictions", "ssd_hot_hits",
+            "ssd_disk_loads", "prefix_tokens_matched", "prefix_tokens_requested",
+        )
+    }
+    hits, misses = grown["prefix_hits"], grown["prefix_misses"]
+    hot, disk = grown["ssd_hot_hits"], grown["ssd_disk_loads"]
+    matched = grown["prefix_tokens_matched"]
+    requested = grown["prefix_tokens_requested"]
+    minutes = elapsed / 60.0
+    # Guard the per-minute rate against a zero-length window.
+    per_min = round(grown["evictions"] / minutes, 2) if minutes > 0 else 0.0
+    return {
+        "prefix_hit_rate": round(_safe_ratio(hits, hits + misses), 4),
+        "prefix_hits": hits,
+        "prefix_misses": misses,
+        "prefix_match_efficiency": round(_safe_ratio(matched, requested), 4),
+        "evictions": grown["evictions"],
+        "eviction_rate_per_min": per_min,
+        "ssd_hot_hits": hot,
+        "ssd_disk_loads": disk,
+        "ssd_hot_rate": round(_safe_ratio(hot, hot + disk), 4),
+    }
+
+
+def _compute_cumulative(counters: dict[str, int]) -> dict[str, Any]:
+    """Lifetime totals and 4-decimal ratios from one counter snapshot."""
+    def read(key: str) -> int:
+        return counters.get(key, 0)
+
+    hits, misses = read("prefix_hits"), read("prefix_misses")
+    hot, disk = read("ssd_hot_hits"), read("ssd_disk_loads")
+    efficiency = _safe_ratio(
+        read("prefix_tokens_matched"), read("prefix_tokens_requested")
+    )
+    return {
+        "prefix_hits": hits,
+        "prefix_misses": misses,
+        "prefix_hit_rate": round(_safe_ratio(hits, hits + misses), 4),
+        "prefix_tokens_saved": read("prefix_tokens_saved"),
+        "prefix_match_efficiency": round(efficiency, 4),
+        "evictions": read("evictions"),
+        "ssd_hot_hits": hot,
+        "ssd_disk_loads": disk,
+        "ssd_saves": read("ssd_saves"),
+        "hot_cache_evictions": read("hot_cache_evictions"),
+        "hot_cache_promotions": read("hot_cache_promotions"),
+        "ssd_hot_rate": round(_safe_ratio(hot, hot + disk), 4),
+    }
diff --git a/omlx/cache/paged_ssd_cache.py b/omlx/cache/paged_ssd_cache.py
index 7d5c0d6c7..be52bc8e5 100644
--- a/omlx/cache/paged_ssd_cache.py
+++ b/omlx/cache/paged_ssd_cache.py
@@ -2035,6 +2035,20 @@ def enforce_size_limit(self) -> int:
             )
             return freed
 
+    def clear_hot_cache(self) -> int:
+        """Empty the hot cache and zero its byte counter.
+
+        Returns:
+            How many entries were held before the clear.
+        """
+        with self._hot_cache_lock:
+            dropped = len(self._hot_cache)
+            self._hot_cache.clear()
+            self._hot_cache_total_bytes = 0
+        if dropped > 0:
+            logger.info("Cleared %d hot cache entries", dropped)
+        return dropped
+
     def clear(self) -> int:
         """
         Clear all SSD cache files.
diff --git a/omlx/cache/prefix_cache.py b/omlx/cache/prefix_cache.py
index 4f2bd1d32..c9efaf349 100644
--- a/omlx/cache/prefix_cache.py
+++ b/omlx/cache/prefix_cache.py
@@ -117,6 +117,8 @@ def __init__(
         self._tokens_saved = 0
         self._partial_block_skips = 0
         self._partial_tokens_skipped = 0
+        self._tokens_matched_total = 0
+        self._tokens_requested_total = 0
         self._last_partial_tokens_skipped = 0
         self._last_tokens_to_next_block = 0
 
@@ -285,6 +287,8 @@ def fetch_cache(
             num_prefix_tokens = len(tokens) - len(remaining)
             self._hits += 1
             self._tokens_saved += num_prefix_tokens
+            self._tokens_matched_total += num_prefix_tokens
+            self._tokens_requested_total += len(tokens)
 
             logger.debug(
                 f"Cache hit for {request_id}: "
@@ -310,6 +314,8 @@ def fetch_cache(
             remaining = tokens[prefix_len:]
             self._hits += 1
             self._tokens_saved += prefix_len
+            self._tokens_matched_total += prefix_len
+            self._tokens_requested_total += len(tokens)
 
             logger.debug(
                 f"Prefix index hit for {request_id}: " f"{prefix_len} tokens matched"
@@ -319,6 +325,7 @@ def fetch_cache(
 
         # No cache hit
         self._misses += 1
+        self._tokens_requested_total += len(tokens)
         logger.debug(f"Cache miss for {request_id}")
         return None, tokens
 
@@ -2367,6 +2374,8 @@ def get_stats(self) -> PrefixCacheStats:
             block_size=self.block_size,
             last_partial_tokens_skipped=self._last_partial_tokens_skipped,
             last_tokens_to_next_block=self._last_tokens_to_next_block,
+            tokens_matched_total=self._tokens_matched_total,
+            tokens_requested_total=self._tokens_requested_total,
         )
 
     def get_stats_dict(self) -> dict[str, Any]:
@@ -2393,6 +2402,8 @@ def get_stats_dict(self) -> dict[str, Any]:
             "block_size": self.block_size,
             "last_partial_tokens_skipped": self._last_partial_tokens_skipped,
             "last_tokens_to_next_block": self._last_tokens_to_next_block,
+            "tokens_matched_total": self._tokens_matched_total,
+            "tokens_requested_total": self._tokens_requested_total,
             "active_requests": len(self._request_tables),
             **paged_stats,
         }
@@ -2404,6 +2415,8 @@ def reset_stats(self) -> None:
         self._tokens_saved = 0
         self._partial_block_skips = 0
         self._partial_tokens_skipped = 0
+        self._tokens_matched_total = 0
+        self._tokens_requested_total = 0
         self._last_partial_tokens_skipped = 0
         self._last_tokens_to_next_block = 0
         self.paged_cache.reset_stats()
diff --git a/omlx/cache/stats.py b/omlx/cache/stats.py
index 412074fc7..01a78c531 100644
--- a/omlx/cache/stats.py
+++ b/omlx/cache/stats.py
@@ -88,6 +88,8 @@ class PrefixCacheStats(BaseCacheStats):
     block_size: int = 0
     last_partial_tokens_skipped: int = 0
     last_tokens_to_next_block: int = 0
+    tokens_matched_total: int = 0
+    tokens_requested_total: int = 0
     _total_queries: int = field(default=0, repr=False)
 
     @property
@@ -111,6 +113,8 @@ def reset(self) -> None:
         self.partial_tokens_skipped = 0
         self.last_partial_tokens_skipped = 0
         self.last_tokens_to_next_block = 0
+        self.tokens_matched_total = 0
+        self.tokens_requested_total = 0
         self._total_queries = 0
 
 
diff --git a/omlx/scheduler.py b/omlx/scheduler.py
index fab20ed7b..ef17f295f 100644
--- a/omlx/scheduler.py
+++ b/omlx/scheduler.py
@@ -37,6 +37,7 @@
 from mlx_lm.models.cache import make_prompt_cache
 from mlx_lm.sample_utils import make_logits_processors
 
+from .cache.observability import CacheRateTracker
 from .cache.paged_cache import PagedCacheManager
 from .cache.prefix_cache import BlockAwarePrefixCache
 from .exceptions import is_cache_corruption_error
@@ -781,6 +782,7 @@ def __init__(
         self.paged_cache_manager: PagedCacheManager | None = None
         self.block_aware_cache: BlockAwarePrefixCache | None = None
         self.paged_ssd_cache_manager: PagedSSDCacheManager | None = None
+        self._cache_rate_tracker = CacheRateTracker()
         self.memory_monitor: MemoryMonitor | None = None
 
         # Initialize paged SSD cache if paged_ssd_cache_dir is specified
@@ -5322,6 +5324,7 @@ def _recover_from_cache_error(self) -> None:
         # Clear caches
         if self.block_aware_cache is not None:
             self.block_aware_cache.clear()
+        self._cache_rate_tracker.clear()
 
         # Clear UID mappings
         self.request_id_to_uid.clear()
@@ -5651,6 +5654,7 @@ def reset(self) -> None:
         # Clear caches
         if self.block_aware_cache is not None:
             self.block_aware_cache.clear()
+        self._cache_rate_tracker.clear()
 
         # Clear detokenizers
         self._request_detokenizers.clear()
@@ -6083,6 +6087,35 @@ def restore_cold_blocks_for_request(self, request_id: str) -> int:
 
         return verified
 
+    def _collect_cache_counters(self) -> dict[str, int] | None:
+        """Return prefix/SSD cache counters, or None when no prefix cache exists."""
+        if self.block_aware_cache is None:
+            return None
+
+        prefix = self.block_aware_cache.get_stats()
+        counters = {
+            "prefix_hits": prefix.hits,
+            "prefix_misses": prefix.misses,
+            "prefix_tokens_matched": prefix.tokens_matched_total,
+            "prefix_tokens_requested": prefix.tokens_requested_total,
+            "prefix_tokens_saved": prefix.tokens_saved,
+            "evictions": prefix.evictions,
+        }
+        if self.paged_ssd_cache_manager is None:
+            return counters
+
+        ssd = self.paged_ssd_cache_manager.get_stats()
+        hot = ssd.hot_cache_hits
+        # Disk loads are the total loads minus the hot-cache hits, clamped at zero.
+        counters["ssd_hot_hits"] = hot
+        counters["ssd_disk_loads"] = max(0, ssd.loads - hot)
+        counters["ssd_saves"] = ssd.saves
+        counters["ssd_errors"] = ssd.errors
+        counters["hot_cache_evictions"] = ssd.hot_cache_evictions
+        counters["hot_cache_promotions"] = ssd.hot_cache_promotions
+
+        return counters
+
     def get_ssd_cache_stats(self) -> dict[str, Any] | None:
         """Get paged SSD + prefix cache observability statistics."""
         stats = {}
@@ -6091,15 +6124,18 @@ def get_ssd_cache_stats(self) -> dict[str, Any] | None:
             stats["ssd_cache"] = self.paged_ssd_cache_manager.get_stats()
 
         if self.paged_cache_manager is not None:
-            # In paged SSD-only mode, all cache data is on paged SSD
             stats["indexed_blocks"] = self.paged_cache_manager.cold_block_count
             stats["block_size"] = self.config.paged_cache_block_size
 
         if self.block_aware_cache is not None:
-            # Expose prefix-cache observability so UI can distinguish
-            # "0 indexed blocks" from "sub-block cached (<block_size)".
             stats["prefix_cache"] = self.block_aware_cache.get_stats_dict()
 
+        counters = self._collect_cache_counters()
+        if counters:
+            stats["cache_rates"] = self._cache_rate_tracker.snapshot_and_get_rates(
+                counters
+            )
+
         return stats if stats else None
 
     # Alias for backwards compatibility
diff --git a/tests/test_admin_api_key.py b/tests/test_admin_api_key.py
index 8766b39ce..0ace324e9 100644
--- a/tests/test_admin_api_key.py
+++ b/tests/test_admin_api_key.py
@@ -655,6 +655,7 @@ def test_runtime_cache_uses_model_scoped_ssd_stats(self):
         mock_settings = MagicMock()
         mock_settings.base_path = Path("/tmp/omlx-base")
         mock_settings.cache.get_ssd_cache_dir.return_value = cache_dir
+        mock_settings.cache.get_ssd_cache_max_size_bytes.return_value = 0
 
         shared_ssd_stats = {
             "num_files": 999,
@@ -744,6 +745,7 @@ def test_runtime_cache_uses_model_scoped_ssd_stats(self):
                 "last_tokens_to_next_block": 0,
                 "num_files": 3,
                 "total_size_bytes": 4096,
+                "max_size_bytes": 0,
                 "hot_cache_max_bytes": 0,
                 "hot_cache_size_bytes": 0,
                 "hot_cache_entries": 0,
@@ -760,6 +762,7 @@ def test_runtime_cache_uses_model_scoped_ssd_stats(self):
                 "last_tokens_to_next_block": 0,
                 "num_files": 7,
                 "total_size_bytes": 8192,
+                "max_size_bytes": 0,
                 "hot_cache_max_bytes": 0,
                 "hot_cache_size_bytes": 0,
                 "hot_cache_entries": 0,
@@ -775,6 +778,7 @@ def test_runtime_cache_ignores_single_model_stats_failure(self):
         mock_settings = MagicMock()
         mock_settings.base_path = Path("/tmp/omlx-base")
         mock_settings.cache.get_ssd_cache_dir.return_value = cache_dir
+        mock_settings.cache.get_ssd_cache_max_size_bytes.return_value = 0
 
         bad_scheduler = MagicMock()
         bad_scheduler.get_ssd_cache_stats.side_effect = RuntimeError("boom")
@@ -833,6 +837,7 @@ def test_runtime_cache_marks_sub_block_cached_when_indexed_blocks_zero(self):
         mock_settings = MagicMock()
         mock_settings.base_path = Path("/tmp/omlx-base")
         mock_settings.cache.get_ssd_cache_dir.return_value = cache_dir
+        mock_settings.cache.get_ssd_cache_max_size_bytes.return_value = 0
 
         scheduler = MagicMock()
         scheduler.get_ssd_cache_stats.return_value = {
diff --git a/tests/test_cache_observability.py b/tests/test_cache_observability.py
new file mode 100644
index 000000000..292390496
--- /dev/null
+++ b/tests/test_cache_observability.py
@@ -0,0 +1,202 @@
+# tests/test_cache_observability.py
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for cache observability module."""
+
+import threading
+import time
+from unittest.mock import patch
+
+import pytest
+
+from omlx.cache.observability import CacheRateTracker
+
+
+def _make_counters(
+    prefix_hits=0,
+    prefix_misses=0,
+    prefix_tokens_matched=0,
+    prefix_tokens_requested=0,
+    prefix_tokens_saved=0,
+    evictions=0,
+    ssd_hot_hits=0,
+    ssd_disk_loads=0,
+    ssd_saves=0,
+    ssd_errors=0,
+    hot_cache_evictions=0,
+    hot_cache_promotions=0,
+):
+    """Return a complete counters mapping for CacheRateTracker tests.
+
+    Every counter defaults to zero, so each test spells out only the
+    values it cares about. The returned keys match the parameter names
+    one-to-one:
+
+    * ``prefix_*`` and ``evictions`` are the prefix-cache counters;
+    * ``ssd_*`` and ``hot_cache_*`` are the SSD / hot-cache counters.
+    """
+    # Snapshot the arguments before any other local exists: at this point
+    # locals() holds exactly the twelve keyword parameters, and copying it
+    # keeps the key names in lockstep with the signature.
+    counters = dict(locals())
+    return counters
+
+
+class TestCacheRateTrackerSnapshot:
+    """Checks when maybe_snapshot() accepts a sample and what is retained."""
+
+    def test_empty_tracker_returns_empty_rates(self):
+        assert CacheRateTracker().get_rates() == {"windows": {}, "cumulative": {}}
+
+    def test_first_snapshot_always_accepted(self):
+        fresh = CacheRateTracker(min_interval=10.0)
+        assert fresh.maybe_snapshot(_make_counters()) is True
+
+    def test_snapshot_rejected_within_min_interval(self):
+        throttled = CacheRateTracker(min_interval=10.0)
+        throttled.maybe_snapshot(_make_counters())
+        # The second call follows immediately, inside the min_interval of 10.0.
+        assert throttled.maybe_snapshot(_make_counters()) is False
+
+    def test_snapshot_accepted_after_min_interval(self):
+        unthrottled = CacheRateTracker(min_interval=0.0)
+        unthrottled.maybe_snapshot(_make_counters())
+        assert unthrottled.maybe_snapshot(_make_counters()) is True
+
+    def test_deque_overflow_evicts_oldest(self):
+        bounded = CacheRateTracker(max_snapshots=3, min_interval=0.0)
+        for hits in range(5):
+            bounded.maybe_snapshot(_make_counters(prefix_hits=hits))
+        latest = bounded.get_rates()["cumulative"]
+        assert latest["prefix_hits"] == 4
+
+
+class TestCacheRateTrackerRates:
+    """Windowed and cumulative rate maths, driven by a patched monotonic clock."""
+
+    def _tracker_with_two_snapshots(self, old_counters, new_counters, elapsed=60.0):
+        """Record two snapshots `elapsed` fake seconds apart and return the rates."""
+        clock = "omlx.cache.observability.time.monotonic"
+        tracker = CacheRateTracker(min_interval=0.0)
+
+        with patch(clock, return_value=1000.0):
+            tracker.maybe_snapshot(old_counters)
+
+        # Rates are read at the same fake instant as the second snapshot.
+        with patch(clock, return_value=1000.0 + elapsed):
+            tracker.maybe_snapshot(new_counters)
+            return tracker.get_rates(windows=(60, 300, 900))
+
+    def test_steady_state_prefix_hit_rate(self):
+        before = _make_counters(prefix_hits=100, prefix_misses=50)
+        after = _make_counters(prefix_hits=200, prefix_misses=75)
+        rates = self._tracker_with_two_snapshots(before, after, elapsed=60.0)
+        # Window delta: 100 new hits against 25 new misses.
+        assert rates["windows"]["1m"]["prefix_hit_rate"] == 0.8
+
+    def test_zero_activity_window_no_nan(self):
+        idle = _make_counters(prefix_hits=50, prefix_misses=10)
+        windows = self._tracker_with_two_snapshots(idle, idle, elapsed=60.0)["windows"]
+        assert windows["1m"]["prefix_hit_rate"] == 0.0
+        assert windows["1m"]["prefix_match_efficiency"] == 0.0
+        assert windows["1m"]["eviction_rate_per_min"] == 0.0
+
+    def test_eviction_rate_per_min(self):
+        before = _make_counters(evictions=10)
+        after = _make_counters(evictions=40)
+        rates = self._tracker_with_two_snapshots(before, after, elapsed=300.0)
+        # 30 evictions across 300 fake seconds.
+        assert rates["windows"]["5m"]["eviction_rate_per_min"] == 6.0
+
+    def test_prefix_match_efficiency(self):
+        before = _make_counters(prefix_tokens_matched=0, prefix_tokens_requested=0)
+        after = _make_counters(prefix_tokens_matched=600, prefix_tokens_requested=1000)
+        rates = self._tracker_with_two_snapshots(before, after, elapsed=60.0)
+        assert rates["windows"]["1m"]["prefix_match_efficiency"] == 0.6
+
+    def test_ssd_hot_rate(self):
+        before = _make_counters(ssd_hot_hits=0, ssd_disk_loads=0)
+        after = _make_counters(ssd_hot_hits=80, ssd_disk_loads=20)
+        rates = self._tracker_with_two_snapshots(before, after, elapsed=60.0)
+        assert rates["windows"]["1m"]["ssd_hot_rate"] == 0.8
+
+    def test_insufficient_data_returns_empty_window(self):
+        clock = "omlx.cache.observability.time.monotonic"
+        tracker = CacheRateTracker(min_interval=0.0)
+
+        with patch(clock, return_value=1000.0):
+            tracker.maybe_snapshot(_make_counters(prefix_hits=10))
+
+        # Only 0.5 fake seconds of history exist when the 60-second window is read.
+        with patch(clock, return_value=1000.5):
+            tracker.maybe_snapshot(_make_counters(prefix_hits=20))
+            rates = tracker.get_rates(windows=(60,))
+        assert rates["windows"]["1m"] == {}
+
+    def test_cumulative_uses_latest_snapshot(self):
+        before = _make_counters(prefix_hits=10, prefix_misses=5)
+        after = _make_counters(prefix_hits=100, prefix_misses=20)
+        rates = self._tracker_with_two_snapshots(before, after, elapsed=60.0)
+        totals = rates["cumulative"]
+        assert totals["prefix_hits"] == 100
+        assert totals["prefix_misses"] == 20
+        assert abs(totals["prefix_hit_rate"] - 0.8333) < 0.001
+
+
+class TestCacheRateTrackerSnapshotAndGetRates:
+    """snapshot_and_get_rates() should record the sample and report on it."""
+
+    def test_combines_snapshot_and_rates(self):
+        clock = "omlx.cache.observability.time.monotonic"
+        tracker = CacheRateTracker(min_interval=0.0)
+        with patch(clock, return_value=1000.0):
+            tracker.maybe_snapshot(_make_counters(prefix_hits=0))
+
+        # One call, 60 fake seconds later, both stores the sample and computes rates.
+        latest = _make_counters(prefix_hits=80, prefix_misses=20)
+        with patch(clock, return_value=1060.0):
+            rates = tracker.snapshot_and_get_rates(latest)
+        assert rates["windows"]["1m"]["prefix_hit_rate"] == 0.8
+        assert rates["cumulative"]["prefix_hits"] == 80
+
+
+class TestCacheRateTrackerThreadSafety:
+    """Hammer one tracker from a writer and a reader thread at the same time."""
+
+    def test_concurrent_snapshot_and_read(self):
+        tracker = CacheRateTracker(min_interval=0.0)
+        stop = threading.Event()
+        errors = []
+
+        def writer():
+            hits = 0
+            while not stop.is_set():
+                try:
+                    tracker.maybe_snapshot(_make_counters(prefix_hits=hits))
+                    hits += 1
+                except Exception as exc:
+                    errors.append(exc)
+
+        def reader():
+            while not stop.is_set():
+                try:
+                    tracker.get_rates()
+                except Exception as exc:
+                    errors.append(exc)
+
+        workers = [threading.Thread(target=fn) for fn in (writer, reader)]
+        for worker in workers:
+            worker.start()
+        time.sleep(0.2)  # give both loops a moment to race
+        stop.set()
+        for worker in workers:
+            worker.join(timeout=2.0)
+        assert errors == [], f"Thread errors: {errors}"
+
+
+class TestCacheRateTrackerClear:
+
+    def test_clear_resets_state(self):
+        populated = CacheRateTracker(min_interval=0.0)
+        populated.maybe_snapshot(_make_counters(prefix_hits=100))
+        populated.clear()  # afterwards it must report like a never-used tracker
+        assert populated.get_rates() == {"windows": {}, "cumulative": {}}