Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
0de6074
feat(cache): add per-model cache hit-rate observability
ivaniguarans May 11, 2026
b49963b
fix(cache): configure disk-max mock in cache observability tests
ivaniguarans May 13, 2026
649a221
feat(cache): MRU partial cache for repeat prompts
blightbow May 8, 2026
dab3807
fix(cache): close MRU partial review findings (C1-C3, H2-H3)
blightbow May 8, 2026
0e5fc97
docs(cache): pin MRU memory accounting invariant
blightbow May 9, 2026
7908f30
docs(cache): note pre-load admission's KV headroom in MRU docs
blightbow May 9, 2026
cfd4fa2
chore(cache): bit-rot proof MRU docs and collapse test factories
blightbow May 9, 2026
9b7c95a
refactor(cache): factor _can_reconstruct, fix gate docstrings
blightbow May 13, 2026
1bd11d3
feat(cache): multi-slot LRU MRU partial cache
blightbow May 13, 2026
cce198f
fix(admin): clear_ssd_cache also wipes MRU partials
blightbow May 13, 2026
18bc7cc
feat(cache): observability counters for MRU partial cache
blightbow May 13, 2026
cf844c1
refactor(cache): simplify MRU stack — dedupe test helpers, _evict_mis…
blightbow May 13, 2026
f0a9d1a
fix(cache): surface MRU fields in get_stats_dict
blightbow May 14, 2026
5848c49
feat(cache): flag model-incompatible MRU on dashboard + warn log
blightbow May 15, 2026
73033f9
Merge branch 'main' into feat/mru-partial-block-cache
blightbow May 15, 2026
b039eb2
refactor(cache): plain-language MRU incompatibility warning
blightbow May 15, 2026
a42542f
fix(cache): key MRU stash to the prompt boundary
blightbow May 15, 2026
88f809b
fix(cache): eval MRU partial KV at stash time
blightbow May 15, 2026
b04b18a
refactor(admin): drop global MRU-tails gauge
blightbow May 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 136 additions & 31 deletions omlx/admin/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3483,6 +3483,13 @@ def _build_runtime_cache_observability(
}

cache_dir = global_settings.cache.get_ssd_cache_dir(global_settings.base_path)
cache_cfg = global_settings.cache
try:
cfg_disk_max = cache_cfg.get_ssd_cache_max_size_bytes(global_settings.base_path)
except (ValueError, OSError, TypeError) as exc:
logger.warning("Could not read SSD cache max size from config: %s", exc)
cfg_disk_max = 0

payload = {
"base_path": str(global_settings.base_path),
"ssd_cache_dir": str(cache_dir),
Expand All @@ -3491,6 +3498,16 @@ def _build_runtime_cache_observability(
"total_num_files": 0,
"total_size_bytes": 0,
"effective_block_sizes": [],
"disk_max_bytes": cfg_disk_max,
"hot_cache_max_bytes": 0,
"hot_cache_size_bytes": 0,
"hot_cache_entries": 0,
# MRU partial cache feature gate. Per-model occupancy lives on each
# models[] entry; there is no payload-level entries aggregate
# because MRU tail slots are per-model, not a shared budget — only
# this max-entries sum is kept, purely so the dashboard can tell
# whether the feature is configured for any loaded model.
"mru_partial_max_entries": 0,
}

engine_pool = _get_engine_pool()
Expand Down Expand Up @@ -3602,11 +3619,28 @@ def _build_runtime_cache_observability(
"last_tokens_to_next_block": last_tokens_to_next_block,
"num_files": int(ssd_stats.get("num_files", 0) or 0),
"total_size_bytes": int(ssd_stats.get("total_size_bytes", 0) or 0),
"max_size_bytes": int(ssd_stats.get("max_size_bytes", 0) or 0),
"hot_cache_max_bytes": int(ssd_stats.get("hot_cache_max_bytes", 0) or 0),
"hot_cache_size_bytes": int(ssd_stats.get("hot_cache_size_bytes", 0) or 0),
"hot_cache_entries": int(ssd_stats.get("hot_cache_entries", 0) or 0),
"mru_partial_entries": int(
prefix_stats.get("mru_partial_entries", 0) or 0
),
"mru_partial_max_entries": int(
prefix_stats.get("mru_partial_max_entries", 0) or 0
),
# Tri-state: None (unknown / no inference yet), True (eligible),
# False (model uses non-sliceable cache layers — every stash
# refused at the safety gate; dashboard renders 'N/A (see log)').
"mru_partial_supported": prefix_stats.get(
"mru_partial_supported", None
),
}

cache_rates = runtime_stats.get("cache_rates")
if cache_rates:
model_payload["cache_rates"] = cache_rates

payload["models"].append(model_payload)
payload["total_num_files"] += model_payload["num_files"]
payload["total_size_bytes"] += model_payload["total_size_bytes"]
Expand All @@ -3616,6 +3650,33 @@ def _build_runtime_cache_observability(

payload["effective_block_sizes"] = sorted(block_sizes)

# Aggregate hot-cache and disk-max across models.
# hot_cache_max sums across models (each model reserves its own slice of
# the same process-wide hot cache budget) so the gauge denominator matches
# the summed numerator. disk_max keeps the config fallback via max()
# because a single SSD cache directory is shared — the effective cap is
# the largest configured limit, not a per-model sum.
hot_cache_max = 0
disk_max = payload["disk_max_bytes"]
hot_cache_size_total = 0
hot_cache_entries_total = 0
mru_max_entries_total = 0
for m in payload["models"]:
hot_cache_size_total += m.get("hot_cache_size_bytes", 0)
hot_cache_entries_total += m.get("hot_cache_entries", 0)
hot_cache_max += m.get("hot_cache_max_bytes", 0)
disk_max = max(disk_max, m.get("max_size_bytes", 0))
# MRU: only the max-entries sum is kept, and only as a feature-on
# gate for the dashboard. Per-model occupancy is on each models[]
# entry; an aggregate live count would be meaningless because the
# slots are per-model, not a shared budget.
mru_max_entries_total += m.get("mru_partial_max_entries", 0)
payload["hot_cache_max_bytes"] = hot_cache_max
payload["hot_cache_size_bytes"] = hot_cache_size_total
payload["hot_cache_entries"] = hot_cache_entries_total
payload["disk_max_bytes"] = disk_max
payload["mru_partial_max_entries"] = mru_max_entries_total

# Fallback: if no loaded models contributed stats, scan the cache
# directory directly so the dashboard still shows real disk usage.
if payload["total_num_files"] == 0 and cache_dir.exists():
Expand Down Expand Up @@ -3870,6 +3931,30 @@ async def clear_alltime_stats(is_admin: bool = Depends(require_admin)):
return {"status": "ok"}


def _iter_loaded_schedulers():
    """Yield ``(model_id, scheduler)`` pairs for every loaded model.

    Walks the engine hierarchy (pool entry -> async engine -> core engine
    -> scheduler) one attribute at a time and silently skips any model
    for which a link in that chain is missing. Shared by the
    ``clear_ssd_cache`` and ``clear_hot_cache`` admin endpoints.

    Yields nothing when no engine pool is available.
    """
    pool = _get_engine_pool()
    if pool is None:
        return

    models = pool.get_status().get("models", [])
    for info in models:
        model_id = info.get("id")
        # Only models that have an id and are reported as loaded qualify.
        if not (model_id and info.get("loaded")):
            continue

        entry = pool._entries.get(model_id)
        if entry is None or entry.engine is None:
            continue

        # Descend engine._engine.engine.scheduler, bailing out at the
        # first missing (or None) attribute.
        node = entry.engine
        for attr in ("_engine", "engine", "scheduler"):
            node = getattr(node, attr, None)
            if node is None:
                break
        else:
            yield model_id, node


@router.post("/api/ssd-cache/clear")
async def clear_ssd_cache(is_admin: bool = Depends(require_admin)):
"""Clear all SSD cache files for all loaded models.
Expand All @@ -3880,38 +3965,33 @@ async def clear_ssd_cache(is_admin: bool = Depends(require_admin)):
"""
total_deleted = 0

# Phase 1: clear via loaded models' cache managers (updates in-memory index)
engine_pool = _get_engine_pool()
if engine_pool is not None:
for model_info in engine_pool.get_status().get("models", []):
model_id = model_info.get("id")
if not model_id or not model_info.get("loaded"):
continue

entry = engine_pool._entries.get(model_id)
if entry is None or entry.engine is None:
continue

async_core = getattr(entry.engine, "_engine", None)
core = (
getattr(async_core, "engine", None) if async_core is not None else None
)
scheduler = (
getattr(core, "scheduler", None) if core is not None else None
)
for model_id, scheduler in _iter_loaded_schedulers():
ssd_manager = getattr(scheduler, "paged_ssd_cache_manager", None)
if ssd_manager is not None:
try:
total_deleted += ssd_manager.clear()
except Exception as exc:
logger.warning(
"Failed to clear SSD cache for model '%s': %s",
model_id,
exc,
)

if scheduler is not None:
ssd_manager = getattr(scheduler, "paged_ssd_cache_manager", None)
if ssd_manager is not None:
try:
deleted = ssd_manager.clear()
total_deleted += deleted
except Exception as exc:
logger.warning(
"Failed to clear SSD cache for model '%s': %s",
model_id,
exc,
)
# MRU partials chain from paged-block hashes whose KV bytes are
# gone after the ssd_manager.clear() above. Drop them so the
# admin "clear all warm caches" intent is honoured symmetrically.
# Single-tier behaviour (no clear) is the surviving-stash hazard
# the peer review caught for this endpoint.
block_aware_cache = getattr(scheduler, "block_aware_cache", None)
if block_aware_cache is not None:
try:
block_aware_cache.clear_mru_partials()
except Exception as exc:
logger.warning(
"Failed to clear MRU partials for model '%s': %s",
model_id,
exc,
)

# Phase 2: remove any remaining files on disk (covers unloaded models)
global_settings = _get_global_settings()
Expand All @@ -3937,6 +4017,31 @@ async def clear_ssd_cache(is_admin: bool = Depends(require_admin)):
return {"status": "ok", "total_deleted": total_deleted}


@router.post("/api/hot-cache/clear")
async def clear_hot_cache(is_admin: bool = Depends(require_admin)):
    """Clear the in-memory (hot) cache for all loaded models.

    No filesystem fallback needed — hot cache is in-memory only and does
    not survive process restart.

    Every per-model step is best-effort: a failure is logged as a warning
    and the remaining models are still processed, so one misbehaving
    model cannot turn the whole request into an error.

    Returns:
        ``{"status": "ok", "total_cleared": <int>}`` where
        ``total_cleared`` is the sum of the values returned by each
        manager's ``clear_hot_cache()``.
    """
    total_cleared = 0
    for model_id, scheduler in _iter_loaded_schedulers():
        ssd_manager = getattr(scheduler, "paged_ssd_cache_manager", None)
        # Managers without a hot tier simply lack clear_hot_cache().
        if ssd_manager is not None and hasattr(ssd_manager, "clear_hot_cache"):
            try:
                total_cleared += ssd_manager.clear_hot_cache()
            except Exception as exc:
                logger.warning(
                    "Failed to clear hot cache for model '%s': %s",
                    model_id,
                    exc,
                )

        rate_tracker = getattr(scheduler, "_cache_rate_tracker", None)
        if rate_tracker is not None:
            # Guarded like the cache clear above: an exception here must
            # not abort the loop and leave later models uncleared.
            try:
                rate_tracker.clear()
            except Exception as exc:
                logger.warning(
                    "Failed to reset cache rate tracker for model '%s': %s",
                    model_id,
                    exc,
                )
    return {"status": "ok", "total_cleared": total_cleared}


@router.post("/api/cache/probe")
async def probe_cache(
request: CacheProbeRequest,
Expand Down
4 changes: 4 additions & 0 deletions omlx/admin/static/css/dashboard.css
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@
[data-theme="dark"] .hover\:text-neutral-700:hover { color: var(--text-primary) !important; }
[data-theme="dark"] .hover\:text-neutral-600:hover { color: var(--text-secondary) !important; }

/* === Gauge track (visible in both themes) === */
.gauge-track { background-color: #e5e5e5; }
[data-theme="dark"] .gauge-track { background-color: #3f3f46 !important; }

/* === Active nav tab (bg-white with shadow inside dark nav) === */
[data-theme="dark"] .shadow-sm { box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.3) !important; }

Expand Down
85 changes: 84 additions & 1 deletion omlx/admin/static/js/dashboard.js
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,10 @@
total_num_files: 0,
total_size_bytes: 0,
effective_block_sizes: [],
hot_cache_size_bytes: 0,
hot_cache_entries: 0,
hot_cache_max_bytes: 0,
disk_max_bytes: 0,
},
},
alltimeStats: {
Expand Down Expand Up @@ -190,6 +194,7 @@
showClearStatsConfirm: false,
showClearAlltimeConfirm: false,
showClearSsdCacheConfirm: false,
showClearHotCacheConfirm: false,
_statsRefreshTimer: null,

// Log viewer state
Expand Down Expand Up @@ -2149,7 +2154,8 @@

async clearSsdCache() {
try {
await fetch('/admin/api/ssd-cache/clear', { method: 'POST' });
const resp = await fetch('/admin/api/ssd-cache/clear', { method: 'POST' });
if (!resp.ok) console.error('SSD cache clear failed:', resp.status);
this.showClearSsdCacheConfirm = false;
await this.loadStats();
} catch (err) {
Expand All @@ -2158,6 +2164,18 @@
}
},

async clearHotCache() {
try {
const resp = await fetch('/admin/api/hot-cache/clear', { method: 'POST' });
if (!resp.ok) console.error('Hot cache clear failed:', resp.status);
this.showClearHotCacheConfirm = false;
await this.loadStats();
} catch (err) {
console.error('Failed to clear hot cache:', err);
this.showClearHotCacheConfirm = false;
}
},

startStatsRefresh() {
this.stopStatsRefresh();
this._statsRefreshTimer = setInterval(() => {
Expand All @@ -2178,6 +2196,39 @@
return num.toLocaleString();
},

cacheObsCumulative(stats, selectedModel) {
const entries = stats.runtime_cache?.models || [];
if (entries.length === 0) return {};

if (selectedModel) {
const entry = entries.find(m => m.id === selectedModel);
return entry?.cache_rates?.cumulative || {};
}

const sumKeys = ['prefix_hits', 'prefix_misses', 'evictions', 'ssd_hot_hits', 'ssd_disk_loads', 'ssd_saves', 'hot_cache_evictions', 'hot_cache_promotions', 'mru_partial_stashes', 'mru_partial_hits', 'mru_partial_evictions', 'mru_partial_tokens_saved'];
let agg = {};

for (const m of entries) {
const c = m.cache_rates?.cumulative;
if (!c || Object.keys(c).length === 0) continue;
for (const k of sumKeys) {
agg[k] = (agg[k] || 0) + (c[k] || 0);
}
}

const ph = agg.prefix_hits || 0;
const pm = agg.prefix_misses || 0;
const sh = agg.ssd_hot_hits || 0;
const sd = agg.ssd_disk_loads || 0;
const ms = agg.mru_partial_stashes || 0;
const mh = agg.mru_partial_hits || 0;
agg.prefix_hit_rate = (ph + pm) > 0 ? ph / (ph + pm) : 0;
agg.ssd_hot_rate = (sh + sd) > 0 ? sh / (sh + sd) : 0;
agg.mru_partial_hit_rate = ms > 0 ? mh / ms : 0;

return agg;
},

getStatFontClass(value) {
if (value >= 1000000000) return 'text-2xl';
if (value >= 1000000) return 'text-3xl';
Expand Down Expand Up @@ -2239,6 +2290,38 @@
return 'bg-red-400';
},

get runtimeHotCachePercent() {
const rc = this.stats.runtime_cache;
if (!rc || !rc.hot_cache_max_bytes) return 0;
return Math.min(100, (rc.hot_cache_size_bytes / rc.hot_cache_max_bytes) * 100);
},

// mruEnabled is a feature-on gate (drives the rate strip and the
// per-model MRU Tails column). It reads the payload-level
// mru_partial_max_entries purely as "configured for any loaded
// model" — there is deliberately no aggregate MRU-tails gauge,
// since the slots are per-model, not a shared budget.
get mruEnabled() {
return (this.stats.runtime_cache?.mru_partial_max_entries || 0) > 0;
},

get hotCacheEnabled() {
return (this.stats.runtime_cache?.hot_cache_max_bytes || 0) > 0;
},

get cacheRatesGridCols() {
const both = this.hotCacheEnabled && this.mruEnabled;
if (both) return 'grid-cols-2 sm:grid-cols-6';
if (this.hotCacheEnabled || this.mruEnabled) return 'grid-cols-2 sm:grid-cols-4';
return 'grid-cols-2';
},

get runtimeSsdCachePercent() {
const rc = this.stats.runtime_cache;
if (!rc || !rc.disk_max_bytes) return 0;
return Math.min(100, (rc.total_size_bytes / rc.disk_max_bytes) * 100);
},

get activeModelsMemoryPercent() {
const am = this.stats.active_models;
if (!am || !am.model_memory_max) return 0;
Expand Down
Loading