"""Safe aggregate host resource metrics for the WebUI VPS panel (#693).

The browser only needs coarse CPU/RAM/disk usage. Keep this module intentionally
small and dependency-free: no process lists, command strings, user identities,
environment variables, or filesystem topology leave the server.

``api/routes.py`` imports :func:`build_system_health_payload` and serves its
result as JSON from ``GET /api/system/health``.
"""

from __future__ import annotations

import shutil
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


_PROC_STAT = Path("/proc/stat")
_PROC_MEMINFO = Path("/proc/meminfo")
_CPU_SAMPLE_SECONDS = 0.05
# Columns of the aggregate "cpu" row are: user nice system idle iowait irq
# softirq steal guest guest_nice.  The kernel already accounts guest and
# guest_nice time inside user and nice, so only the first eight columns may be
# summed; adding the last two would count guest time twice.
_CPU_TOTAL_FIELDS = 8


def _checked_at() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


def _clamp_percent(value: Any) -> float:
    """Coerce *value* to a percentage in ``[0.0, 100.0]`` with one decimal.

    Values that cannot be converted to ``float`` and NaN both become ``0.0``;
    NaN is rejected explicitly because it would otherwise slip through the
    range checks and serialise as invalid JSON.
    """
    try:
        numeric = float(value)
    except (TypeError, ValueError):
        return 0.0
    if numeric != numeric:  # NaN is the only float unequal to itself
        return 0.0
    return round(min(100.0, max(0.0, numeric)), 1)


def _parse_proc_stat_cpu(line: str) -> tuple[int, int]:
    """Parse the aggregate ``cpu`` row of /proc/stat.

    Returns ``(idle_ticks, total_ticks)`` where idle is ``idle + iowait`` and
    total covers the first ``_CPU_TOTAL_FIELDS`` columns only.

    Raises:
        RuntimeError: if the row is not the aggregate ``cpu`` row, has fewer
            than four counters, or sums to a non-positive total.
        ValueError: if a counter is not an integer.
    """
    fields = line.strip().split()
    if not fields or fields[0] != "cpu":
        raise RuntimeError("proc_stat_unavailable")
    values = [int(part) for part in fields[1:]]
    if len(values) < 4:
        raise RuntimeError("proc_stat_unavailable")
    # iowait (index 4) is idle time spent waiting on I/O; older kernels omit it.
    idle = values[3] + (values[4] if len(values) > 4 else 0)
    total = sum(values[:_CPU_TOTAL_FIELDS])
    if total <= 0:
        raise RuntimeError("proc_stat_unavailable")
    return idle, total


def _read_proc_stat_cpu() -> tuple[int, int]:
    """Return (idle_ticks, total_ticks) from Linux /proc/stat."""
    with _PROC_STAT.open("r", encoding="utf-8") as handle:
        return _parse_proc_stat_cpu(handle.readline())


def _cpu_delta_percent(start: tuple[int, int], end: tuple[int, int]) -> float:
    """Return the busy percentage between two ``(idle, total)`` samples.

    A non-positive total delta (no ticks elapsed, or counters went backwards)
    yields ``0.0``; a negative idle delta is treated as zero idle time.
    """
    idle_delta = end[0] - start[0]
    total_delta = end[1] - start[1]
    if total_delta <= 0:
        return 0.0
    busy_delta = max(0, total_delta - max(0, idle_delta))
    return _clamp_percent((busy_delta / total_delta) * 100.0)


def _cpu_percent() -> float:
    """Sample aggregate CPU usage without psutil.

    A short local sample avoids storing cross-request state and returns a stable
    percentage on the first poll. The call blocks for ``_CPU_SAMPLE_SECONDS``.
    Platforms without a readable /proc/stat raise (``OSError`` from the open,
    or ``RuntimeError`` from parsing); the caller reduces that to a safe code.
    """
    start = _read_proc_stat_cpu()
    time.sleep(_CPU_SAMPLE_SECONDS)
    end = _read_proc_stat_cpu()
    return _cpu_delta_percent(start, end)


def _read_meminfo_kib() -> dict[str, int]:
    """Parse /proc/meminfo into ``{field_name: first_integer_after_colon}``.

    Lines without a colon, without a value, or with a non-integer value are
    skipped. Callers treat the integers as KiB.
    """
    data: dict[str, int] = {}
    with _PROC_MEMINFO.open("r", encoding="utf-8") as handle:
        for line in handle:
            key, _, rest = line.partition(":")
            if not key or not rest:
                continue
            parts = rest.strip().split()
            if not parts:
                continue
            try:
                data[key] = int(parts[0])
            except ValueError:
                continue
    return data


def _memory_usage() -> dict[str, int | float]:
    """Return used/total RAM in bytes plus the used percentage.

    Prefers ``MemAvailable``; when that field is absent it is approximated as
    ``MemFree + Buffers + Cached + SReclaimable - Shmem``.

    Raises:
        RuntimeError: if ``MemTotal`` is missing or not positive.
    """
    meminfo = _read_meminfo_kib()
    total = int(meminfo.get("MemTotal") or 0) * 1024
    if total <= 0:
        raise RuntimeError("meminfo_unavailable")
    available_kib = meminfo.get("MemAvailable")
    if available_kib is None:
        available_kib = (
            meminfo.get("MemFree", 0)
            + meminfo.get("Buffers", 0)
            + meminfo.get("Cached", 0)
            + meminfo.get("SReclaimable", 0)
            - meminfo.get("Shmem", 0)
        )
    available = max(0, int(available_kib) * 1024)
    # Clamp into [0, total] so an over-reported "available" cannot go negative.
    used = max(0, min(total, total - available))
    return {
        "used_bytes": used,
        "total_bytes": total,
        "percent": _clamp_percent((used / total) * 100.0),
    }


def _disk_usage() -> dict[str, int | float]:
    """Return used/total bytes and used percentage for the root filesystem.

    Raises:
        RuntimeError: if the reported total size is not positive.
    """
    usage = shutil.disk_usage("/")
    total = int(usage.total)
    if total <= 0:
        raise RuntimeError("disk_unavailable")
    used = int(usage.used)
    return {
        "used_bytes": used,
        "total_bytes": total,
        "percent": _clamp_percent((used / total) * 100.0),
    }


def _safe_error(metric: str, exc: Exception) -> dict[str, str]:
    """Describe a collector failure using only the exception class name."""
    # Keep this intentionally coarse. Exception messages can contain local paths
    # on unusual platforms; the browser only needs a safe unavailable reason.
    return {"metric": metric, "code": type(exc).__name__}


def build_system_health_payload() -> dict[str, Any]:
    """Collect CPU, memory and disk usage into one JSON-serialisable dict.

    Each collector runs independently; a failing collector leaves its metric as
    ``None`` and appends a coarse entry to ``errors`` instead of raising.

    Returns:
        A dict with keys ``status`` (``"ok"`` when every metric was collected,
        ``"partial"`` when only some were, ``"unavailable"`` when none were),
        ``available``, ``checked_at``, ``cpu``, ``memory``, ``disk`` and
        ``errors``.
    """
    metrics: dict[str, Any] = {"cpu": None, "memory": None, "disk": None}
    errors: list[dict[str, str]] = []

    collectors = {
        "cpu": _cpu_percent,
        "memory": _memory_usage,
        "disk": _disk_usage,
    }
    for name, collect in collectors.items():
        # Deliberately broad: this is the boundary that turns any platform
        # failure into a sanitised error entry rather than a 500 response.
        try:
            value = collect()
            if name == "cpu":
                metrics[name] = {"percent": _clamp_percent(value)}
            else:
                metrics[name] = {
                    "used_bytes": max(0, int(value["used_bytes"])),
                    "total_bytes": max(0, int(value["total_bytes"])),
                    "percent": _clamp_percent(value["percent"]),
                }
        except Exception as exc:
            errors.append(_safe_error(name, exc))

    available = any(metrics[name] is not None for name in metrics)
    if not available:
        status = "unavailable"
    elif errors:
        status = "partial"
    else:
        status = "ok"
    return {
        "status": status,
        "available": available,
        "checked_at": _checked_at(),
        "cpu": metrics["cpu"],
        "memory": metrics["memory"],
        "disk": metrics["disk"],
        "errors": errors,
    }
loadInsights(); if (nextPanel === 'logs') await loadLogs(); _syncLogsAutoRefresh(); + if (typeof _syncSystemHealthMonitorVisibility === 'function') _syncSystemHealthMonitorVisibility(); if (nextPanel === 'settings') { switchSettingsSection(_currentSettingsSection); loadSettingsPanel(); @@ -2118,6 +2119,8 @@ async function loadInsights(animate) { api('/api/wiki/status').catch(err => ({status:'error', error: err.message || String(err)})), ]); _renderInsights(data, box, wikiStatus); + if (typeof _syncSystemHealthMonitorVisibility === 'function') _syncSystemHealthMonitorVisibility(); + if (typeof pollSystemHealth === 'function') void pollSystemHealth(); } catch(e) { box.innerHTML = `
${esc(t('error_prefix') + e.message)}
`; } finally { @@ -2134,6 +2137,34 @@ function _formatLlmWikiTimestamp(value) { catch (_) { return String(value); } } +function _renderSystemHealthPanel() { + return ` +
+
+
+
System health
+
Current VPS resource usage
+
+ Loading… +
+
+
+
CPU
+
+
+
+
RAM
+
+
+
+
Disk
+
+
+
+
Live snapshot only; historical resource charts can build on this surface later.
+
`; +} + function _renderLlmWikiStatus(d) { const status = d || {status:'error'}; const isReady = status.available && status.status === 'ready'; @@ -2279,6 +2310,7 @@ function _renderInsights(d, box, wikiStatus) { `; box.innerHTML = ` + ${_renderSystemHealthPanel()} ${_renderLlmWikiStatus(wikiStatus)}
${overviewCards.map(c => `
${c.icon}
${c.value}
${esc(c.label)}
`).join('')} diff --git a/static/style.css b/static/style.css index c08141d1d6..5ce105cbc7 100644 --- a/static/style.css +++ b/static/style.css @@ -293,6 +293,20 @@ .layout{display:flex;width:100%;flex:1 1 auto;min-height:0;} .app-titlebar{display:flex;align-items:center;justify-content:center;height:38px;flex-shrink:0;background:var(--sidebar);border-bottom:1px solid var(--border);padding:0 12px;padding-top:var(--app-titlebar-safe-top);padding-left:max(12px,env(safe-area-inset-left,0));padding-right:max(12px,env(safe-area-inset-right,0));box-sizing:content-box;font-size:12px;color:var(--muted);user-select:none;-webkit-app-region:drag;position:relative;z-index:20;} .app-titlebar-inner{display:flex;align-items:center;gap:8px;min-width:0;max-width:100%;justify-content:center;} + .system-health-panel.insights-card{display:flex;flex-direction:column;gap:12px;color:var(--muted);} + .system-health-panel.unavailable{display:none;} + .system-health-head{display:flex;align-items:flex-start;justify-content:space-between;gap:12px;} + .system-health-sub{font-size:11px;color:var(--muted);margin-top:-4px;} + .system-health-dot{width:7px;height:7px;border-radius:999px;background:var(--accent);box-shadow:0 0 0 3px var(--accent-bg);opacity:.88;} + .system-health-panel.loading .system-health-dot{background:var(--muted);box-shadow:none;opacity:.55;} + .system-health-status{display:inline-flex;align-items:center;gap:7px;border-radius:999px;padding:3px 8px;font-size:11px;font-weight:700;border:1px solid var(--border);color:var(--muted);background:var(--surface);white-space:nowrap;} + .system-health-metrics{display:grid;grid-template-columns:repeat(3,minmax(120px,1fr));gap:10px;min-width:0;} + .system-health-metric{min-width:0;display:flex;flex-direction:column;gap:5px;padding:10px 11px;border:1px solid var(--border);border-radius:8px;background:var(--surface);} + 
.system-health-label{display:flex;align-items:center;justify-content:space-between;gap:8px;font-size:11px;line-height:1;color:var(--muted);} + .system-health-value{font-variant-numeric:tabular-nums;color:var(--text);font-weight:650;} + .system-health-bar{height:5px;overflow:hidden;border-radius:999px;background:color-mix(in srgb,var(--border) 70%,transparent);border:1px solid color-mix(in srgb,var(--border) 75%,transparent);} + .system-health-bar-fill{height:100%;width:0%;border-radius:inherit;background:linear-gradient(90deg,var(--accent),var(--accent-hover));transition:width .25s ease;} + .system-health-foot{font-size:11px;color:var(--muted);line-height:1.45;opacity:.82;} .app-titlebar-icon{display:inline-flex;align-items:center;color:var(--accent);} .app-titlebar-title{font-size:12px;font-weight:600;color:var(--text);letter-spacing:-.01em;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;max-width:60vw;} .app-titlebar-sub{font-size:10px;color:var(--muted);background:var(--hover-bg);padding:2px 7px;border-radius:4px;font-family:'SF Mono',ui-monospace,monospace;white-space:nowrap;flex-shrink:0;} @@ -1280,6 +1294,11 @@ .app-titlebar{justify-content:space-between;} .app-titlebar-hamburger,.app-titlebar-spacer{display:flex;} .app-titlebar-inner{flex:1 1 auto;} + .system-health-panel.insights-card{gap:10px;padding:12px;} + .system-health-head{align-items:flex-start;} + .system-health-metrics{grid-template-columns:1fr;gap:8px;} + .system-health-label{font-size:10px;gap:4px;} + .system-health-bar{height:4px;} /* Overlay backdrop */ .mobile-overlay{display:none;position:fixed;inset:0;background:rgba(0,0,0,.5); z-index:199;-webkit-tap-highlight-color:transparent;} diff --git a/static/ui.js b/static/ui.js index 4b93eb294e..197abd6ae8 100644 --- a/static/ui.js +++ b/static/ui.js @@ -3065,6 +3065,100 @@ function dismissReconnect() { clearInflight(); } +// ── Live host resource health panel (#693) ── +const SYSTEM_HEALTH_INTERVAL_MS=5000; +let _systemHealthTimer=null; 
// Normalise a metric's `percent` to a number in [0,100] with one decimal, or
// null when the metric (or its percent) is missing or not numeric.
// The explicit null checks matter: Number(null) is 0, so without them an
// unavailable metric (the server sends `null`) would render as "0%" not "—".
function _systemHealthPercent(metric){
  if(metric == null || metric.percent == null) return null;
  const percent=Number(metric.percent);
  if(!Number.isFinite(percent)) return null;
  return Math.max(0,Math.min(100,Math.round(percent*10)/10));
}
// Render a normalised percent as text; null becomes an em dash placeholder.
function _formatSystemHealthPercent(percent){
  if(percent == null) return '—';
  return `${percent.toFixed(percent%1?1:0)}%`;
}
// Render "used / total" with binary units, or '' when either figure is missing
// or unusable. A used value of 0 is valid and is formatted as "0 B".
function _formatSystemHealthBytes(metric){
  if(!metric||metric.used_bytes == null||metric.total_bytes == null) return '';
  const used=Number(metric.used_bytes);
  const total=Number(metric.total_bytes);
  if(!Number.isFinite(used)||!Number.isFinite(total)||used<0||total<=0) return '';
  const units=['B','KB','MB','GB','TB'];
  const fmt=(bytes)=>{
    let value=bytes, idx=0;
    // Stop at the largest known unit so idx never runs past the table.
    while(value>=1024&&idx<units.length-1){value/=1024;idx++;}
    return `${value.toFixed(value>=10||idx===0?0:1)} ${units[idx]}`;
  };
  return `${fmt(used)} / ${fmt(total)}`;
}
// Push one metric into its row: value label, tooltip, aria-valuenow and bar width.
// A missing metric shows the placeholder text and an empty (0%) bar.
function _updateSystemHealthMetric(name,metric){
  const row=document.querySelector(`[data-system-health-metric="${name}"]`);
  if(!row) return;
  const rawPercent=_systemHealthPercent(metric);
  const percent=rawPercent == null ? 0 : rawPercent;
  const label=row.querySelector('[data-system-health-value]');
  const bar=row.querySelector('.system-health-bar');
  const fill=row.querySelector('.system-health-bar-fill');
  const text=_formatSystemHealthPercent(rawPercent);
  if(label){
    label.textContent=text;
    // Only memory and disk carry byte counts; CPU falls back to the percent text.
    const bytes=(name==='memory'||name==='disk')?_formatSystemHealthBytes(metric):'';
    label.title=bytes||text;
  }
  if(bar) bar.setAttribute('aria-valuenow',String(percent));
  if(fill) fill.style.width=`${percent}%`;
}
// Mark the panel unavailable, set the status text and reset all three rows.
function setSystemHealthUnavailable(message){
  const panel=$('systemHealthPanel');
  const status=$('systemHealthStatus');
  if(!panel) return;
  panel.classList.remove('loading');
  panel.classList.add('unavailable');
  if(status) status.textContent=message||'Unavailable';
  ['cpu','memory','disk'].forEach(name=>_updateSystemHealthMetric(name,null));
}
// Render a /api/system/health payload; a missing payload or `available:false`
// switches the panel to the unavailable state.
function renderSystemHealth(payload){
  const panel=$('systemHealthPanel');
  const status=$('systemHealthStatus');
  if(!panel) return;
  if(!payload||payload.available===false){
    setSystemHealthUnavailable('Unavailable');
    return;
  }
  panel.classList.remove('loading','unavailable');
  if(status) status.textContent=payload.status==='partial'?'Partial':'Live';
  _updateSystemHealthMetric('cpu',payload.cpu);
  _updateSystemHealthMetric('memory',payload.memory);
  _updateSystemHealthMetric('disk',payload.disk);
}
// Fetch and render one snapshot. Skipped while the tab is hidden or the
// Insights panel is not showing; a failed request marks the panel unavailable.
async function pollSystemHealth(){
  if(document.visibilityState !== 'visible') return;
  if(!_systemHealthPanelIsVisible()) return;
  try{
    const payload=await api('/api/system/health');
    renderSystemHealth(payload);
  }catch(_){
    setSystemHealthUnavailable('Unavailable');
  }
}
// True only when the tab is visible, Insights is the active panel, and the
// health panel element currently exists in the DOM.
function _systemHealthPanelIsVisible(){
  return document.visibilityState === 'visible' &&
    !!document.querySelector('main.main.showing-insights') &&
    !!$('systemHealthPanel');
}
// Start the polling timer (idempotent) and fetch an immediate first snapshot.
function startSystemHealthMonitor(){
  if(!_systemHealthPanelIsVisible()) return;
  if(_systemHealthTimer) return;
  void pollSystemHealth();
  _systemHealthTimer=setInterval(pollSystemHealth,SYSTEM_HEALTH_INTERVAL_MS);
}
// Stop the polling timer if one is running.
function stopSystemHealthMonitor(){
  if(_systemHealthTimer){clearInterval(_systemHealthTimer);_systemHealthTimer=null;}
}
// Start or stop polling to match the panel's current visibility.
function _syncSystemHealthMonitorVisibility(){
  if(_systemHealthPanelIsVisible()) startSystemHealthMonitor();
  else stopSystemHealthMonitor();
}
document.addEventListener('visibilitychange',_syncSystemHealthMonitorVisibility);
if(document.readyState==='loading') document.addEventListener('DOMContentLoaded',startSystemHealthMonitor);
else startSystemHealthMonitor();
import json
import pathlib
from types import SimpleNamespace
from urllib.parse import urlparse


# Source files are read once at import time and checked as plain text below.
REPO_ROOT = pathlib.Path(__file__).parent.parent
UI_JS = (REPO_ROOT / "static" / "ui.js").read_text(encoding="utf-8")
PANELS_JS = (REPO_ROOT / "static" / "panels.js").read_text(encoding="utf-8")
INDEX_HTML = (REPO_ROOT / "static" / "index.html").read_text(encoding="utf-8")
STYLE_CSS = (REPO_ROOT / "static" / "style.css").read_text(encoding="utf-8")
ROUTES_PY = (REPO_ROOT / "api" / "routes.py").read_text(encoding="utf-8")
AUTH_PY = (REPO_ROOT / "api" / "auth.py").read_text(encoding="utf-8")


class _FakeHandler:
    """Minimal stand-in for the HTTP handler that records what is written.

    It doubles as its own ``wfile`` so response bytes land in ``self.body``.
    """

    def __init__(self):
        self.status = None
        self.sent_headers = []
        self.body = bytearray()
        self.wfile = self
        self.headers = {}

    def send_response(self, status):
        self.status = status

    def send_header(self, name, value):
        self.sent_headers.append((name, value))

    def end_headers(self):
        pass

    def write(self, data):
        self.body.extend(data)

    def json_body(self):
        """Decode everything written so far as a UTF-8 JSON document."""
        return json.loads(bytes(self.body).decode("utf-8"))


def test_system_health_payload_normalizes_safe_aggregate_metrics(monkeypatch):
    """Collector output is rounded/normalised and carries no private fragments."""
    from api import system_health

    monkeypatch.setattr(system_health, "_cpu_percent", lambda: 17.345)
    monkeypatch.setattr(
        system_health,
        "_memory_usage",
        lambda: {"used_bytes": 4_000, "total_bytes": 10_000, "percent": 40.0},
    )
    monkeypatch.setattr(
        system_health,
        "_disk_usage",
        lambda: {"used_bytes": 55_500, "total_bytes": 100_000, "percent": 55.5},
    )

    payload = system_health.build_system_health_payload()

    assert payload["status"] == "ok"
    assert payload["available"] is True
    assert payload["cpu"] == {"percent": 17.3}
    assert payload["memory"] == {"used_bytes": 4000, "total_bytes": 10000, "percent": 40.0}
    assert payload["disk"] == {"used_bytes": 55500, "total_bytes": 100000, "percent": 55.5}
    assert payload["checked_at"]
    rendered = repr(payload)
    for private_fragment in ("/home/", "/Users/", "mount", "path", "argv", "command", "env", "token"):
        assert private_fragment not in rendered


def test_system_health_payload_partial_and_unavailable_are_graceful(monkeypatch):
    """Failing collectors degrade to partial/unavailable without leaking messages."""
    from api import system_health

    def boom():
        raise RuntimeError("private /home/user/path should not leak")

    monkeypatch.setattr(system_health, "_cpu_percent", boom)
    monkeypatch.setattr(system_health, "_memory_usage", boom)
    monkeypatch.setattr(
        system_health,
        "_disk_usage",
        lambda: {"used_bytes": 1, "total_bytes": 4, "percent": 25.0},
    )

    partial = system_health.build_system_health_payload()
    assert partial["status"] == "partial"
    assert partial["available"] is True
    assert partial["disk"]["percent"] == 25.0
    assert partial["cpu"] is None
    assert partial["memory"] is None
    assert {e["metric"] for e in partial["errors"]} == {"cpu", "memory"}
    assert "/home/user" not in repr(partial)

    monkeypatch.setattr(system_health, "_disk_usage", boom)
    unavailable = system_health.build_system_health_payload()
    assert unavailable["status"] == "unavailable"
    assert unavailable["available"] is False
    assert unavailable["cpu"] is None
    assert unavailable["memory"] is None
    assert unavailable["disk"] is None
    assert "/home/user" not in repr(unavailable)


def test_system_health_route_registered_and_auth_gated(monkeypatch):
    """The route exists in routes.py and is rejected without credentials."""
    assert 'parsed.path == "/api/system/health"' in ROUTES_PY
    assert "build_system_health_payload()" in ROUTES_PY
    assert '"/api/system/health"' not in AUTH_PY, "system metrics must not be public"

    monkeypatch.setenv("HERMES_WEBUI_PASSWORD", "test-password")
    from api.auth import check_auth

    handler = _FakeHandler()
    assert check_auth(handler, SimpleNamespace(path="/api/system/health", query="")) is False
    assert handler.status in (302, 401)


def test_system_health_route_returns_only_sanitized_payload(monkeypatch):
    """handle_get serves exactly the builder's keys for the health path."""
    from api import routes

    monkeypatch.setattr(
        routes,
        "build_system_health_payload",
        lambda: {
            "status": "ok",
            "available": True,
            "checked_at": "2026-05-05T00:00:00+00:00",
            "cpu": {"percent": 12.0},
            "memory": {"used_bytes": 1, "total_bytes": 2, "percent": 50.0},
            "disk": {"used_bytes": 3, "total_bytes": 4, "percent": 75.0},
            "errors": [],
        },
    )
    handler = _FakeHandler()
    assert routes.handle_get(handler, urlparse("http://example.test/api/system/health")) is True
    payload = handler.json_body()
    assert payload["cpu"]["percent"] == 12.0
    assert set(payload) == {"status", "available", "checked_at", "cpu", "memory", "disk", "errors"}


def test_system_health_panel_markup_and_styles_live_under_insights_not_top_chrome():
    """The panel is rendered by panels.js inside Insights, not in the page shell."""
    # NOTE(review): the original anchor literal was lost when this file was
    # extracted. '<main' is inferred from the `main.main.showing-insights`
    # selector used in ui.js - confirm against static/index.html.
    top_shell = INDEX_HTML[: INDEX_HTML.index('<main')]
    assert 'id="systemHealthPanel"' not in top_shell
    assert 'aria-label="Host resource health"' not in top_shell
    assert 'function _renderSystemHealthPanel()' in PANELS_JS
    assert 'id="systemHealthPanel"' in PANELS_JS
    assert 'aria-label="Host resource health"' in PANELS_JS
    assert 'System health' in PANELS_JS
    assert 'Current VPS resource usage' in PANELS_JS
    assert PANELS_JS.index('_renderSystemHealthPanel()') < PANELS_JS.index('_renderLlmWikiStatus(wikiStatus)')
    assert 'data-system-health-metric="cpu"' in PANELS_JS
    assert 'data-system-health-metric="memory"' in PANELS_JS
    assert 'data-system-health-metric="disk"' in PANELS_JS
    assert ".system-health-panel.insights-card" in STYLE_CSS
    assert ".system-health-bar-fill" in STYLE_CSS
    assert ".system-health-panel.unavailable" in STYLE_CSS
    assert "@media(max-width:640px)" in STYLE_CSS
    # The selector must occur twice: the base rule plus the compact override.
    # Re-checking mere presence would only repeat the assertion above.
    assert STYLE_CSS.count(".system-health-panel.insights-card") >= 2


def test_system_health_frontend_polls_visible_and_renders_progress_labels():
    """ui.js polls only while visible and panels.js wires the monitor in."""
    assert "const SYSTEM_HEALTH_INTERVAL_MS=5000" in UI_JS
    assert "api('/api/system/health')" in UI_JS
    assert "document.visibilityState !== 'visible'" in UI_JS
    assert "document.querySelector('main.main.showing-insights')" in UI_JS
    assert "document.addEventListener('visibilitychange',_syncSystemHealthMonitorVisibility)" in UI_JS
    assert "typeof _syncSystemHealthMonitorVisibility === 'function'" in PANELS_JS
    assert "function renderSystemHealth(payload)" in UI_JS
    assert "setSystemHealthUnavailable" in UI_JS
    assert "data-system-health-metric" in PANELS_JS
    assert "CPU" in PANELS_JS and "RAM" in PANELS_JS and "Disk" in PANELS_JS
    assert "aria-valuenow" in UI_JS
    assert "style.width=`${percent}%`" in UI_JS


def test_system_health_backend_uses_no_shell_or_private_process_sources():
    """The backend module's source avoids shell, process and identity lookups."""
    src = (REPO_ROOT / "api" / "system_health.py").read_text(encoding="utf-8")
    assert "import subprocess" not in src
    assert "import psutil" not in src
    assert "os.environ" not in src
    assert "ps aux" not in src
    assert "/proc/self/environ" not in src
    for private_field in ("argv", "cmdline", "username", "mountpoint"):
        assert private_field not in src